author     dim <dim@FreeBSD.org>  2011-10-20 21:10:27 +0000
committer  dim <dim@FreeBSD.org>  2011-10-20 21:10:27 +0000
commit     7b3392326c40c3c20697816acae597ba7b3144eb (patch)
tree       2cbcf22585e99f8a87d12d5ff94f392c0d266819 /lib
parent     1176aa52646fe641a4243a246aa7f960c708a274 (diff)
Vendor import of llvm release_30 branch r142614:
http://llvm.org/svn/llvm-project/llvm/branches/release_30@142614
Diffstat (limited to 'lib')
-rw-r--r--  lib/Analysis/AliasAnalysis.cpp | 50
-rw-r--r--  lib/Analysis/AliasAnalysisEvaluator.cpp | 6
-rw-r--r--  lib/Analysis/AliasSetTracker.cpp | 107
-rw-r--r--  lib/Analysis/Analysis.cpp | 3
-rw-r--r--  lib/Analysis/BasicAliasAnalysis.cpp | 136
-rw-r--r--  lib/Analysis/BlockFrequency.cpp | 59
-rw-r--r--  lib/Analysis/BlockFrequencyInfo.cpp | 63
-rw-r--r--  lib/Analysis/BranchProbabilityInfo.cpp | 276
-rw-r--r--  lib/Analysis/CMakeLists.txt | 8
-rw-r--r--  lib/Analysis/ConstantFolding.cpp | 126
-rw-r--r--  lib/Analysis/DIBuilder.cpp | 286
-rw-r--r--  lib/Analysis/DbgInfoPrinter.cpp | 2
-rw-r--r--  lib/Analysis/DebugInfo.cpp | 275
-rw-r--r--  lib/Analysis/IPA/CMakeLists.txt | 6
-rw-r--r--  lib/Analysis/IPA/CallGraphSCCPass.cpp | 7
-rw-r--r--  lib/Analysis/IPA/FindUsedTypes.cpp | 4
-rw-r--r--  lib/Analysis/IVUsers.cpp | 3
-rw-r--r--  lib/Analysis/InlineCost.cpp | 124
-rw-r--r--  lib/Analysis/InstructionSimplify.cpp | 173
-rw-r--r--  lib/Analysis/LazyValueInfo.cpp | 6
-rw-r--r--  lib/Analysis/Lint.cpp | 8
-rw-r--r--  lib/Analysis/Loads.cpp | 12
-rw-r--r--  lib/Analysis/LoopDependenceAnalysis.cpp | 8
-rw-r--r--  lib/Analysis/LoopInfo.cpp | 296
-rw-r--r--  lib/Analysis/LoopPass.cpp | 108
-rw-r--r--  lib/Analysis/MemDepPrinter.cpp | 80
-rw-r--r--  lib/Analysis/MemoryBuiltins.cpp | 16
-rw-r--r--  lib/Analysis/MemoryDependenceAnalysis.cpp | 48
-rw-r--r--  lib/Analysis/PHITransAddr.cpp | 8
-rw-r--r--  lib/Analysis/PathNumbering.cpp | 2
-rw-r--r--  lib/Analysis/RegionPass.cpp | 6
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp | 844
-rw-r--r--  lib/Analysis/ScalarEvolutionExpander.cpp | 393
-rw-r--r--  lib/Analysis/ScalarEvolutionNormalization.cpp | 101
-rw-r--r--  lib/Analysis/ValueTracking.cpp | 39
-rw-r--r--  lib/Archive/CMakeLists.txt | 6
-rw-r--r--  lib/AsmParser/CMakeLists.txt | 5
-rw-r--r--  lib/AsmParser/LLLexer.cpp | 30
-rw-r--r--  lib/AsmParser/LLParser.cpp | 352
-rw-r--r--  lib/AsmParser/LLParser.h | 26
-rw-r--r--  lib/AsmParser/LLToken.h | 15
-rw-r--r--  lib/Bitcode/Reader/BitcodeReader.cpp | 316
-rw-r--r--  lib/Bitcode/Reader/BitcodeReader.h | 8
-rw-r--r--  lib/Bitcode/Reader/CMakeLists.txt | 5
-rw-r--r--  lib/Bitcode/Writer/BitcodeWriter.cpp | 156
-rw-r--r--  lib/Bitcode/Writer/CMakeLists.txt | 5
-rw-r--r--  lib/Bitcode/Writer/ValueEnumerator.cpp | 6
-rw-r--r--  lib/Bitcode/Writer/ValueEnumerator.h | 8
-rw-r--r--  lib/CMakeLists.txt | 4
-rw-r--r--  lib/CodeGen/Analysis.cpp | 16
-rw-r--r--  lib/CodeGen/AsmPrinter/ARMException.cpp | 2
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 155
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 2
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 15
-rw-r--r--  lib/CodeGen/AsmPrinter/CMakeLists.txt | 9
-rw-r--r--  lib/CodeGen/AsmPrinter/DIE.cpp | 2
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCFIException.cpp | 5
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 407
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 20
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 1403
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.h | 133
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfException.cpp | 100
-rw-r--r--  lib/CodeGen/AsmPrinter/Win64Exception.cpp | 1
-rw-r--r--  lib/CodeGen/BranchFolding.cpp | 29
-rw-r--r--  lib/CodeGen/CMakeLists.txt | 17
-rw-r--r--  lib/CodeGen/CalcSpillWeights.cpp | 32
-rw-r--r--  lib/CodeGen/CodeGen.cpp | 1
-rw-r--r--  lib/CodeGen/DwarfEHPrepare.cpp | 67
-rw-r--r--  lib/CodeGen/ELFCodeEmitter.cpp | 2
-rw-r--r--  lib/CodeGen/ELFCodeEmitter.h | 4
-rw-r--r--  lib/CodeGen/ELFWriter.cpp | 12
-rw-r--r--  lib/CodeGen/ExecutionDepsFix.cpp (renamed from lib/Target/X86/SSEDomainFix.cpp) | 99
-rw-r--r--  lib/CodeGen/ExpandPostRAPseudos.cpp (renamed from lib/CodeGen/LowerSubregs.cpp) | 114
-rw-r--r--  lib/CodeGen/IfConversion.cpp | 39
-rw-r--r--  lib/CodeGen/InlineSpiller.cpp | 434
-rw-r--r--  lib/CodeGen/InterferenceCache.cpp | 5
-rw-r--r--  lib/CodeGen/InterferenceCache.h | 3
-rw-r--r--  lib/CodeGen/IntrinsicLowering.cpp | 10
-rw-r--r--  lib/CodeGen/LLVMTargetMachine.cpp | 96
-rw-r--r--  lib/CodeGen/LexicalScopes.cpp | 335
-rw-r--r--  lib/CodeGen/LiveDebugVariables.cpp | 71
-rw-r--r--  lib/CodeGen/LiveInterval.cpp | 35
-rw-r--r--  lib/CodeGen/LiveIntervalAnalysis.cpp | 29
-rw-r--r--  lib/CodeGen/LiveIntervalUnion.cpp | 230
-rw-r--r--  lib/CodeGen/LiveIntervalUnion.h | 78
-rw-r--r--  lib/CodeGen/LiveRangeCalc.cpp | 270
-rw-r--r--  lib/CodeGen/LiveRangeCalc.h | 226
-rw-r--r--  lib/CodeGen/LiveRangeEdit.cpp | 5
-rw-r--r--  lib/CodeGen/LiveRangeEdit.h | 2
-rw-r--r--  lib/CodeGen/LiveStackAnalysis.cpp | 5
-rw-r--r--  lib/CodeGen/LiveVariables.cpp | 2
-rw-r--r--  lib/CodeGen/MachineBasicBlock.cpp | 5
-rw-r--r--  lib/CodeGen/MachineBlockFrequencyInfo.cpp (renamed from lib/CodeGen/MachineBlockFrequency.cpp) | 32
-rw-r--r--  lib/CodeGen/MachineCSE.cpp | 11
-rw-r--r--  lib/CodeGen/MachineFunction.cpp | 2
-rw-r--r--  lib/CodeGen/MachineInstr.cpp | 366
-rw-r--r--  lib/CodeGen/MachineLICM.cpp | 156
-rw-r--r--  lib/CodeGen/MachineModuleInfo.cpp | 37
-rw-r--r--  lib/CodeGen/MachineRegisterInfo.cpp | 44
-rw-r--r--  lib/CodeGen/MachineSink.cpp | 31
-rw-r--r--  lib/CodeGen/MachineVerifier.cpp | 70
-rw-r--r--  lib/CodeGen/PHIElimination.cpp | 3
-rw-r--r--  lib/CodeGen/PeepholeOptimizer.cpp | 17
-rw-r--r--  lib/CodeGen/ProcessImplicitDefs.cpp | 6
-rw-r--r--  lib/CodeGen/PrologEpilogInserter.cpp | 14
-rw-r--r--  lib/CodeGen/RegAllocBasic.cpp | 10
-rw-r--r--  lib/CodeGen/RegAllocGreedy.cpp | 523
-rw-r--r--  lib/CodeGen/RegAllocLinearScan.cpp | 3
-rw-r--r--  lib/CodeGen/RegAllocPBQP.cpp | 2
-rw-r--r--  lib/CodeGen/RegisterClassInfo.cpp | 7
-rw-r--r--  lib/CodeGen/RegisterClassInfo.h | 15
-rw-r--r--  lib/CodeGen/RegisterCoalescer.cpp | 630
-rw-r--r--  lib/CodeGen/RegisterCoalescer.h | 205
-rw-r--r--  lib/CodeGen/RegisterScavenging.cpp | 1
-rw-r--r--  lib/CodeGen/ScheduleDAG.cpp | 3
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp | 3
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.h | 7
-rw-r--r--  lib/CodeGen/ScoreboardHazardRecognizer.cpp | 1
-rw-r--r--  lib/CodeGen/SelectionDAG/CMakeLists.txt | 10
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 121
-rw-r--r--  lib/CodeGen/SelectionDAG/FastISel.cpp | 48
-rw-r--r--  lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 47
-rw-r--r--  lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 186
-rw-r--r--  lib/CodeGen/SelectionDAG/InstrEmitter.h | 6
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 341
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 27
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 184
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 33
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.h | 33
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 44
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 46
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 118
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 36
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 184
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 526
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 39
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 37
-rw-r--r--  lib/CodeGen/SelectionDAG/TargetLowering.cpp | 48
-rw-r--r--  lib/CodeGen/ShadowStackGC.cpp | 52
-rw-r--r--  lib/CodeGen/SjLjEHPrepare.cpp | 514
-rw-r--r--  lib/CodeGen/SpillPlacement.cpp | 17
-rw-r--r--  lib/CodeGen/SpillPlacement.h | 14
-rw-r--r--  lib/CodeGen/SplitKit.cpp | 718
-rw-r--r--  lib/CodeGen/SplitKit.h | 171
-rw-r--r--  lib/CodeGen/Splitter.cpp | 4
-rw-r--r--  lib/CodeGen/StackProtector.cpp | 4
-rw-r--r--  lib/CodeGen/StrongPHIElimination.cpp | 3
-rw-r--r--  lib/CodeGen/TailDuplication.cpp | 1
-rw-r--r--  lib/CodeGen/TargetInstrInfoImpl.cpp | 76
-rw-r--r--  lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 535
-rw-r--r--  lib/CodeGen/TwoAddressInstructionPass.cpp | 13
-rw-r--r--  lib/CodeGen/VirtRegMap.cpp | 32
-rw-r--r--  lib/CompilerDriver/Action.cpp | 134
-rw-r--r--  lib/CompilerDriver/BuiltinOptions.cpp | 61
-rw-r--r--  lib/CompilerDriver/CMakeLists.txt | 12
-rw-r--r--  lib/CompilerDriver/CompilationGraph.cpp | 655
-rw-r--r--  lib/CompilerDriver/Main.cpp | 146
-rw-r--r--  lib/CompilerDriver/Makefile | 20
-rw-r--r--  lib/CompilerDriver/Tool.cpp | 95
-rw-r--r--  lib/DebugInfo/CMakeLists.txt | 16
-rw-r--r--  lib/DebugInfo/DIContext.cpp | 24
-rw-r--r--  lib/DebugInfo/DWARFAbbreviationDeclaration.cpp | 83
-rw-r--r--  lib/DebugInfo/DWARFAbbreviationDeclaration.h | 54
-rw-r--r--  lib/DebugInfo/DWARFAttribute.h | 30
-rw-r--r--  lib/DebugInfo/DWARFCompileUnit.cpp | 238
-rw-r--r--  lib/DebugInfo/DWARFCompileUnit.h | 111
-rw-r--r--  lib/DebugInfo/DWARFContext.cpp | 167
-rw-r--r--  lib/DebugInfo/DWARFContext.h | 118
-rw-r--r--  lib/DebugInfo/DWARFDebugAbbrev.cpp | 106
-rw-r--r--  lib/DebugInfo/DWARFDebugAbbrev.h | 73
-rw-r--r--  lib/DebugInfo/DWARFDebugArangeSet.cpp | 150
-rw-r--r--  lib/DebugInfo/DWARFDebugArangeSet.h | 75
-rw-r--r--  lib/DebugInfo/DWARFDebugAranges.cpp | 223
-rw-r--r--  lib/DebugInfo/DWARFDebugAranges.h | 98
-rw-r--r--  lib/DebugInfo/DWARFDebugInfoEntry.cpp | 444
-rw-r--r--  lib/DebugInfo/DWARFDebugInfoEntry.h | 135
-rw-r--r--  lib/DebugInfo/DWARFDebugLine.cpp | 475
-rw-r--r--  lib/DebugInfo/DWARFDebugLine.h | 190
-rw-r--r--  lib/DebugInfo/DWARFFormValue.cpp | 427
-rw-r--r--  lib/DebugInfo/DWARFFormValue.h | 78
-rw-r--r--  lib/DebugInfo/Makefile | 14
-rw-r--r--  lib/ExecutionEngine/CMakeLists.txt | 7
-rw-r--r--  lib/ExecutionEngine/ExecutionEngine.cpp | 34
-rw-r--r--  lib/ExecutionEngine/Interpreter/CMakeLists.txt | 8
-rw-r--r--  lib/ExecutionEngine/Interpreter/Execution.cpp | 133
-rw-r--r--  lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp | 34
-rw-r--r--  lib/ExecutionEngine/Interpreter/Interpreter.h | 30
-rw-r--r--  lib/ExecutionEngine/JIT/CMakeLists.txt | 8
-rw-r--r--  lib/ExecutionEngine/JIT/Intercept.cpp | 1
-rw-r--r--  lib/ExecutionEngine/JIT/JIT.cpp | 8
-rw-r--r--  lib/ExecutionEngine/JIT/JIT.h | 5
-rw-r--r--  lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp | 10
-rw-r--r--  lib/ExecutionEngine/JIT/JITDwarfEmitter.h | 4
-rw-r--r--  lib/ExecutionEngine/JIT/JITEmitter.cpp | 5
-rw-r--r--  lib/ExecutionEngine/MCJIT/CMakeLists.txt | 8
-rw-r--r--  lib/ExecutionEngine/MCJIT/Intercept.cpp | 1
-rw-r--r--  lib/ExecutionEngine/MCJIT/MCJIT.cpp | 7
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt | 5
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 2
-rw-r--r--  lib/ExecutionEngine/TargetSelect.cpp | 13
-rw-r--r--  lib/Linker/CMakeLists.txt | 8
-rw-r--r--  lib/Linker/LinkModules.cpp | 112
-rw-r--r--  lib/Linker/Linker.cpp | 8
-rw-r--r--  lib/MC/CMakeLists.txt | 19
-rw-r--r--  lib/MC/ELFObjectWriter.cpp | 124
-rw-r--r--  lib/MC/ELFObjectWriter.h | 32
-rw-r--r--  lib/MC/MCAsmBackend.cpp (renamed from lib/MC/TargetAsmBackend.cpp) | 10
-rw-r--r--  lib/MC/MCAsmInfo.cpp | 12
-rw-r--r--  lib/MC/MCAsmInfoCOFF.cpp | 7
-rw-r--r--  lib/MC/MCAsmInfoDarwin.cpp | 8
-rw-r--r--  lib/MC/MCAsmStreamer.cpp | 57
-rw-r--r--  lib/MC/MCAssembler.cpp | 6
-rw-r--r--  lib/MC/MCAtom.cpp | 97
-rw-r--r--  lib/MC/MCCodeGenInfo.cpp | 21
-rw-r--r--  lib/MC/MCContext.cpp | 13
-rw-r--r--  lib/MC/MCDisassembler/CMakeLists.txt | 24
-rw-r--r--  lib/MC/MCDisassembler/Disassembler.cpp | 82
-rw-r--r--  lib/MC/MCDisassembler/Disassembler.h | 25
-rw-r--r--  lib/MC/MCDisassembler/EDDisassembler.cpp | 70
-rw-r--r--  lib/MC/MCDisassembler/EDDisassembler.h | 19
-rw-r--r--  lib/MC/MCDisassembler/EDInst.h | 2
-rw-r--r--  lib/MC/MCDisassembler/EDToken.cpp | 8
-rw-r--r--  lib/MC/MCDisassembler/EDToken.h | 2
-rw-r--r--  lib/MC/MCDwarf.cpp | 95
-rw-r--r--  lib/MC/MCELF.cpp | 1
-rw-r--r--  lib/MC/MCELFStreamer.cpp | 17
-rw-r--r--  lib/MC/MCELFStreamer.h | 7
-rw-r--r--  lib/MC/MCExpr.cpp | 1
-rw-r--r--  lib/MC/MCInstPrinter.cpp | 11
-rw-r--r--  lib/MC/MCInstrAnalysis.cpp | 21
-rw-r--r--  lib/MC/MCLoggingStreamer.cpp | 5
-rw-r--r--  lib/MC/MCMachOStreamer.cpp | 20
-rw-r--r--  lib/MC/MCModule.cpp | 45
-rw-r--r--  lib/MC/MCNullStreamer.cpp | 4
-rw-r--r--  lib/MC/MCObjectFileInfo.cpp | 554
-rw-r--r--  lib/MC/MCObjectStreamer.cpp | 6
-rw-r--r--  lib/MC/MCParser/AsmLexer.cpp | 32
-rw-r--r--  lib/MC/MCParser/AsmParser.cpp | 198
-rw-r--r--  lib/MC/MCParser/CMakeLists.txt | 7
-rw-r--r--  lib/MC/MCParser/COFFAsmParser.cpp | 47
-rw-r--r--  lib/MC/MCParser/ELFAsmParser.cpp | 59
-rw-r--r--  lib/MC/MCParser/MCAsmParser.cpp | 6
-rw-r--r--  lib/MC/MCParser/MCTargetAsmParser.cpp (renamed from lib/MC/MCParser/TargetAsmParser.cpp) | 8
-rw-r--r--  lib/MC/MCPureStreamer.cpp | 9
-rw-r--r--  lib/MC/MCStreamer.cpp | 113
-rw-r--r--  lib/MC/MCTargetAsmLexer.cpp (renamed from lib/Target/TargetAsmLexer.cpp) | 10
-rw-r--r--  lib/MC/MCWin64EH.cpp | 34
-rw-r--r--  lib/MC/MachObjectWriter.cpp | 25
-rw-r--r--  lib/MC/WinCOFFObjectWriter.cpp | 2
-rw-r--r--  lib/MC/WinCOFFStreamer.cpp | 22
-rw-r--r--  lib/Makefile | 2
-rw-r--r--  lib/Object/Archive.cpp | 172
-rw-r--r--  lib/Object/Binary.cpp | 11
-rw-r--r--  lib/Object/CMakeLists.txt | 6
-rw-r--r--  lib/Object/COFFObjectFile.cpp | 239
-rw-r--r--  lib/Object/ELFObjectFile.cpp | 773
-rw-r--r--  lib/Object/MachOObject.cpp | 38
-rw-r--r--  lib/Object/MachOObjectFile.cpp | 316
-rw-r--r--  lib/Object/Object.cpp | 4
-rw-r--r--  lib/Object/ObjectFile.cpp | 1
-rw-r--r--  lib/Support/APFloat.cpp | 31
-rw-r--r--  lib/Support/APInt.cpp | 56
-rw-r--r--  lib/Support/Atomic.cpp | 10
-rw-r--r--  lib/Support/BlockFrequency.cpp | 126
-rw-r--r--  lib/Support/BranchProbability.cpp | 3
-rw-r--r--  lib/Support/CMakeLists.txt | 4
-rw-r--r--  lib/Support/CommandLine.cpp | 66
-rw-r--r--  lib/Support/ConstantRange.cpp | 25
-rw-r--r--  lib/Support/CrashRecoveryContext.cpp | 68
-rw-r--r--  lib/Support/DataExtractor.cpp | 175
-rw-r--r--  lib/Support/Disassembler.cpp | 2
-rw-r--r--  lib/Support/Dwarf.cpp | 49
-rw-r--r--  lib/Support/DynamicLibrary.cpp | 99
-rw-r--r--  lib/Support/FoldingSet.cpp | 6
-rw-r--r--  lib/Support/Host.cpp | 4
-rw-r--r--  lib/Support/IncludeFile.cpp | 2
-rw-r--r--  lib/Support/Memory.cpp | 6
-rw-r--r--  lib/Support/MemoryBuffer.cpp | 8
-rw-r--r--  lib/Support/MemoryObject.cpp | 7
-rw-r--r--  lib/Support/Mutex.cpp | 2
-rw-r--r--  lib/Support/Path.cpp | 2
-rw-r--r--  lib/Support/PathV2.cpp | 36
-rw-r--r--  lib/Support/PrettyStackTrace.cpp | 2
-rw-r--r--  lib/Support/RWMutex.cpp | 2
-rw-r--r--  lib/Support/SearchForAddressOfSpecialSymbol.cpp | 15
-rw-r--r--  lib/Support/StringExtras.cpp | 9
-rw-r--r--  lib/Support/StringRef.cpp | 19
-rw-r--r--  lib/Support/TargetRegistry.cpp | 32
-rw-r--r--  lib/Support/ThreadLocal.cpp | 2
-rw-r--r--  lib/Support/Threading.cpp | 41
-rw-r--r--  lib/Support/Triple.cpp | 44
-rw-r--r--  lib/Support/Twine.cpp | 60
-rw-r--r--  lib/Support/Unix/Host.inc | 1
-rw-r--r--  lib/Support/Unix/Path.inc | 4
-rw-r--r--  lib/Support/Unix/PathV2.inc | 22
-rw-r--r--  lib/Support/Unix/Process.inc | 4
-rw-r--r--  lib/Support/Windows/DynamicLibrary.inc | 76
-rw-r--r--  lib/Support/Windows/Memory.inc | 57
-rw-r--r--  lib/Support/Windows/PathV2.inc | 51
-rw-r--r--  lib/Support/Windows/Process.inc | 5
-rw-r--r--  lib/Support/Windows/RWMutex.inc | 98
-rw-r--r--  lib/Support/Windows/Signals.inc | 224
-rw-r--r--  lib/Support/Windows/Windows.h | 6
-rw-r--r--  lib/Support/raw_ostream.cpp | 7
-rw-r--r--  lib/TableGen/CMakeLists.txt | 16
-rw-r--r--  lib/TableGen/Error.cpp | 39
-rw-r--r--  lib/TableGen/Main.cpp | 124
-rw-r--r--  lib/TableGen/Makefile | 18
-rw-r--r--  lib/TableGen/Record.cpp | 2019
-rw-r--r--  lib/TableGen/TGLexer.cpp | 435
-rw-r--r--  lib/TableGen/TGLexer.h | 125
-rw-r--r--  lib/TableGen/TGParser.cpp | 2194
-rw-r--r--  lib/TableGen/TGParser.h | 137
-rw-r--r--  lib/TableGen/TableGenBackend.cpp | 25
-rw-r--r--  lib/Target/ARM/ARM.h | 21
-rw-r--r--  lib/Target/ARM/ARM.td | 23
-rw-r--r--  lib/Target/ARM/ARMAsmPrinter.cpp | 194
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 522
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h | 161
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.cpp | 287
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.h | 19
-rw-r--r--  lib/Target/ARM/ARMCodeEmitter.cpp | 35
-rw-r--r--  lib/Target/ARM/ARMConstantIslandPass.cpp | 23
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.cpp | 317
-rw-r--r--  lib/Target/ARM/ARMConstantPoolValue.h | 189
-rw-r--r--  lib/Target/ARM/ARMExpandPseudoInsts.cpp | 61
-rw-r--r--  lib/Target/ARM/ARMFastISel.cpp | 93
-rw-r--r--  lib/Target/ARM/ARMFrameLowering.cpp | 31
-rw-r--r--  lib/Target/ARM/ARMGlobalMerge.cpp | 10
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 336
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 1295
-rw-r--r--  lib/Target/ARM/ARMISelLowering.h | 47
-rw-r--r--  lib/Target/ARM/ARMInstrFormats.td | 341
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.cpp | 26
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td | 2748
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td | 153
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td | 489
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 1957
-rw-r--r--  lib/Target/ARM/ARMInstrVFP.td | 113
-rw-r--r--  lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 49
-rw-r--r--  lib/Target/ARM/ARMMCInstLower.cpp | 2
-rw-r--r--  lib/Target/ARM/ARMRegisterInfo.td | 24
-rw-r--r--  lib/Target/ARM/ARMSelectionDAGInfo.cpp | 15
-rw-r--r--  lib/Target/ARM/ARMSelectionDAGInfo.h | 17
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 5
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 18
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 91
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.h | 16
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmLexer.cpp | 37
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 4039
-rw-r--r--  lib/Target/ARM/AsmParser/CMakeLists.txt | 9
-rw-r--r--  lib/Target/ARM/CMakeLists.txt | 48
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 4455
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassembler.h | 99
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp | 3818
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassemblerCore.h | 336
-rw-r--r--  lib/Target/ARM/Disassembler/CMakeLists.txt | 11
-rw-r--r--  lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h | 2459
-rw-r--r--  lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 399
-rw-r--r--  lib/Target/ARM/InstPrinter/ARMInstPrinter.h | 34
-rw-r--r--  lib/Target/ARM/InstPrinter/CMakeLists.txt | 8
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h (renamed from lib/Target/ARM/ARMAddressingModes.h) | 104
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp (renamed from lib/Target/ARM/ARMAsmBackend.cpp) | 77
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h (renamed from lib/Target/ARM/ARMBaseInfo.h) | 163
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h (renamed from lib/Target/ARM/ARMFixupKinds.h) | 0
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp | 11
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp (renamed from lib/Target/ARM/ARMMCCodeEmitter.cpp) | 386
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp (renamed from lib/Target/ARM/ARMMCExpr.cpp) | 0
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCExpr.h (renamed from lib/Target/ARM/ARMMCExpr.h) | 0
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp | 179
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 21
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp (renamed from lib/Target/ARM/ARMMachObjectWriter.cpp) | 5
-rw-r--r--  lib/Target/ARM/MCTargetDesc/CMakeLists.txt | 14
-rw-r--r--  lib/Target/ARM/Makefile | 5
-rw-r--r--  lib/Target/ARM/NEONMoveFix.cpp | 149
-rw-r--r--  lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp | 2
-rw-r--r--  lib/Target/ARM/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/ARM/Thumb1FrameLowering.cpp | 11
-rw-r--r--  lib/Target/ARM/Thumb1RegisterInfo.cpp | 34
-rw-r--r--  lib/Target/ARM/Thumb2ITBlockPass.cpp | 21
-rw-r--r--  lib/Target/ARM/Thumb2InstrInfo.cpp | 13
-rw-r--r--  lib/Target/ARM/Thumb2SizeReduction.cpp | 21
-rw-r--r--  lib/Target/Alpha/AlphaAsmPrinter.cpp | 2
-rw-r--r--  lib/Target/Alpha/AlphaISelDAGToDAG.cpp | 7
-rw-r--r--  lib/Target/Alpha/AlphaISelLowering.cpp | 8
-rw-r--r--  lib/Target/Alpha/AlphaISelLowering.h | 2
-rw-r--r--  lib/Target/Alpha/AlphaInstrInfo.cpp | 1
-rw-r--r--  lib/Target/Alpha/AlphaInstrInfo.td | 2
-rw-r--r--  lib/Target/Alpha/AlphaRegisterInfo.cpp | 18
-rw-r--r--  lib/Target/Alpha/AlphaRegisterInfo.h | 4
-rw-r--r--  lib/Target/Alpha/AlphaSubtarget.cpp | 1
-rw-r--r--  lib/Target/Alpha/AlphaTargetMachine.cpp | 12
-rw-r--r--  lib/Target/Alpha/AlphaTargetMachine.h | 5
-rw-r--r--  lib/Target/Alpha/CMakeLists.txt | 25
-rw-r--r--  lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp | 35
-rw-r--r--  lib/Target/Alpha/MCTargetDesc/CMakeLists.txt | 7
-rw-r--r--  lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp | 2
-rw-r--r--  lib/Target/Alpha/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/Blackfin/BlackfinAsmPrinter.cpp | 2
-rw-r--r--  lib/Target/Blackfin/BlackfinFrameLowering.h | 4
-rw-r--r--  lib/Target/Blackfin/BlackfinISelLowering.cpp | 4
-rw-r--r--  lib/Target/Blackfin/BlackfinISelLowering.h | 2
-rw-r--r--  lib/Target/Blackfin/BlackfinInstrInfo.cpp | 2
-rw-r--r--  lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp | 8
-rw-r--r--  lib/Target/Blackfin/BlackfinIntrinsicInfo.h | 4
-rw-r--r--  lib/Target/Blackfin/BlackfinRegisterInfo.cpp | 18
-rw-r--r--  lib/Target/Blackfin/BlackfinRegisterInfo.h | 4
-rw-r--r--  lib/Target/Blackfin/BlackfinSubtarget.cpp | 2
-rw-r--r--  lib/Target/Blackfin/BlackfinTargetMachine.cpp | 12
-rw-r--r--  lib/Target/Blackfin/BlackfinTargetMachine.h | 5
-rw-r--r--  lib/Target/Blackfin/CMakeLists.txt | 27
-rw-r--r--  lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp | 39
-rw-r--r--  lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt | 7
-rw-r--r--  lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp | 2
-rw-r--r--  lib/Target/Blackfin/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/CBackend/CBackend.cpp | 180
-rw-r--r--  lib/Target/CBackend/CMakeLists.txt | 12
-rw-r--r--  lib/Target/CBackend/CTargetMachine.h | 5
-rw-r--r--  lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp | 4
-rw-r--r--  lib/Target/CBackend/TargetInfo/CMakeLists.txt | 5
-rw-r--r--  lib/Target/CMakeLists.txt | 8
-rw-r--r--  lib/Target/CellSPU/CMakeLists.txt | 27
-rw-r--r--  lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt | 7
-rw-r--r--  lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp | 52
-rw-r--r--  lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h | 4
-rw-r--r--  lib/Target/CellSPU/SPUAsmPrinter.cpp | 2
-rw-r--r--  lib/Target/CellSPU/SPUFrameLowering.cpp | 20
-rw-r--r--  lib/Target/CellSPU/SPUFrameLowering.h | 14
-rw-r--r--  lib/Target/CellSPU/SPUISelLowering.cpp | 16
-rw-r--r--  lib/Target/CellSPU/SPUISelLowering.h | 6
-rw-r--r--  lib/Target/CellSPU/SPUInstrInfo.cpp | 4
-rw-r--r--  lib/Target/CellSPU/SPUInstrInfo.td | 10
-rw-r--r--  lib/Target/CellSPU/SPURegisterInfo.cpp | 19
-rw-r--r--  lib/Target/CellSPU/SPURegisterInfo.h | 6
-rw-r--r--  lib/Target/CellSPU/SPUSubtarget.cpp | 2
-rw-r--r--  lib/Target/CellSPU/SPUTargetMachine.cpp | 25
-rw-r--r--  lib/Target/CellSPU/SPUTargetMachine.h | 5
-rw-r--r--  lib/Target/CellSPU/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp | 2
-rw-r--r--  lib/Target/CppBackend/CMakeLists.txt | 7
-rw-r--r--  lib/Target/CppBackend/CPPBackend.cpp | 141
-rw-r--r--  lib/Target/CppBackend/CPPTargetMachine.h | 5
-rw-r--r--  lib/Target/CppBackend/TargetInfo/CMakeLists.txt | 4
-rw-r--r--  lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp | 4
-rw-r--r--  lib/Target/MBlaze/AsmParser/CMakeLists.txt | 8
-rw-r--r--  lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp | 27
-rw-r--r--  lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp | 24
-rw-r--r--  lib/Target/MBlaze/AsmParser/Makefile | 2
-rw-r--r--  lib/Target/MBlaze/CMakeLists.txt | 36
-rw-r--r--  lib/Target/MBlaze/Disassembler/CMakeLists.txt | 10
-rw-r--r--  lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp | 76
-rw-r--r--  lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h | 9
-rw-r--r--  lib/Target/MBlaze/InstPrinter/CMakeLists.txt | 9
-rw-r--r--  lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp | 4
-rw-r--r--  lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h | 2
-rw-r--r--  lib/Target/MBlaze/MBlaze.h | 12
-rw-r--r--  lib/Target/MBlaze/MBlazeAsmPrinter.cpp | 21
-rw-r--r--  lib/Target/MBlaze/MBlazeFrameLowering.cpp | 2
-rw-r--r--  lib/Target/MBlaze/MBlazeISelLowering.cpp | 13
-rw-r--r--  lib/Target/MBlaze/MBlazeISelLowering.h | 2
-rw-r--r--  lib/Target/MBlaze/MBlazeInstrInfo.cpp | 5
-rw-r--r--  lib/Target/MBlaze/MBlazeInstrInfo.h | 56
-rw-r--r--  lib/Target/MBlaze/MBlazeInstrInfo.td | 21
-rw-r--r--  lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp | 8
-rw-r--r--  lib/Target/MBlaze/MBlazeIntrinsicInfo.h | 4
-rw-r--r--  lib/Target/MBlaze/MBlazeRegisterInfo.cpp | 172
-rw-r--r--  lib/Target/MBlaze/MBlazeRegisterInfo.h | 12
-rw-r--r--  lib/Target/MBlaze/MBlazeSubtarget.cpp | 2
-rw-r--r--  lib/Target/MBlaze/MBlazeTargetMachine.cpp | 50
-rw-r--r--  lib/Target/MBlaze/MBlazeTargetMachine.h | 5
-rw-r--r--  lib/Target/MBlaze/MBlazeTargetObjectFile.cpp | 2
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt | 13
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp (renamed from lib/Target/MBlaze/MBlazeAsmBackend.cpp) | 17
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h | 240
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp (renamed from lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp) | 8
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp | 93
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h | 11
-rw-r--r--  lib/Target/MBlaze/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp | 2
-rw-r--r--  lib/Target/MSP430/CMakeLists.txt | 26
-rw-r--r--  lib/Target/MSP430/InstPrinter/CMakeLists.txt | 8
-rw-r--r--  lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp | 4
-rw-r--r--  lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h | 4
-rw-r--r--  lib/Target/MSP430/MCTargetDesc/CMakeLists.txt | 8
-rw-r--r--  lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp | 51
-rw-r--r--  lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h | 4
-rw-r--r--  lib/Target/MSP430/MSP430AsmPrinter.cpp | 14
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.cpp | 7
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.h | 4
-rw-r--r--  lib/Target/MSP430/MSP430InstrInfo.cpp | 2
-rw-r--r--  lib/Target/MSP430/MSP430RegisterInfo.cpp | 16
-rw-r--r--  lib/Target/MSP430/MSP430RegisterInfo.h | 5
-rw-r--r--  lib/Target/MSP430/MSP430Subtarget.cpp | 2
-rw-r--r--  lib/Target/MSP430/MSP430TargetMachine.cpp | 11
-rw-r--r--  lib/Target/MSP430/MSP430TargetMachine.h | 5
-rw-r--r--  lib/Target/MSP430/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp | 2
-rw-r--r--  lib/Target/Mangler.cpp | 4
-rw-r--r--  lib/Target/Mips/CMakeLists.txt | 28
-rw-r--r--  lib/Target/Mips/InstPrinter/CMakeLists.txt | 8
-rw-r--r--  lib/Target/Mips/InstPrinter/Makefile | 2
-rw-r--r--  lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp | 6
-rw-r--r--  lib/Target/Mips/InstPrinter/MipsInstPrinter.h | 4
-rw-r--r--  lib/Target/Mips/MCTargetDesc/CMakeLists.txt | 13
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 117
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h | 113
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h | 90
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp | 3
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 52
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp | 122
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h | 21
-rw-r--r--  lib/Target/Mips/Mips.h | 3
-rw-r--r--  lib/Target/Mips/Mips.td | 32
-rw-r--r--  lib/Target/Mips/Mips64InstrInfo.td | 214
-rw-r--r--  lib/Target/Mips/MipsAsmPrinter.cpp | 42
-rw-r--r--  lib/Target/Mips/MipsCallingConv.td | 55
-rw-r--r--  lib/Target/Mips/MipsCodeEmitter.cpp | 245
-rw-r--r--  lib/Target/Mips/MipsDelaySlotFiller.cpp | 204
-rw-r--r--  lib/Target/Mips/MipsFrameLowering.cpp | 19
-rw-r--r--  lib/Target/Mips/MipsFrameLowering.h | 5
-rw-r--r--  lib/Target/Mips/MipsISelDAGToDAG.cpp | 185
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 848
-rw-r--r--  lib/Target/Mips/MipsISelLowering.h | 16
-rw-r--r--  lib/Target/Mips/MipsInstrFPU.td | 236
-rw-r--r--  lib/Target/Mips/MipsInstrFormats.td | 44
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.cpp | 210
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.h | 37
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.td | 832
-rw-r--r--  lib/Target/Mips/MipsJITInfo.cpp | 230
-rw-r--r--  lib/Target/Mips/MipsJITInfo.h | 70
-rw-r--r--  lib/Target/Mips/MipsMCInstLower.cpp | 64
-rw-r--r--  lib/Target/Mips/MipsMCInstLower.h | 5
-rw-r--r--  lib/Target/Mips/MipsMCSymbolRefExpr.cpp | 9
-rw-r--r--  lib/Target/Mips/MipsMCSymbolRefExpr.h | 7
-rw-r--r--  lib/Target/Mips/MipsMachineFunction.h | 9
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.cpp | 205
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.h | 4
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.td | 119
-rw-r--r--  lib/Target/Mips/MipsRelocations.h | 41
-rw-r--r--  lib/Target/Mips/MipsSubtarget.cpp | 38
-rw-r--r--  lib/Target/Mips/MipsSubtarget.h | 15
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.cpp | 73
-rw-r--r--  lib/Target/Mips/MipsTargetMachine.h | 48
-rw-r--r--  lib/Target/Mips/MipsTargetObjectFile.cpp | 2
-rw-r--r--  lib/Target/Mips/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp | 16
-rw-r--r--  lib/Target/PTX/CMakeLists.txt | 32
-rw-r--r--  lib/Target/PTX/InstPrinter/CMakeLists.txt | 13
-rw-r--r--  lib/Target/PTX/InstPrinter/Makefile | 16
-rw-r--r--  lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp | 192
-rw-r--r--  lib/Target/PTX/InstPrinter/PTXInstPrinter.h | 47
-rw-r--r--  lib/Target/PTX/MCTargetDesc/CMakeLists.txt | 9
-rw-r--r--  lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h | 63
-rw-r--r--  lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp | 53
-rw-r--r--  lib/Target/PTX/Makefile | 3
-rw-r--r--  lib/Target/PTX/PTX.h | 28
-rw-r--r--  lib/Target/PTX/PTX.td | 28
-rw-r--r--  lib/Target/PTX/PTXAsmPrinter.cpp | 500
-rw-r--r--  lib/Target/PTX/PTXAsmPrinter.h | 57
-rw-r--r--  lib/Target/PTX/PTXCallingConv.td | 29
-rw-r--r--  lib/Target/PTX/PTXFPRoundingModePass.cpp | 179
-rw-r--r--  lib/Target/PTX/PTXISelDAGToDAG.cpp | 182
-rw-r--r--  lib/Target/PTX/PTXISelLowering.cpp | 273
-rw-r--r--  lib/Target/PTX/PTXISelLowering.h | 19
-rw-r--r--  lib/Target/PTX/PTXInstrFormats.td | 31
-rw-r--r--  lib/Target/PTX/PTXInstrInfo.cpp | 82
-rw-r--r--  lib/Target/PTX/PTXInstrInfo.td | 1241
-rw-r--r--  lib/Target/PTX/PTXInstrLoadStore.td | 278
-rw-r--r--  lib/Target/PTX/PTXIntrinsicInstrInfo.td | 78
-rw-r--r--  lib/Target/PTX/PTXMCAsmStreamer.cpp | 15
-rw-r--r--  lib/Target/PTX/PTXMCInstLower.cpp | 32
-rw-r--r--  lib/Target/PTX/PTXMFInfoExtract.cpp | 36
-rw-r--r--  lib/Target/PTX/PTXMachineFunctionInfo.h | 163
-rw-r--r--  lib/Target/PTX/PTXParamManager.cpp | 73
-rw-r--r--  lib/Target/PTX/PTXParamManager.h | 86
-rw-r--r--  lib/Target/PTX/PTXRegAlloc.cpp | 58
-rw-r--r--  lib/Target/PTX/PTXRegisterInfo.cpp | 29
-rw-r--r--  lib/Target/PTX/PTXRegisterInfo.h | 18
-rw-r--r--  lib/Target/PTX/PTXRegisterInfo.td | 540
-rw-r--r--  lib/Target/PTX/PTXSelectionDAGInfo.cpp | 149
-rw-r--r--  lib/Target/PTX/PTXSelectionDAGInfo.h | 53
-rw-r--r--  lib/Target/PTX/PTXSubtarget.cpp | 2
-rw-r--r--  lib/Target/PTX/PTXSubtarget.h | 11
-rw-r--r--  lib/Target/PTX/PTXTargetMachine.cpp | 318
-rw-r--r--  lib/Target/PTX/PTXTargetMachine.h | 60
-rw-r--r--  lib/Target/PTX/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp | 2
-rwxr-xr-x  lib/Target/PTX/generate-register-td.py | 163
-rw-r--r--  lib/Target/PowerPC/CMakeLists.txt | 34
-rw-r--r--  lib/Target/PowerPC/InstPrinter/CMakeLists.txt | 8
-rw-r--r--  lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 11
-rw-r--r--  lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h | 2
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt | 12
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp (renamed from lib/Target/PowerPC/PPCAsmBackend.cpp) | 88
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h | 70
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h (renamed from lib/Target/PowerPC/PPCFixupKinds.h) | 0
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 6
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp (renamed from lib/Target/PowerPC/PPCMCCodeEmitter.cpp) | 9
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp | 115
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 10
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp (renamed from lib/Target/PowerPC/PPCPredicates.cpp) | 0
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h (renamed from lib/Target/PowerPC/PPCPredicates.h) | 2
-rw-r--r--  lib/Target/PowerPC/PPC.h | 11
-rw-r--r--  lib/Target/PowerPC/PPC.td | 4
-rw-r--r--  lib/Target/PowerPC/PPCAsmPrinter.cpp | 14
-rw-r--r--  lib/Target/PowerPC/PPCBranchSelector.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCCodeEmitter.cpp | 4
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.cpp | 17
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.h | 1
-rw-r--r--  lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 65
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 17
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.cpp | 36
-rw-r--r--  lib/Target/PowerPC/PPCInstrInfo.td | 6
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 79
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.h | 8
-rw-r--r--  lib/Target/PowerPC/PPCSubtarget.cpp | 2
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.cpp | 71
-rw-r--r--  lib/Target/PowerPC/PPCTargetMachine.h | 16
-rw-r--r--  lib/Target/PowerPC/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp | 2
-rw-r--r--  lib/Target/README.txt | 10
-rw-r--r--  lib/Target/Sparc/CMakeLists.txt | 25
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/CMakeLists.txt | 8
-rw-r--r--  lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp | 36
-rw-r--r--  lib/Target/Sparc/SparcAsmPrinter.cpp | 2
-rw-r--r--  lib/Target/Sparc/SparcISelLowering.cpp | 8
-rw-r--r--  lib/Target/Sparc/SparcInstrInfo.cpp | 2
-rw-r--r--  lib/Target/Sparc/SparcRegisterInfo.cpp | 15
-rw-r--r--  lib/Target/Sparc/SparcRegisterInfo.h | 4
-rw-r--r--  lib/Target/Sparc/SparcSubtarget.cpp | 2
-rw-r--r--  lib/Target/Sparc/SparcTargetMachine.cpp | 27
-rw-r--r--  lib/Target/Sparc/SparcTargetMachine.h | 16
-rw-r--r--  lib/Target/Sparc/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp | 2
-rw-r--r--  lib/Target/SystemZ/CMakeLists.txt | 25
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt | 7
-rw-r--r--  lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp | 39
-rw-r--r--  lib/Target/SystemZ/SystemZAsmPrinter.cpp | 4
-rw-r--r--  lib/Target/SystemZ/SystemZISelLowering.cpp | 1
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.cpp | 2
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.td | 2
-rw-r--r--  lib/Target/SystemZ/SystemZRegisterInfo.cpp | 17
-rw-r--r--  lib/Target/SystemZ/SystemZRegisterInfo.h | 4
-rw-r--r--  lib/Target/SystemZ/SystemZSubtarget.cpp | 2
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.cpp | 13
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.h | 5
-rw-r--r--  lib/Target/SystemZ/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp | 2
-rw-r--r--  lib/Target/Target.cpp | 10
-rw-r--r--  lib/Target/TargetAsmInfo.cpp | 23
-rw-r--r--  lib/Target/TargetData.cpp | 53
-rw-r--r--  lib/Target/TargetFrameLowering.cpp | 8
-rw-r--r--  lib/Target/TargetLoweringObjectFile.cpp | 59
-rw-r--r--  lib/Target/TargetMachine.cpp | 67
-rw-r--r--  lib/Target/TargetRegisterInfo.cpp | 49
-rw-r--r--  lib/Target/X86/AsmParser/CMakeLists.txt | 11
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmLexer.cpp | 20
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 84
-rw-r--r--  lib/Target/X86/CMakeLists.txt | 41
-rw-r--r--  lib/Target/X86/Disassembler/CMakeLists.txt | 10
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp | 77
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.h | 26
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.c | 165
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h | 8
-rw-r--r--  lib/Target/X86/InstPrinter/CMakeLists.txt | 9
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 10
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.h | 2
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp | 31
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 7
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.h | 2
-rw-r--r--  lib/Target/X86/MCTargetDesc/CMakeLists.txt | 13
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp (renamed from lib/Target/X86/X86AsmBackend.cpp) | 29
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 548
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86FixupKinds.h (renamed from lib/Target/X86/X86FixupKinds.h) | 0
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp (renamed from lib/Target/X86/X86MCCodeEmitter.cpp) | 198
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 338
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 45
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp (renamed from lib/Target/X86/X86MachObjectWriter.cpp) | 6
-rw-r--r--  lib/Target/X86/README-SSE.txt | 2
-rw-r--r--  lib/Target/X86/README.txt | 38
-rw-r--r--  lib/Target/X86/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 2
-rw-r--r--  lib/Target/X86/Utils/CMakeLists.txt | 8
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.cpp | 75
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.h | 20
-rw-r--r--  lib/Target/X86/X86.h | 30
-rw-r--r--  lib/Target/X86/X86.td | 78
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 23
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp | 36
-rw-r--r--  lib/Target/X86/X86ELFWriterInfo.cpp | 2
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 93
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 34
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 625
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 7
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 16
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 2940
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 62
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td | 96
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 102
-rw-r--r--  lib/Target/X86/X86InstrExtension.td | 4
-rw-r--r--  lib/Target/X86/X86InstrFormats.td | 8
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 84
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 1838
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 553
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 209
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 4104
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 114
-rw-r--r--  lib/Target/X86/X86InstrVMX.td | 10
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 23
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.h | 20
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 165
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 25
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td | 11
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 2
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 78
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 36
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 173
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 22
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 76
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 22
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 105
-rw-r--r--  lib/Target/XCore/CMakeLists.txt | 25
-rw-r--r--  lib/Target/XCore/MCTargetDesc/CMakeLists.txt | 7
-rw-r--r--  lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp | 48
-rw-r--r--  lib/Target/XCore/TargetInfo/CMakeLists.txt | 8
-rw-r--r--  lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp | 2
-rw-r--r--  lib/Target/XCore/XCoreAsmPrinter.cpp | 60
-rw-r--r--  lib/Target/XCore/XCoreFrameLowering.cpp | 11
-rw-r--r--  lib/Target/XCore/XCoreFrameLowering.h | 2
-rw-r--r--  lib/Target/XCore/XCoreISelDAGToDAG.cpp | 11
-rw-r--r--  lib/Target/XCore/XCoreISelLowering.cpp | 37
-rw-r--r--  lib/Target/XCore/XCoreISelLowering.h | 5
-rw-r--r--  lib/Target/XCore/XCoreInstrInfo.cpp | 12
-rw-r--r--  lib/Target/XCore/XCoreInstrInfo.h | 5
-rw-r--r--  lib/Target/XCore/XCoreInstrInfo.td | 81
-rw-r--r--  lib/Target/XCore/XCoreRegisterInfo.cpp | 15
-rw-r--r--  lib/Target/XCore/XCoreRegisterInfo.h | 5
-rw-r--r--  lib/Target/XCore/XCoreSubtarget.cpp | 2
-rw-r--r--  lib/Target/XCore/XCoreTargetMachine.cpp | 10
-rw-r--r--  lib/Target/XCore/XCoreTargetMachine.h | 5
-rw-r--r--  lib/Transforms/IPO/ArgumentPromotion.cpp | 43
-rw-r--r--  lib/Transforms/IPO/CMakeLists.txt | 12
-rw-r--r--  lib/Transforms/IPO/ConstantMerge.cpp | 49
-rw-r--r--  lib/Transforms/IPO/DeadArgumentElimination.cpp | 12
-rw-r--r--  lib/Transforms/IPO/FunctionAttrs.cpp | 4
-rw-r--r--  lib/Transforms/IPO/GlobalOpt.cpp | 125
-rw-r--r--  lib/Transforms/IPO/IPConstantPropagation.cpp | 2
-rw-r--r--  lib/Transforms/IPO/IPO.cpp | 15
-rw-r--r--  lib/Transforms/IPO/InlineAlways.cpp | 15
-rw-r--r--  lib/Transforms/IPO/InlineSimple.cpp | 26
-rw-r--r--  lib/Transforms/IPO/Inliner.cpp | 4
-rw-r--r--  lib/Transforms/IPO/LoopExtractor.cpp | 73
-rw-r--r--  lib/Transforms/IPO/LowerSetJmp.cpp | 547
-rw-r--r--  lib/Transforms/IPO/MergeFunctions.cpp | 70
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp | 343
-rw-r--r--  lib/Transforms/IPO/PruneEH.cpp | 5
-rw-r--r--  lib/Transforms/IPO/StripSymbols.cpp | 2
-rw-r--r--  lib/Transforms/InstCombine/CMakeLists.txt | 8
-rw-r--r--  lib/Transforms/InstCombine/InstCombine.h | 15
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAddSub.cpp | 8
-rw-r--r--  lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 53
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp | 192
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCasts.cpp | 112
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCompares.cpp | 495
-rw-r--r--  lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 99
-rw-r--r--  lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 13
-rw-r--r--  lib/Transforms/InstCombine/InstCombinePHI.cpp | 20
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSelect.cpp | 20
-rw-r--r--  lib/Transforms/InstCombine/InstCombineShifts.cpp | 49
-rw-r--r--  lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp | 24
-rw-r--r--  lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 8
-rw-r--r--  lib/Transforms/InstCombine/InstructionCombining.cpp | 630
-rw-r--r--  lib/Transforms/Instrumentation/CMakeLists.txt | 7
-rw-r--r--  lib/Transforms/Instrumentation/EdgeProfiling.cpp | 2
-rw-r--r--  lib/Transforms/Instrumentation/GCOVProfiling.cpp | 372
-rw-r--r--  lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp | 4
-rw-r--r--  lib/Transforms/Instrumentation/PathProfiling.cpp | 29
-rw-r--r--  lib/Transforms/Instrumentation/ProfilingUtils.cpp | 13
-rw-r--r--  lib/Transforms/Scalar/ADCE.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt | 10
-rw-r--r--  lib/Transforms/Scalar/CodeGenPrepare.cpp | 108
-rw-r--r--  lib/Transforms/Scalar/DeadStoreElimination.cpp | 182
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp | 8
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp | 274
-rw-r--r--  lib/Transforms/Scalar/IndVarSimplify.cpp | 1142
-rw-r--r--  lib/Transforms/Scalar/JumpThreading.cpp | 4
-rw-r--r--  lib/Transforms/Scalar/LICM.cpp | 64
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 10
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp | 215
-rw-r--r--  lib/Transforms/Scalar/LoopUnrollPass.cpp | 61
-rw-r--r--  lib/Transforms/Scalar/LoopUnswitch.cpp | 28
-rw-r--r--  lib/Transforms/Scalar/LowerAtomic.cpp | 173
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp | 14
-rw-r--r--  lib/Transforms/Scalar/ObjCARC.cpp | 440
-rw-r--r--  lib/Transforms/Scalar/Reassociate.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/SCCP.cpp | 250
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp | 5
-rw-r--r--  lib/Transforms/Scalar/ScalarReplAggregates.cpp | 598
-rw-r--r--  lib/Transforms/Scalar/SimplifyLibCalls.cpp | 111
-rw-r--r--  lib/Transforms/Scalar/Sink.cpp | 13
-rw-r--r--  lib/Transforms/Scalar/TailDuplication.cpp | 373
-rw-r--r--  lib/Transforms/Utils/AddrModeMatcher.cpp | 4
-rw-r--r--  lib/Transforms/Utils/BasicBlockUtils.cpp | 346
-rw-r--r--  lib/Transforms/Utils/BreakCriticalEdges.cpp | 56
-rw-r--r--  lib/Transforms/Utils/BuildLibCalls.cpp | 10
-rw-r--r--  lib/Transforms/Utils/CMakeLists.txt | 8
-rw-r--r--  lib/Transforms/Utils/CloneFunction.cpp | 13
-rw-r--r--  lib/Transforms/Utils/CloneModule.cpp | 27
-rw-r--r--  lib/Transforms/Utils/CodeExtractor.cpp | 45
-rw-r--r--  lib/Transforms/Utils/InlineFunction.cpp | 178
-rw-r--r--  lib/Transforms/Utils/Local.cpp | 25
-rw-r--r--  lib/Transforms/Utils/LoopSimplify.cpp | 47
-rw-r--r--  lib/Transforms/Utils/LoopUnroll.cpp | 186
-rw-r--r--  lib/Transforms/Utils/LowerExpectIntrinsic.cpp | 4
-rw-r--r--  lib/Transforms/Utils/LowerInvoke.cpp | 37
-rw-r--r--  lib/Transforms/Utils/LowerSwitch.cpp | 4
-rw-r--r--  lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 4
-rw-r--r--  lib/Transforms/Utils/SSAUpdater.cpp | 8
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp | 155
-rw-r--r--  lib/Transforms/Utils/SimplifyIndVar.cpp | 432
-rw-r--r--  lib/Transforms/Utils/ValueMapper.cpp | 5
-rw-r--r--  lib/VMCore/AsmWriter.cpp | 252
-rw-r--r--  lib/VMCore/Attributes.cpp | 6
-rw-r--r--  lib/VMCore/AutoUpgrade.cpp | 395
-rw-r--r--  lib/VMCore/BasicBlock.cpp | 34
-rw-r--r--  lib/VMCore/CMakeLists.txt | 5
-rw-r--r--  lib/VMCore/ConstantFold.cpp | 209
-rw-r--r--  lib/VMCore/ConstantFold.h | 6
-rw-r--r--  lib/VMCore/Constants.cpp | 265
-rw-r--r--  lib/VMCore/ConstantsContext.h | 32
-rw-r--r--  lib/VMCore/Core.cpp | 183
-rw-r--r--  lib/VMCore/DebugLoc.cpp | 4
-rw-r--r--  lib/VMCore/Function.cpp | 43
-rw-r--r--  lib/VMCore/GCOV.cpp | 281
-rw-r--r--  lib/VMCore/Globals.cpp | 8
-rw-r--r--  lib/VMCore/IRBuilder.cpp | 2
-rw-r--r--  lib/VMCore/InlineAsm.cpp | 8
-rw-r--r--  lib/VMCore/Instruction.cpp | 68
-rw-r--r--  lib/VMCore/Instructions.cpp | 674
-rw-r--r--  lib/VMCore/LLVMContextImpl.h | 4
-rw-r--r--  lib/VMCore/Makefile | 4
-rw-r--r--  lib/VMCore/Module.cpp | 48
-rw-r--r--  lib/VMCore/PassManager.cpp | 59
-rw-r--r--  lib/VMCore/PassRegistry.cpp | 1
-rw-r--r--  lib/VMCore/Type.cpp | 95
-rw-r--r--  lib/VMCore/Value.cpp | 8
-rw-r--r--  lib/VMCore/ValueTypes.cpp | 16
-rw-r--r--  lib/VMCore/Verifier.cpp | 284
849 files changed, 63160 insertions, 33915 deletions
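
A note before the diffs: this import is dominated by LLVM 3.0's atomics overhaul. The old llvm.atomic.* intrinsics are gone, replaced by first-class atomicrmw and cmpxchg instructions and by atomic forms of load and store, each carrying an explicit memory ordering. The alias-analysis diffs that follow teach the AA infrastructure about these instructions, and several hunks compare orderings with getOrdering() > Monotonic, which relies on the AtomicOrdering enumerators ascending in strength. A minimal self-contained sketch of that comparison follows; the enumerator order mirrors LLVM 3.0's Instructions.h, but this is an illustration, not LLVM code:

    #include <iostream>

    // Stand-in for LLVM 3.0's AtomicOrdering; only the ascending order matters.
    enum AtomicOrdering {
      NotAtomic,              // ordinary load/store
      Unordered,              // atomic, but with no ordering guarantees
      Monotonic,              // C++11 memory_order_relaxed
      Acquire,
      Release,
      AcquireRelease,
      SequentiallyConsistent
    };

    // Orderings stronger than Monotonic can synchronize with other threads,
    // so analyses must assume they affect memory beyond their own address.
    static bool ordersOtherMemory(AtomicOrdering O) { return O > Monotonic; }

    int main() {
      std::cout << ordersOtherMemory(Monotonic) << "\n";              // 0
      std::cout << ordersOtherMemory(SequentiallyConsistent) << "\n"; // 1
      return 0;
    }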
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index c189a00..bd132c0 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -237,6 +237,19 @@ AliasAnalysis::Location AliasAnalysis::getLocation(const VAArgInst *VI) {
VI->getMetadata(LLVMContext::MD_tbaa));
}
+AliasAnalysis::Location
+AliasAnalysis::getLocation(const AtomicCmpXchgInst *CXI) {
+ return Location(CXI->getPointerOperand(),
+ getTypeStoreSize(CXI->getCompareOperand()->getType()),
+ CXI->getMetadata(LLVMContext::MD_tbaa));
+}
+
+AliasAnalysis::Location
+AliasAnalysis::getLocation(const AtomicRMWInst *RMWI) {
+ return Location(RMWI->getPointerOperand(),
+ getTypeStoreSize(RMWI->getValOperand()->getType()),
+ RMWI->getMetadata(LLVMContext::MD_tbaa));
+}
AliasAnalysis::Location
AliasAnalysis::getLocationForSource(const MemTransferInst *MTI) {
@@ -268,8 +281,8 @@ AliasAnalysis::getLocationForDest(const MemIntrinsic *MTI) {
AliasAnalysis::ModRefResult
AliasAnalysis::getModRefInfo(const LoadInst *L, const Location &Loc) {
- // Be conservative in the face of volatile.
- if (L->isVolatile())
+ // Be conservative in the face of volatile/atomic.
+ if (!L->isUnordered())
return ModRef;
// If the load address doesn't alias the given address, it doesn't read
@@ -283,8 +296,8 @@ AliasAnalysis::getModRefInfo(const LoadInst *L, const Location &Loc) {
AliasAnalysis::ModRefResult
AliasAnalysis::getModRefInfo(const StoreInst *S, const Location &Loc) {
- // Be conservative in the face of volatile.
- if (S->isVolatile())
+ // Be conservative in the face of volatile/atomic.
+ if (!S->isUnordered())
return ModRef;
// If the store address cannot alias the pointer in question, then the
@@ -317,6 +330,33 @@ AliasAnalysis::getModRefInfo(const VAArgInst *V, const Location &Loc) {
return ModRef;
}
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(const AtomicCmpXchgInst *CX, const Location &Loc) {
+ // Acquire/Release cmpxchg has properties that matter for arbitrary addresses.
+ if (CX->getOrdering() > Monotonic)
+ return ModRef;
+
+ // If the cmpxchg address does not alias the location, it does not access it.
+ if (!alias(getLocation(CX), Loc))
+ return NoModRef;
+
+ return ModRef;
+}
+
+AliasAnalysis::ModRefResult
+AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, const Location &Loc) {
+ // Acquire/Release atomicrmw has properties that matter for arbitrary addresses.
+ if (RMW->getOrdering() > Monotonic)
+ return ModRef;
+
+ // If the atomicrmw address does not alias the location, it does not access it.
+ if (!alias(getLocation(RMW), Loc))
+ return NoModRef;
+
+ return ModRef;
+}
+
+
// AliasAnalysis destructor: DO NOT move this to the header file for
// AliasAnalysis or else clients of the AliasAnalysis class may not depend on
// the AliasAnalysis.o file in the current .a file, causing alias analysis
@@ -341,7 +381,7 @@ void AliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
/// getTypeStoreSize - Return the TargetData store size for the given type,
/// if known, or a conservative value otherwise.
///
-uint64_t AliasAnalysis::getTypeStoreSize(const Type *Ty) {
+uint64_t AliasAnalysis::getTypeStoreSize(Type *Ty) {
return TD ? TD->getTypeStoreSize(Ty) : UnknownSize;
}
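
The hunks above make getModRefInfo conservative for anything that is not a simple unordered access: volatile or ordered loads and stores return ModRef outright, and the new cmpxchg/atomicrmw overloads return ModRef unless the operation is Monotonic or weaker and its address provably does not alias the queried location. A compact, self-contained sketch of that decision (local stand-in enums, not the LLVM API; the ModRefResult values mirror LLVM's Mod/Ref bitmask):

    #include <cstdio>

    enum AtomicOrdering { NotAtomic, Unordered, Monotonic, Acquire,
                          Release, AcquireRelease, SequentiallyConsistent };
    enum ModRefResult { NoModRef = 0, Ref = 1, Mod = 2, ModRef = Mod | Ref };

    // Mirrors the new getModRefInfo(AtomicCmpXchgInst*/AtomicRMWInst*) logic.
    ModRefResult modRefForAtomicOp(AtomicOrdering Ord, bool MayAliasLoc) {
      if (Ord > Monotonic)      // acquire/release pairs with other accesses,
        return ModRef;          // so treat it as touching arbitrary memory
      return MayAliasLoc ? ModRef : NoModRef;
    }

    int main() {
      std::printf("%d\n", modRefForAtomicOp(Acquire, false));   // 3 (ModRef)
      std::printf("%d\n", modRefForAtomicOp(Monotonic, false)); // 0 (NoModRef)
      return 0;
    }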
diff --git a/lib/Analysis/AliasAnalysisEvaluator.cpp b/lib/Analysis/AliasAnalysisEvaluator.cpp
index 1afc1b7..37271b9 100644
--- a/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -171,12 +171,12 @@ bool AAEval::runOnFunction(Function &F) {
for (SetVector<Value *>::iterator I1 = Pointers.begin(), E = Pointers.end();
I1 != E; ++I1) {
uint64_t I1Size = AliasAnalysis::UnknownSize;
- const Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType();
+ Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType();
if (I1ElTy->isSized()) I1Size = AA.getTypeStoreSize(I1ElTy);
for (SetVector<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) {
uint64_t I2Size = AliasAnalysis::UnknownSize;
- const Type *I2ElTy =cast<PointerType>((*I2)->getType())->getElementType();
+ Type *I2ElTy =cast<PointerType>((*I2)->getType())->getElementType();
if (I2ElTy->isSized()) I2Size = AA.getTypeStoreSize(I2ElTy);
switch (AA.alias(*I1, I1Size, *I2, I2Size)) {
@@ -207,7 +207,7 @@ bool AAEval::runOnFunction(Function &F) {
for (SetVector<Value *>::iterator V = Pointers.begin(), Ve = Pointers.end();
V != Ve; ++V) {
uint64_t Size = AliasAnalysis::UnknownSize;
- const Type *ElTy = cast<PointerType>((*V)->getType())->getElementType();
+ Type *ElTy = cast<PointerType>((*V)->getType())->getElementType();
if (ElTy->isSized()) Size = AA.getTypeStoreSize(ElTy);
switch (AA.getModRefInfo(*C, *V, Size)) {
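
The evaluator's only change is dropping const from Type pointers. The 3.0 release rewrote the type system (opaque-type refinement is gone, so a Type never mutates after creation), and with const-ness no longer carrying information the qualifier was removed from these APIs wholesale; the same mechanical const Type * to Type * substitution appears throughout this import.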
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 2ed6949..3fcd3b5 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -56,12 +56,12 @@ void AliasSet::mergeSetIn(AliasSet &AS, AliasSetTracker &AST) {
AliasTy = MayAlias;
}
- if (CallSites.empty()) { // Merge call sites...
- if (!AS.CallSites.empty())
- std::swap(CallSites, AS.CallSites);
- } else if (!AS.CallSites.empty()) {
- CallSites.insert(CallSites.end(), AS.CallSites.begin(), AS.CallSites.end());
- AS.CallSites.clear();
+ if (UnknownInsts.empty()) { // Merge call sites...
+ if (!AS.UnknownInsts.empty())
+ std::swap(UnknownInsts, AS.UnknownInsts);
+ } else if (!AS.UnknownInsts.empty()) {
+ UnknownInsts.insert(UnknownInsts.end(), AS.UnknownInsts.begin(), AS.UnknownInsts.end());
+ AS.UnknownInsts.clear();
}
AS.Forward = this; // Forward across AS now...
@@ -123,13 +123,10 @@ void AliasSet::addPointer(AliasSetTracker &AST, PointerRec &Entry,
addRef(); // Entry points to alias set.
}
-void AliasSet::addCallSite(CallSite CS, AliasAnalysis &AA) {
- CallSites.push_back(CS.getInstruction());
+void AliasSet::addUnknownInst(Instruction *I, AliasAnalysis &AA) {
+ UnknownInsts.push_back(I);
- AliasAnalysis::ModRefBehavior Behavior = AA.getModRefBehavior(CS);
- if (Behavior == AliasAnalysis::DoesNotAccessMemory)
- return;
- if (AliasAnalysis::onlyReadsMemory(Behavior)) {
+ if (!I->mayWriteToMemory()) {
AliasTy = MayAlias;
AccessTy |= Refs;
return;
@@ -147,7 +144,7 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
const MDNode *TBAAInfo,
AliasAnalysis &AA) const {
if (AliasTy == MustAlias) {
- assert(CallSites.empty() && "Illegal must alias set!");
+ assert(UnknownInsts.empty() && "Illegal must alias set!");
// If this is a set of MustAliases, only check to see if the pointer aliases
// SOME value in the set.
@@ -167,10 +164,10 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
I.getTBAAInfo())))
return true;
- // Check the call sites list and invoke list...
- if (!CallSites.empty()) {
- for (unsigned i = 0, e = CallSites.size(); i != e; ++i)
- if (AA.getModRefInfo(CallSites[i],
+ // Check the unknown instructions...
+ if (!UnknownInsts.empty()) {
+ for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i)
+ if (AA.getModRefInfo(UnknownInsts[i],
AliasAnalysis::Location(Ptr, Size, TBAAInfo)) !=
AliasAnalysis::NoModRef)
return true;
@@ -179,18 +176,20 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
return false;
}
-bool AliasSet::aliasesCallSite(CallSite CS, AliasAnalysis &AA) const {
- if (AA.doesNotAccessMemory(CS))
+bool AliasSet::aliasesUnknownInst(Instruction *Inst, AliasAnalysis &AA) const {
+ if (!Inst->mayReadOrWriteMemory())
return false;
- for (unsigned i = 0, e = CallSites.size(); i != e; ++i) {
- if (AA.getModRefInfo(getCallSite(i), CS) != AliasAnalysis::NoModRef ||
- AA.getModRefInfo(CS, getCallSite(i)) != AliasAnalysis::NoModRef)
+ for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
+ CallSite C1 = getUnknownInst(i), C2 = Inst;
+ if (!C1 || !C2 ||
+ AA.getModRefInfo(C1, C2) != AliasAnalysis::NoModRef ||
+ AA.getModRefInfo(C2, C1) != AliasAnalysis::NoModRef)
return true;
}
for (iterator I = begin(), E = end(); I != E; ++I)
- if (AA.getModRefInfo(CS, I.getPointer(), I.getSize()) !=
+ if (AA.getModRefInfo(Inst, I.getPointer(), I.getSize()) !=
AliasAnalysis::NoModRef)
return true;
@@ -244,10 +243,10 @@ bool AliasSetTracker::containsPointer(Value *Ptr, uint64_t Size,
-AliasSet *AliasSetTracker::findAliasSetForCallSite(CallSite CS) {
+AliasSet *AliasSetTracker::findAliasSetForUnknownInst(Instruction *Inst) {
AliasSet *FoundSet = 0;
for (iterator I = begin(), E = end(); I != E; ++I) {
- if (I->Forward || !I->aliasesCallSite(CS, AA))
+ if (I->Forward || !I->aliasesUnknownInst(Inst, AA))
continue;
if (FoundSet == 0) // If this is the first alias set ptr can go into.
@@ -296,22 +295,28 @@ bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
bool AliasSetTracker::add(LoadInst *LI) {
+ if (LI->getOrdering() > Monotonic) return addUnknown(LI);
+ AliasSet::AccessType ATy = AliasSet::Refs;
+ if (!LI->isUnordered()) ATy = AliasSet::ModRef;
bool NewPtr;
AliasSet &AS = addPointer(LI->getOperand(0),
AA.getTypeStoreSize(LI->getType()),
LI->getMetadata(LLVMContext::MD_tbaa),
- AliasSet::Refs, NewPtr);
+ ATy, NewPtr);
if (LI->isVolatile()) AS.setVolatile();
return NewPtr;
}
bool AliasSetTracker::add(StoreInst *SI) {
+ if (SI->getOrdering() > Monotonic) return addUnknown(SI);
+ AliasSet::AccessType ATy = AliasSet::Mods;
+ if (!SI->isUnordered()) ATy = AliasSet::ModRef;
bool NewPtr;
Value *Val = SI->getOperand(0);
AliasSet &AS = addPointer(SI->getOperand(1),
AA.getTypeStoreSize(Val->getType()),
SI->getMetadata(LLVMContext::MD_tbaa),
- AliasSet::Mods, NewPtr);
+ ATy, NewPtr);
if (SI->isVolatile()) AS.setVolatile();
return NewPtr;
}
@@ -325,20 +330,20 @@ bool AliasSetTracker::add(VAArgInst *VAAI) {
}
-bool AliasSetTracker::add(CallSite CS) {
- if (isa<DbgInfoIntrinsic>(CS.getInstruction()))
+bool AliasSetTracker::addUnknown(Instruction *Inst) {
+ if (isa<DbgInfoIntrinsic>(Inst))
return true; // Ignore DbgInfo Intrinsics.
- if (AA.doesNotAccessMemory(CS))
+ if (!Inst->mayReadOrWriteMemory())
return true; // doesn't alias anything
- AliasSet *AS = findAliasSetForCallSite(CS);
+ AliasSet *AS = findAliasSetForUnknownInst(Inst);
if (AS) {
- AS->addCallSite(CS, AA);
+ AS->addUnknownInst(Inst, AA);
return false;
}
AliasSets.push_back(new AliasSet());
AS = &AliasSets.back();
- AS->addCallSite(CS, AA);
+ AS->addUnknownInst(Inst, AA);
return true;
}
@@ -348,13 +353,9 @@ bool AliasSetTracker::add(Instruction *I) {
return add(LI);
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return add(SI);
- if (CallInst *CI = dyn_cast<CallInst>(I))
- return add(CI);
- if (InvokeInst *II = dyn_cast<InvokeInst>(I))
- return add(II);
if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
return add(VAAI);
- return true;
+ return addUnknown(I);
}
void AliasSetTracker::add(BasicBlock &BB) {
@@ -375,8 +376,8 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
AliasSet &AS = const_cast<AliasSet&>(*I);
// If there are any call sites in the alias set, add them to this AST.
- for (unsigned i = 0, e = AS.CallSites.size(); i != e; ++i)
- add(AS.CallSites[i]);
+ for (unsigned i = 0, e = AS.UnknownInsts.size(); i != e; ++i)
+ add(AS.UnknownInsts[i]);
// Loop over all of the pointers in this alias set.
bool X;
@@ -393,7 +394,7 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
/// tracker.
void AliasSetTracker::remove(AliasSet &AS) {
// Drop all call sites.
- AS.CallSites.clear();
+ AS.UnknownInsts.clear();
// Clear the alias set.
unsigned NumRefs = 0;
@@ -453,11 +454,11 @@ bool AliasSetTracker::remove(VAArgInst *VAAI) {
return true;
}
-bool AliasSetTracker::remove(CallSite CS) {
- if (AA.doesNotAccessMemory(CS))
+bool AliasSetTracker::removeUnknown(Instruction *I) {
+ if (!I->mayReadOrWriteMemory())
return false; // doesn't alias anything
- AliasSet *AS = findAliasSetForCallSite(CS);
+ AliasSet *AS = findAliasSetForUnknownInst(I);
if (!AS) return false;
remove(*AS);
return true;
@@ -469,11 +470,9 @@ bool AliasSetTracker::remove(Instruction *I) {
return remove(LI);
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return remove(SI);
- if (CallInst *CI = dyn_cast<CallInst>(I))
- return remove(CI);
if (VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
return remove(VAAI);
- return true;
+ return removeUnknown(I);
}
@@ -488,13 +487,13 @@ void AliasSetTracker::deleteValue(Value *PtrVal) {
// If this is a call instruction, remove the callsite from the appropriate
// AliasSet (if present).
- if (CallSite CS = PtrVal) {
- if (!AA.doesNotAccessMemory(CS)) {
+ if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) {
+ if (Inst->mayReadOrWriteMemory()) {
// Scan all the alias sets to see if this call site is contained.
for (iterator I = begin(), E = end(); I != E; ++I) {
if (I->Forward) continue;
- I->removeCallSite(CS);
+ I->removeUnknownInst(Inst);
}
}
}
@@ -571,11 +570,11 @@ void AliasSet::print(raw_ostream &OS) const {
OS << ", " << I.getSize() << ")";
}
}
- if (!CallSites.empty()) {
- OS << "\n " << CallSites.size() << " Call Sites: ";
- for (unsigned i = 0, e = CallSites.size(); i != e; ++i) {
+ if (!UnknownInsts.empty()) {
+ OS << "\n " << UnknownInsts.size() << " Unknown instructions: ";
+ for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
if (i) OS << ", ";
- WriteAsOperand(OS, CallSites[i]);
+ WriteAsOperand(OS, UnknownInsts[i]);
}
}
OS << "\n";
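
AliasSetTracker's side of the change generalizes the old call-site list into an unknown-instructions list and routes the new atomics through it: a load or store ordered stronger than Monotonic is tracked as an unknown instruction, while monotonic, volatile, and atomic-unordered accesses keep pointer entries but with a widened access type. A self-contained sketch of the add(LoadInst) routing shown above (the names are stand-ins, not the LLVM API):

    #include <cstdio>

    enum AtomicOrdering { NotAtomic, Unordered, Monotonic, Acquire,
                          Release, AcquireRelease, SequentiallyConsistent };
    enum AccessType { Refs, Mods, ModRefAccess };  // AliasSet's access kinds

    struct LoadDesc { AtomicOrdering Ord; bool Volatile; };

    // Returns true when the load belongs on the unknown-instructions list;
    // otherwise sets ATy to the access type recorded for its pointer entry.
    bool routeLoad(const LoadDesc &L, AccessType &ATy) {
      if (L.Ord > Monotonic)
        return true;                          // addUnknown(LI)
      // LoadInst::isUnordered(): ordering <= Unordered and not volatile
      bool Simple = !L.Volatile && L.Ord <= Unordered;
      ATy = Simple ? Refs : ModRefAccess;     // monotonic/volatile widen it
      return false;
    }

    int main() {
      AccessType ATy = Refs;
      LoadDesc SC = { SequentiallyConsistent, false };
      LoadDesc Plain = { NotAtomic, false };
      std::printf("%d\n", routeLoad(SC, ATy));    // 1: unknown instruction
      std::printf("%d\n", routeLoad(Plain, ATy)); // 0: pointer set, ATy == Refs
      return 0;
    }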
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 71e0a83..0ba6af9 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm-c/Analysis.h"
+#include "llvm-c/Initialization.h"
#include "llvm/InitializePasses.h"
#include "llvm/Analysis/Verifier.h"
#include <cstring>
@@ -23,7 +24,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeAliasSetPrinterPass(Registry);
initializeNoAAPass(Registry);
initializeBasicAliasAnalysisPass(Registry);
- initializeBlockFrequencyPass(Registry);
+ initializeBlockFrequencyInfoPass(Registry);
initializeBranchProbabilityInfoPass(Registry);
initializeCFGViewerPass(Registry);
initializeCFGPrinterPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 8330ea7..af400ba 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -30,6 +30,7 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ErrorHandling.h"
@@ -100,7 +101,7 @@ static bool isEscapeSource(const Value *V) {
/// getObjectSize - Return the size of the object specified by V, or
/// UnknownSize if unknown.
static uint64_t getObjectSize(const Value *V, const TargetData &TD) {
- const Type *AccessTy;
+ Type *AccessTy;
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
if (!GV->hasDefinitiveInitializer())
return AliasAnalysis::UnknownSize;
@@ -317,7 +318,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
E = GEPOp->op_end(); I != E; ++I) {
Value *Index = *I;
// Compute the (potentially symbolic) offset in bytes for this index.
- if (const StructType *STy = dyn_cast<StructType>(*GTI++)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI++)) {
// For a struct, add the member offset.
unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
if (FieldNo == 0) continue;
@@ -374,7 +375,8 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
}
if (Scale) {
- VariableGEPIndex Entry = {Index, Extension, Scale};
+ VariableGEPIndex Entry = {Index, Extension,
+ static_cast<int64_t>(Scale)};
VarIndices.push_back(Entry);
}
}
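The added static_cast is not cosmetic: the local Scale accumulator is unsigned while VariableGEPIndex::Scale is signed, so the braced initializer would otherwise perform an implicit sign-changing conversion that compilers warn about (and that C++11 rejects as narrowing). A standalone illustration, with a hypothetical struct standing in for VariableGEPIndex:

    #include <stdint.h>

    // Hypothetical stand-in for the signed field being initialized.
    struct Entry { int64_t Scale; };

    Entry makeEntry(uint64_t Scale) {
      // Without the cast, C++11 rejects the narrowing in a braced initializer
      // and older compilers warn about the implicit signedness change.
      Entry E = { static_cast<int64_t>(Scale) };
      return E;
    }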
@@ -467,6 +469,7 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AliasAnalysis>();
+ AU.addRequired<TargetLibraryInfo>();
}
virtual AliasResult alias(const Location &LocA,
@@ -549,10 +552,15 @@ namespace {
// Register this pass...
char BasicAliasAnalysis::ID = 0;
-INITIALIZE_AG_PASS(BasicAliasAnalysis, AliasAnalysis, "basicaa",
+INITIALIZE_AG_PASS_BEGIN(BasicAliasAnalysis, AliasAnalysis, "basicaa",
+ "Basic Alias Analysis (stateless AA impl)",
+ false, true, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_AG_PASS_END(BasicAliasAnalysis, AliasAnalysis, "basicaa",
"Basic Alias Analysis (stateless AA impl)",
false, true, false)
+
ImmutablePass *llvm::createBasicAliasAnalysisPass() {
return new BasicAliasAnalysis();
}
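The BEGIN/DEPENDENCY/END macro triple is the pass-registration counterpart of the new addRequired<TargetLibraryInfo>() above: it records that TargetLibraryInfo must be initialized before basicaa can be, so the later getAnalysis<TargetLibraryInfo>() call in getModRefInfo can rely on the analysis being available.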
@@ -706,7 +714,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
// is impossible to alias the pointer we're checking. If not, we have to
// assume that the call could touch the pointer, even though it doesn't
// escape.
- if (!isNoAlias(Location(cast<Value>(CI)), Loc)) {
+ if (!isNoAlias(Location(*CI), Location(Object))) {
PassedAsArg = true;
break;
}
@@ -716,6 +724,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
return NoModRef;
}
+ const TargetLibraryInfo &TLI = getAnalysis<TargetLibraryInfo>();
ModRefResult Min = ModRef;
// Finally, handle specific knowledge of intrinsics.
@@ -754,26 +763,6 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
// We know that memset doesn't load anything.
Min = Mod;
break;
- case Intrinsic::atomic_cmp_swap:
- case Intrinsic::atomic_swap:
- case Intrinsic::atomic_load_add:
- case Intrinsic::atomic_load_sub:
- case Intrinsic::atomic_load_and:
- case Intrinsic::atomic_load_nand:
- case Intrinsic::atomic_load_or:
- case Intrinsic::atomic_load_xor:
- case Intrinsic::atomic_load_max:
- case Intrinsic::atomic_load_min:
- case Intrinsic::atomic_load_umax:
- case Intrinsic::atomic_load_umin:
- if (TD) {
- Value *Op1 = II->getArgOperand(0);
- uint64_t Op1Size = TD->getTypeStoreSize(Op1->getType());
- MDNode *Tag = II->getMetadata(LLVMContext::MD_tbaa);
- if (isNoAlias(Location(Op1, Op1Size, Tag), Loc))
- return NoModRef;
- }
- break;
case Intrinsic::lifetime_start:
case Intrinsic::lifetime_end:
case Intrinsic::invariant_start: {
@@ -818,6 +807,39 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
}
}
+ // We can bound the aliasing properties of memset_pattern16 just as we can
+ // for memcpy/memset. This is particularly important because the
+ // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
+ // whenever possible.
+ else if (TLI.has(LibFunc::memset_pattern16) &&
+ CS.getCalledFunction() &&
+ CS.getCalledFunction()->getName() == "memset_pattern16") {
+ const Function *MS = CS.getCalledFunction();
+ FunctionType *MemsetType = MS->getFunctionType();
+ if (!MemsetType->isVarArg() && MemsetType->getNumParams() == 3 &&
+ isa<PointerType>(MemsetType->getParamType(0)) &&
+ isa<PointerType>(MemsetType->getParamType(1)) &&
+ isa<IntegerType>(MemsetType->getParamType(2))) {
+ uint64_t Len = UnknownSize;
+ if (const ConstantInt *LenCI = dyn_cast<ConstantInt>(CS.getArgument(2)))
+ Len = LenCI->getZExtValue();
+ const Value *Dest = CS.getArgument(0);
+ const Value *Src = CS.getArgument(1);
+ // If it can't overlap either the source or the dest, then it doesn't modref the loc.
+ if (isNoAlias(Location(Dest, Len), Loc)) {
+ // Always reads 16 bytes of the source.
+ if (isNoAlias(Location(Src, 16), Loc))
+ return NoModRef;
+ // If it can't overlap the dest, then worst case it reads the loc.
+ Min = Ref;
+ // Always reads 16 bytes of the source.
+ } else if (isNoAlias(Location(Src, 16), Loc)) {
+ // If it can't overlap the source, then worst case it mutates the loc.
+ Min = Mod;
+ }
+ }
+ }
+
// The AliasAnalysis base class has some smarts, let's use them.
return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min);
}
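In prose: memset_pattern16(dst, pattern16, len) writes len bytes at dst and reads exactly 16 bytes at pattern16, so two isNoAlias queries suffice to bound its effect on any queried location. A hedged sketch of that case analysis (the helper name is assumed; it mirrors the hunk above):

    #include "llvm/Analysis/AliasAnalysis.h"

    using namespace llvm;

    // Sketch of the Mod/Ref bound for
    //   void memset_pattern16(void *dst, const void *pattern16, size_t len);
    // which writes Len bytes at Dst and reads exactly 16 bytes at Src.
    static AliasAnalysis::ModRefResult
    boundMemsetPattern16(AliasAnalysis &AA, const AliasAnalysis::Location &Loc,
                         const Value *Dst, const Value *Src, uint64_t Len) {
      bool MissesDst = AA.isNoAlias(AliasAnalysis::Location(Dst, Len), Loc);
      bool MissesSrc = AA.isNoAlias(AliasAnalysis::Location(Src, 16), Loc);
      if (MissesDst && MissesSrc)
        return AliasAnalysis::NoModRef; // touches neither part of Loc
      if (MissesDst)
        return AliasAnalysis::Ref;      // can only read from Loc
      if (MissesSrc)
        return AliasAnalysis::Mod;      // can only write to Loc
      return AliasAnalysis::ModRef;     // may do both
    }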
@@ -913,43 +935,43 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
if (GEP1BaseOffset == 0 && GEP1VariableIndices.empty())
return MustAlias;
- // If there is a difference between the pointers, but the difference is
- // less than the size of the associated memory object, then we know
- // that the objects are partially overlapping.
+ // If there is a constant difference between the pointers, but the difference
+ // is less than the size of the associated memory object, then we know
+ // that the objects are partially overlapping. If the difference is
+ // greater, we know they do not overlap.
if (GEP1BaseOffset != 0 && GEP1VariableIndices.empty()) {
- if (GEP1BaseOffset >= 0 ?
- (V2Size != UnknownSize && (uint64_t)GEP1BaseOffset < V2Size) :
- (V1Size != UnknownSize && -(uint64_t)GEP1BaseOffset < V1Size &&
- GEP1BaseOffset != INT64_MIN))
- return PartialAlias;
+ if (GEP1BaseOffset >= 0) {
+ if (V2Size != UnknownSize) {
+ if ((uint64_t)GEP1BaseOffset < V2Size)
+ return PartialAlias;
+ return NoAlias;
+ }
+ } else {
+ if (V1Size != UnknownSize) {
+ if (-(uint64_t)GEP1BaseOffset < V1Size)
+ return PartialAlias;
+ return NoAlias;
+ }
+ }
}
- // If we have a known constant offset, see if this offset is larger than the
- // access size being queried. If so, and if no variable indices can remove
- // pieces of this constant, then we know we have a no-alias. For example,
- // &A[100] != &A.
-
- // In order to handle cases like &A[100][i] where i is an out of range
- // subscript, we have to ignore all constant offset pieces that are a multiple
- // of a scaled index. Do this by removing constant offsets that are a
- // multiple of any of our variable indices. This allows us to transform
- // things like &A[i][1] because i has a stride of (e.g.) 8 bytes but the 1
- // provides an offset of 4 bytes (assuming a <= 4 byte access).
- for (unsigned i = 0, e = GEP1VariableIndices.size();
- i != e && GEP1BaseOffset;++i)
- if (int64_t RemovedOffset = GEP1BaseOffset/GEP1VariableIndices[i].Scale)
- GEP1BaseOffset -= RemovedOffset*GEP1VariableIndices[i].Scale;
-
- // If our known offset is bigger than the access size, we know we don't have
- // an alias.
- if (GEP1BaseOffset) {
- if (GEP1BaseOffset >= 0 ?
- (V2Size != UnknownSize && (uint64_t)GEP1BaseOffset >= V2Size) :
- (V1Size != UnknownSize && -(uint64_t)GEP1BaseOffset >= V1Size &&
- GEP1BaseOffset != INT64_MIN))
+ // Try to distinguish something like &A[i][1] against &A[42][0].
+ // Grab the least significant bit set in any of the scales.
+ if (!GEP1VariableIndices.empty()) {
+ uint64_t Modulo = 0;
+ for (unsigned i = 0, e = GEP1VariableIndices.size(); i != e; ++i)
+ Modulo |= (uint64_t)GEP1VariableIndices[i].Scale;
+ Modulo = Modulo ^ (Modulo & (Modulo - 1));
+
+ // We can compute the difference between the two addresses
+ // mod Modulo. Check whether that difference guarantees that the
+ // two locations do not alias.
+ uint64_t ModOffset = (uint64_t)GEP1BaseOffset & (Modulo - 1);
+ if (V1Size != UnknownSize && V2Size != UnknownSize &&
+ ModOffset >= V2Size && V1Size <= Modulo - ModOffset)
return NoAlias;
}
-
+
// Statically, we can see that the base objects are the same, but the
// pointers have dynamic offsets which we can't resolve. And none of our
// little tricks above worked.
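The `Modulo ^ (Modulo & (Modulo - 1))` expression isolates the lowest set bit of the OR of all scales, so every variable contribution to the address is a multiple of Modulo and the constant offset is therefore fully known mod Modulo. A worked instance with assumed sizes, matching the &A[i][1] vs. &A[42][0] comment:

    #include <assert.h>
    #include <stdint.h>

    // Worked example with assumed layout: rows of A are 8 bytes and elements
    // are 4 bytes, so &A[i][1] is (8*i + 4) while &A[42][0] is a multiple of 8.
    int main() {
      uint64_t Scales = 8;                                // only variable scale
      uint64_t Modulo = Scales ^ (Scales & (Scales - 1)); // lowest set bit: 8
      int64_t  GEP1BaseOffset = 4;                        // the constant "+1"
      uint64_t V1Size = 4, V2Size = 4;                    // 4-byte accesses
      uint64_t ModOffset = (uint64_t)GEP1BaseOffset & (Modulo - 1); // == 4
      // One access covers [0, V2Size) of every 8-byte stride, the other covers
      // [4, 4 + V1Size); since 4 >= 4 and 4 <= 8 - 4, they can never overlap.
      assert(ModOffset >= V2Size && V1Size <= Modulo - ModOffset);
      return 0;
    }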
diff --git a/lib/Analysis/BlockFrequency.cpp b/lib/Analysis/BlockFrequency.cpp
deleted file mode 100644
index 4b86d1d..0000000
--- a/lib/Analysis/BlockFrequency.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-//=======-------- BlockFrequency.cpp - Block Frequency Analysis -------=======//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Loops should be simplified before this analysis.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Analysis/BlockFrequencyImpl.h"
-#include "llvm/Analysis/BlockFrequency.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-
-using namespace llvm;
-
-INITIALIZE_PASS_BEGIN(BlockFrequency, "block-freq", "Block Frequency Analysis",
- true, true)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
-INITIALIZE_PASS_END(BlockFrequency, "block-freq", "Block Frequency Analysis",
- true, true)
-
-char BlockFrequency::ID = 0;
-
-
-BlockFrequency::BlockFrequency() : FunctionPass(ID) {
- initializeBlockFrequencyPass(*PassRegistry::getPassRegistry());
- BFI = new BlockFrequencyImpl<BasicBlock, Function, BranchProbabilityInfo>();
-}
-
-BlockFrequency::~BlockFrequency() {
- delete BFI;
-}
-
-void BlockFrequency::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<BranchProbabilityInfo>();
- AU.setPreservesAll();
-}
-
-bool BlockFrequency::runOnFunction(Function &F) {
- BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
- BFI->doFunction(&F, &BPI);
- return false;
-}
-
-/// getblockFreq - Return block frequency. Never return 0, value must be
-/// positive. Please note that initial frequency is equal to 1024. It means that
-/// we should not rely on the value itself, but only on the comparison to the
-/// other block frequencies. We do this to avoid using of floating points.
-///
-uint32_t BlockFrequency::getBlockFreq(BasicBlock *BB) {
- return BFI->getBlockFreq(BB);
-}
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
new file mode 100644
index 0000000..d16665f
--- /dev/null
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -0,0 +1,63 @@
+//===-- BlockFrequencyInfo.cpp - Block Frequency Analysis -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Analysis/BlockFrequencyImpl.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
+ true, true)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
+INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
+ true, true)
+
+char BlockFrequencyInfo::ID = 0;
+
+
+BlockFrequencyInfo::BlockFrequencyInfo() : FunctionPass(ID) {
+ initializeBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
+ BFI = new BlockFrequencyImpl<BasicBlock, Function, BranchProbabilityInfo>();
+}
+
+BlockFrequencyInfo::~BlockFrequencyInfo() {
+ delete BFI;
+}
+
+void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<BranchProbabilityInfo>();
+ AU.setPreservesAll();
+}
+
+bool BlockFrequencyInfo::runOnFunction(Function &F) {
+ BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
+ BFI->doFunction(&F, &BPI);
+ return false;
+}
+
+void BlockFrequencyInfo::print(raw_ostream &O, const Module *) const {
+ if (BFI) BFI->print(O);
+}
+
+/// getBlockFreq - Return block frequency. Return 0 if we don't have the
+/// information. Please note that the initial frequency is equal to 1024. This
+/// means we should not rely on the value itself, but only on comparisons to
+/// the other block frequencies. We do this to avoid using floating point.
+///
+BlockFrequency BlockFrequencyInfo::getBlockFreq(BasicBlock *BB) const {
+ return BFI->getBlockFreq(BB);
+}
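A hedged sketch of a downstream pass consuming the renamed analysis (boilerplate and names assumed; pass registration omitted). Since the entry frequency starts at 1024, only comparisons between frequencies are meaningful:

    #include "llvm/Analysis/BlockFrequencyInfo.h"
    #include "llvm/Function.h"
    #include "llvm/Pass.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    namespace {
    // Illustrative pass: report blocks estimated hotter than the entry block.
    struct HotBlockPrinter : public FunctionPass {
      static char ID;
      HotBlockPrinter() : FunctionPass(ID) {}
      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
        AU.addRequired<BlockFrequencyInfo>();
        AU.setPreservesAll();
      }
      virtual bool runOnFunction(Function &F) {
        BlockFrequencyInfo &BFI = getAnalysis<BlockFrequencyInfo>();
        uint64_t Entry = BFI.getBlockFreq(&F.getEntryBlock()).getFrequency();
        for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
          if (BFI.getBlockFreq(&*BB).getFrequency() > Entry)
            errs() << BB->getName() << " is hotter than entry\n";
        return false;
      }
    };
    char HotBlockPrinter::ID = 0;
    }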
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index e39cd22..bde3b76 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -11,7 +11,10 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Constants.h"
#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Metadata.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Support/Debug.h"
@@ -33,7 +36,7 @@ namespace {
// private methods are hidden in the .cpp file.
class BranchProbabilityAnalysis {
- typedef std::pair<BasicBlock *, BasicBlock *> Edge;
+ typedef std::pair<const BasicBlock *, const BasicBlock *> Edge;
DenseMap<Edge, uint32_t> *Weights;
@@ -52,7 +55,7 @@ class BranchProbabilityAnalysis {
// V
// BB1<-+
// | |
- // | | (Weight = 128)
+ // | | (Weight = 124)
// V |
// BB2--+
// |
@@ -60,12 +63,21 @@ class BranchProbabilityAnalysis {
// V
// BB3
//
- // Probability of the edge BB2->BB1 = 128 / (128 + 4) = 0.9696..
- // Probability of the edge BB2->BB3 = 4 / (128 + 4) = 0.0303..
+ // Probability of the edge BB2->BB1 = 124 / (124 + 4) = 0.96875
+ // Probability of the edge BB2->BB3 = 4 / (124 + 4) = 0.03125
- static const uint32_t LBH_TAKEN_WEIGHT = 128;
+ static const uint32_t LBH_TAKEN_WEIGHT = 124;
static const uint32_t LBH_NONTAKEN_WEIGHT = 4;
+ static const uint32_t RH_TAKEN_WEIGHT = 24;
+ static const uint32_t RH_NONTAKEN_WEIGHT = 8;
+
+ static const uint32_t PH_TAKEN_WEIGHT = 20;
+ static const uint32_t PH_NONTAKEN_WEIGHT = 12;
+
+ static const uint32_t ZH_TAKEN_WEIGHT = 20;
+ static const uint32_t ZH_NONTAKEN_WEIGHT = 12;
+
// Standard weight value. Used when none of the heuristics set weight for
// the edge.
static const uint32_t NORMAL_WEIGHT = 16;
@@ -100,29 +112,6 @@ class BranchProbabilityAnalysis {
return false;
}
- // Multiply Edge Weight by two.
- void incEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
- uint32_t Weight = BP->getEdgeWeight(Src, Dst);
- uint32_t MaxWeight = getMaxWeightFor(Src);
-
- if (Weight * 2 > MaxWeight)
- BP->setEdgeWeight(Src, Dst, MaxWeight);
- else
- BP->setEdgeWeight(Src, Dst, Weight * 2);
- }
-
- // Divide Edge Weight by two.
- void decEdgeWeight(BasicBlock *Src, BasicBlock *Dst) {
- uint32_t Weight = BP->getEdgeWeight(Src, Dst);
-
- assert(Weight > 0);
- if (Weight / 2 < MIN_WEIGHT)
- BP->setEdgeWeight(Src, Dst, MIN_WEIGHT);
- else
- BP->setEdgeWeight(Src, Dst, Weight / 2);
- }
-
-
uint32_t getMaxWeightFor(BasicBlock *BB) const {
return UINT32_MAX / BB->getTerminator()->getNumSuccessors();
}
@@ -133,49 +122,119 @@ public:
: Weights(W), BP(BP), LI(LI) {
}
+ // Metadata Weights
+ bool calcMetadataWeights(BasicBlock *BB);
+
// Return Heuristics
- void calcReturnHeuristics(BasicBlock *BB);
+ bool calcReturnHeuristics(BasicBlock *BB);
// Pointer Heuristics
- void calcPointerHeuristics(BasicBlock *BB);
+ bool calcPointerHeuristics(BasicBlock *BB);
// Loop Branch Heuristics
- void calcLoopBranchHeuristics(BasicBlock *BB);
+ bool calcLoopBranchHeuristics(BasicBlock *BB);
+
+ // Zero Heuristics
+ bool calcZeroHeuristics(BasicBlock *BB);
bool runOnFunction(Function &F);
};
} // end anonymous namespace
+// Propagate existing explicit probabilities from either profile data or
+// 'expect' intrinsic processing.
+bool BranchProbabilityAnalysis::calcMetadataWeights(BasicBlock *BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ if (TI->getNumSuccessors() == 1)
+ return false;
+ if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+ return false;
+
+ MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
+ if (!WeightsNode)
+ return false;
+
+ // Ensure there are weights for all of the successors. Note that the first
+ // operand to the metadata node is a name, not a weight.
+ if (WeightsNode->getNumOperands() != TI->getNumSuccessors() + 1)
+ return false;
+
+ // Build up the final weights in a temporary buffer, but don't add them
+ // until all weights are present. Each weight value is clamped to
+ // [1, getMaxWeightFor(BB)].
+ uint32_t WeightLimit = getMaxWeightFor(BB);
+ SmallVector<uint32_t, 2> Weights;
+ Weights.reserve(TI->getNumSuccessors());
+ for (unsigned i = 1, e = WeightsNode->getNumOperands(); i != e; ++i) {
+ ConstantInt *Weight = dyn_cast<ConstantInt>(WeightsNode->getOperand(i));
+ if (!Weight)
+ return false;
+ Weights.push_back(
+ std::max<uint32_t>(1, Weight->getLimitedValue(WeightLimit)));
+ }
+ assert(Weights.size() == TI->getNumSuccessors() && "Checked above");
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ BP->setEdgeWeight(BB, TI->getSuccessor(i), Weights[i]);
+
+ return true;
+}
+
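The weights consumed here are ordinary !prof metadata whose first operand is the name "branch_weights". A hedged sketch of producing such a node with the 3.0 C++ API (the helper name and the 64:4 weights are chosen arbitrarily for illustration):

    #include "llvm/Constants.h"
    #include "llvm/Instructions.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/Metadata.h"

    using namespace llvm;

    // Illustrative helper: mark a conditional branch as 64:4 in favor of its
    // first successor, in the shape calcMetadataWeights() expects: one name
    // operand followed by one weight per successor, each clamped to >= 1.
    static void annotateLikelyTaken(BranchInst *BI) {
      LLVMContext &Ctx = BI->getContext();
      Value *Ops[] = {
        MDString::get(Ctx, "branch_weights"),        // operand 0 is the name
        ConstantInt::get(Type::getInt32Ty(Ctx), 64), // weight of successor 0
        ConstantInt::get(Type::getInt32Ty(Ctx), 4)   // weight of successor 1
      };
      BI->setMetadata(LLVMContext::MD_prof, MDNode::get(Ctx, Ops));
    }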
// Calculate Edge Weights using "Return Heuristics". Predict a successor which
// leads directly to Return Instruction will not be taken.
-void BranchProbabilityAnalysis::calcReturnHeuristics(BasicBlock *BB){
+bool BranchProbabilityAnalysis::calcReturnHeuristics(BasicBlock *BB){
if (BB->getTerminator()->getNumSuccessors() == 1)
- return;
+ return false;
+
+ SmallPtrSet<BasicBlock *, 4> ReturningEdges;
+ SmallPtrSet<BasicBlock *, 4> StayEdges;
for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
BasicBlock *Succ = *I;
- if (isReturningBlock(Succ)) {
- decEdgeWeight(BB, Succ);
+ if (isReturningBlock(Succ))
+ ReturningEdges.insert(Succ);
+ else
+ StayEdges.insert(Succ);
+ }
+
+ if (uint32_t numStayEdges = StayEdges.size()) {
+ uint32_t stayWeight = RH_TAKEN_WEIGHT / numStayEdges;
+ if (stayWeight < NORMAL_WEIGHT)
+ stayWeight = NORMAL_WEIGHT;
+
+ for (SmallPtrSet<BasicBlock *, 4>::iterator I = StayEdges.begin(),
+ E = StayEdges.end(); I != E; ++I)
+ BP->setEdgeWeight(BB, *I, stayWeight);
+ }
+
+ if (uint32_t numRetEdges = ReturningEdges.size()) {
+ uint32_t retWeight = RH_NONTAKEN_WEIGHT / numRetEdges;
+ if (retWeight < MIN_WEIGHT)
+ retWeight = MIN_WEIGHT;
+ for (SmallPtrSet<BasicBlock *, 4>::iterator I = ReturningEdges.begin(),
+ E = ReturningEdges.end(); I != E; ++I) {
+ BP->setEdgeWeight(BB, *I, retWeight);
}
}
+
+ return ReturningEdges.size() > 0;
}
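Worked instance of the new weighting: for a conditional branch with one non-returning and one returning successor, the stay edge gets max(RH_TAKEN_WEIGHT / 1, NORMAL_WEIGHT) = max(24, 16) = 24, and the return edge gets RH_NONTAKEN_WEIGHT / 1 = 8 (clamped below by MIN_WEIGHT), so the non-returning path is predicted taken with probability 24 / (24 + 8) = 75%.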
// Calculate Edge Weights using "Pointer Heuristics". Predict a comparsion
// between two pointer or pointer and NULL will fail.
-void BranchProbabilityAnalysis::calcPointerHeuristics(BasicBlock *BB) {
+bool BranchProbabilityAnalysis::calcPointerHeuristics(BasicBlock *BB) {
BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
- return;
+ return false;
Value *Cond = BI->getCondition();
ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
if (!CI || !CI->isEquality())
- return;
+ return false;
Value *LHS = CI->getOperand(0);
if (!LHS->getType()->isPointerTy())
- return;
+ return false;
assert(CI->getOperand(1)->getType()->isPointerTy());
@@ -190,29 +249,35 @@ void BranchProbabilityAnalysis::calcPointerHeuristics(BasicBlock *BB) {
if (!isProb)
std::swap(Taken, NonTaken);
- incEdgeWeight(BB, Taken);
- decEdgeWeight(BB, NonTaken);
+ BP->setEdgeWeight(BB, Taken, PH_TAKEN_WEIGHT);
+ BP->setEdgeWeight(BB, NonTaken, PH_NONTAKEN_WEIGHT);
+ return true;
}
// Calculate Edge Weights using "Loop Branch Heuristics". Predict backedges
// as taken, exiting edges as not-taken.
-void BranchProbabilityAnalysis::calcLoopBranchHeuristics(BasicBlock *BB) {
+bool BranchProbabilityAnalysis::calcLoopBranchHeuristics(BasicBlock *BB) {
uint32_t numSuccs = BB->getTerminator()->getNumSuccessors();
Loop *L = LI->getLoopFor(BB);
if (!L)
- return;
+ return false;
+
+ SmallPtrSet<BasicBlock *, 8> BackEdges;
+ SmallPtrSet<BasicBlock *, 8> ExitingEdges;
+ SmallPtrSet<BasicBlock *, 8> InEdges; // Edges from the header into the loop.
- SmallVector<BasicBlock *, 8> BackEdges;
- SmallVector<BasicBlock *, 8> ExitingEdges;
+ bool isHeader = BB == L->getHeader();
for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
BasicBlock *Succ = *I;
Loop *SuccL = LI->getLoopFor(Succ);
if (SuccL != L)
- ExitingEdges.push_back(Succ);
+ ExitingEdges.insert(Succ);
else if (Succ == L->getHeader())
- BackEdges.push_back(Succ);
+ BackEdges.insert(Succ);
+ else if (isHeader)
+ InEdges.insert(Succ);
}
if (uint32_t numBackEdges = BackEdges.size()) {
@@ -220,39 +285,121 @@ void BranchProbabilityAnalysis::calcLoopBranchHeuristics(BasicBlock *BB) {
if (backWeight < NORMAL_WEIGHT)
backWeight = NORMAL_WEIGHT;
- for (SmallVector<BasicBlock *, 8>::iterator EI = BackEdges.begin(),
+ for (SmallPtrSet<BasicBlock *, 8>::iterator EI = BackEdges.begin(),
EE = BackEdges.end(); EI != EE; ++EI) {
BasicBlock *Back = *EI;
BP->setEdgeWeight(BB, Back, backWeight);
}
}
+ if (uint32_t numInEdges = InEdges.size()) {
+ uint32_t inWeight = LBH_TAKEN_WEIGHT / numInEdges;
+ if (inWeight < NORMAL_WEIGHT)
+ inWeight = NORMAL_WEIGHT;
+
+ for (SmallPtrSet<BasicBlock *, 8>::iterator EI = InEdges.begin(),
+ EE = InEdges.end(); EI != EE; ++EI) {
+ BasicBlock *Back = *EI;
+ BP->setEdgeWeight(BB, Back, inWeight);
+ }
+ }
+
uint32_t numExitingEdges = ExitingEdges.size();
if (uint32_t numNonExitingEdges = numSuccs - numExitingEdges) {
uint32_t exitWeight = LBH_NONTAKEN_WEIGHT / numNonExitingEdges;
if (exitWeight < MIN_WEIGHT)
exitWeight = MIN_WEIGHT;
- for (SmallVector<BasicBlock *, 8>::iterator EI = ExitingEdges.begin(),
+ for (SmallPtrSet<BasicBlock *, 8>::iterator EI = ExitingEdges.begin(),
EE = ExitingEdges.end(); EI != EE; ++EI) {
BasicBlock *Exiting = *EI;
BP->setEdgeWeight(BB, Exiting, exitWeight);
}
}
+
+ return true;
}
+bool BranchProbabilityAnalysis::calcZeroHeuristics(BasicBlock *BB) {
+ BranchInst * BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ Value *Cond = BI->getCondition();
+ ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+ if (!CI)
+ return false;
+
+ Value *RHS = CI->getOperand(1);
+ ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
+ if (!CV)
+ return false;
+
+ bool isProb;
+ if (CV->isZero()) {
+ switch (CI->getPredicate()) {
+ case CmpInst::ICMP_EQ:
+ // X == 0 -> Unlikely
+ isProb = false;
+ break;
+ case CmpInst::ICMP_NE:
+ // X != 0 -> Likely
+ isProb = true;
+ break;
+ case CmpInst::ICMP_SLT:
+ // X < 0 -> Unlikely
+ isProb = false;
+ break;
+ case CmpInst::ICMP_SGT:
+ // X > 0 -> Likely
+ isProb = true;
+ break;
+ default:
+ return false;
+ }
+ } else if (CV->isOne() && CI->getPredicate() == CmpInst::ICMP_SLT) {
+ // InstCombine canonicalizes X <= 0 into X < 1.
+ // X <= 0 -> Unlikely
+ isProb = false;
+ } else if (CV->isAllOnesValue() && CI->getPredicate() == CmpInst::ICMP_SGT) {
+ // InstCombine canonicalizes X >= 0 into X > -1.
+ // X >= 0 -> Likely
+ isProb = true;
+ } else {
+ return false;
+ }
+
+ BasicBlock *Taken = BI->getSuccessor(0);
+ BasicBlock *NonTaken = BI->getSuccessor(1);
+
+ if (!isProb)
+ std::swap(Taken, NonTaken);
+
+ BP->setEdgeWeight(BB, Taken, ZH_TAKEN_WEIGHT);
+ BP->setEdgeWeight(BB, NonTaken, ZH_NONTAKEN_WEIGHT);
+
+ return true;
+}
+
+
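For example, given `if (x == 0)` the ICMP_EQ case sets isProb to false, so the successors swap: the else block becomes Taken with weight ZH_TAKEN_WEIGHT = 20 and the then block becomes NonTaken with weight ZH_NONTAKEN_WEIGHT = 12, i.e. the comparison is predicted to fail with probability 20 / (20 + 12) = 62.5%.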
bool BranchProbabilityAnalysis::runOnFunction(Function &F) {
for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
BasicBlock *BB = I++;
- // Only LBH uses setEdgeWeight method.
- calcLoopBranchHeuristics(BB);
+ if (calcMetadataWeights(BB))
+ continue;
+
+ if (calcLoopBranchHeuristics(BB))
+ continue;
- // PH and RH use only incEdgeWeight and decEwdgeWeight methods to
- // not efface LBH results.
- calcPointerHeuristics(BB);
- calcReturnHeuristics(BB);
+ if (calcReturnHeuristics(BB))
+ continue;
+
+ if (calcPointerHeuristics(BB))
+ continue;
+
+ calcZeroHeuristics(BB);
}
return false;
@@ -269,11 +416,11 @@ bool BranchProbabilityInfo::runOnFunction(Function &F) {
return BPA.runOnFunction(F);
}
-uint32_t BranchProbabilityInfo::getSumForBlock(BasicBlock *BB) const {
+uint32_t BranchProbabilityInfo::getSumForBlock(const BasicBlock *BB) const {
uint32_t Sum = 0;
- for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
- BasicBlock *Succ = *I;
+ for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ const BasicBlock *Succ = *I;
uint32_t Weight = getEdgeWeight(BB, Succ);
uint32_t PrevSum = Sum;
@@ -284,7 +431,8 @@ uint32_t BranchProbabilityInfo::getSumForBlock(BasicBlock *BB) const {
return Sum;
}
-bool BranchProbabilityInfo::isEdgeHot(BasicBlock *Src, BasicBlock *Dst) const {
+bool BranchProbabilityInfo::
+isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const {
// Hot probability is at least 4/5 = 80%
uint32_t Weight = getEdgeWeight(Src, Dst);
uint32_t Sum = getSumForBlock(Src);
@@ -321,8 +469,8 @@ BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const {
}
// Return the edge's weight. If we can't find it, return the DEFAULT_WEIGHT value.
-uint32_t
-BranchProbabilityInfo::getEdgeWeight(BasicBlock *Src, BasicBlock *Dst) const {
+uint32_t BranchProbabilityInfo::
+getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const {
Edge E(Src, Dst);
DenseMap<Edge, uint32_t>::const_iterator I = Weights.find(E);
@@ -332,8 +480,8 @@ BranchProbabilityInfo::getEdgeWeight(BasicBlock *Src, BasicBlock *Dst) const {
return DEFAULT_WEIGHT;
}
-void BranchProbabilityInfo::setEdgeWeight(BasicBlock *Src, BasicBlock *Dst,
- uint32_t Weight) {
+void BranchProbabilityInfo::
+setEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst, uint32_t Weight) {
Weights[std::make_pair(Src, Dst)] = Weight;
DEBUG(dbgs() << "set edge " << Src->getNameStr() << " -> "
<< Dst->getNameStr() << " weight to " << Weight
@@ -342,7 +490,7 @@ void BranchProbabilityInfo::setEdgeWeight(BasicBlock *Src, BasicBlock *Dst,
BranchProbability BranchProbabilityInfo::
-getEdgeProbability(BasicBlock *Src, BasicBlock *Dst) const {
+getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const {
uint32_t N = getEdgeWeight(Src, Dst);
uint32_t D = getSumForBlock(Src);
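With the interface const-qualified, read-only clients can now take const BasicBlock pointers end to end. A hedged sketch of such a client (names assumed; BranchProbability::print is the 3.0 pretty-printer):

    #include "llvm/Analysis/BranchProbabilityInfo.h"
    #include "llvm/Support/BranchProbability.h"
    #include "llvm/Support/CFG.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Illustrative helper: print hot/cold and the probability of each edge.
    static void dumpSuccessorProbabilities(const BasicBlock *BB,
                                           const BranchProbabilityInfo &BPI) {
      for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB);
           I != E; ++I) {
        errs() << (BPI.isEdgeHot(BB, *I) ? "hot  " : "cold ");
        BPI.getEdgeProbability(BB, *I).print(errs());
        errs() << "\n";
      }
    }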
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index ab846a2..e79459d 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -6,7 +6,7 @@ add_llvm_library(LLVMAnalysis
AliasSetTracker.cpp
Analysis.cpp
BasicAliasAnalysis.cpp
- BlockFrequency.cpp
+ BlockFrequencyInfo.cpp
BranchProbabilityInfo.cpp
CFGPrinter.cpp
CaptureTracking.cpp
@@ -58,4 +58,10 @@ add_llvm_library(LLVMAnalysis
ValueTracking.cpp
)
+add_llvm_library_dependencies(LLVMAnalysis
+ LLVMCore
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(IPA)
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 7fca17e..df79849 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -43,11 +43,16 @@ using namespace llvm;
/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with
/// TargetData. This always returns a non-null constant, but it may be a
/// ConstantExpr if unfoldable.
-static Constant *FoldBitCast(Constant *C, const Type *DestTy,
+static Constant *FoldBitCast(Constant *C, Type *DestTy,
const TargetData &TD) {
-
- // This only handles casts to vectors currently.
- const VectorType *DestVTy = dyn_cast<VectorType>(DestTy);
+ // Catch the obvious splat cases.
+ if (C->isNullValue() && !DestTy->isX86_MMXTy())
+ return Constant::getNullValue(DestTy);
+ if (C->isAllOnesValue() && !DestTy->isX86_MMXTy())
+ return Constant::getAllOnesValue(DestTy);
+
+ // The code below only handles casts to vectors currently.
+ VectorType *DestVTy = dyn_cast<VectorType>(DestTy);
if (DestVTy == 0)
return ConstantExpr::getBitCast(C, DestTy);
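A hedged illustration of the new early-outs, routed through the public ConstantFoldInstOperands entry point since FoldBitCast itself is file-local (the surrounding setup is assumed):

    #include "llvm/Analysis/ConstantFolding.h"
    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    #include "llvm/Instruction.h"

    using namespace llvm;

    // Illustrative helper: i32 0 bitcast to <4 x i8> now folds straight to the
    // <4 x i8> zero value via the isNullValue() shortcut, without running the
    // element-by-element vector path.
    Constant *foldZeroBitCast(LLVMContext &Ctx, const TargetData *TD) {
      Type *V4I8 = VectorType::get(Type::getInt8Ty(Ctx), 4);
      Constant *Ops[] = { Constant::getNullValue(Type::getInt32Ty(Ctx)) };
      return ConstantFoldInstOperands(Instruction::BitCast, V4I8, Ops, TD);
    }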
@@ -69,8 +74,8 @@ static Constant *FoldBitCast(Constant *C, const Type *DestTy,
if (NumDstElt == NumSrcElt)
return ConstantExpr::getBitCast(C, DestTy);
- const Type *SrcEltTy = CV->getType()->getElementType();
- const Type *DstEltTy = DestVTy->getElementType();
+ Type *SrcEltTy = CV->getType()->getElementType();
+ Type *DstEltTy = DestVTy->getElementType();
// Otherwise, we're changing the number of elements in a vector, which
// requires endianness information to do the right thing. For example,
@@ -85,7 +90,7 @@ static Constant *FoldBitCast(Constant *C, const Type *DestTy,
if (DstEltTy->isFloatingPointTy()) {
// Fold to a vector of integers with the same size as our FP type.
unsigned FPWidth = DstEltTy->getPrimitiveSizeInBits();
- const Type *DestIVTy =
+ Type *DestIVTy =
VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumDstElt);
// Recursively handle this integer conversion, if possible.
C = FoldBitCast(C, DestIVTy, TD);
@@ -99,7 +104,7 @@ static Constant *FoldBitCast(Constant *C, const Type *DestTy,
// it to integer first.
if (SrcEltTy->isFloatingPointTy()) {
unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
- const Type *SrcIVTy =
+ Type *SrcIVTy =
VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElt);
// Ask VMCore to do the conversion now that #elts line up.
C = ConstantExpr::getBitCast(C, SrcIVTy);
@@ -212,11 +217,11 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
if (!CI) return false; // Index isn't a simple constant?
if (CI->isZero()) continue; // Not adding anything.
- if (const StructType *ST = dyn_cast<StructType>(*GTI)) {
+ if (StructType *ST = dyn_cast<StructType>(*GTI)) {
// N = N + Offset
Offset += TD.getStructLayout(ST)->getElementOffset(CI->getZExtValue());
} else {
- const SequentialType *SQT = cast<SequentialType>(*GTI);
+ SequentialType *SQT = cast<SequentialType>(*GTI);
Offset += TD.getTypeAllocSize(SQT->getElementType())*CI->getSExtValue();
}
}
@@ -354,8 +359,8 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
const TargetData &TD) {
- const Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
- const IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
+ Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
+ IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
// If this isn't an integer load we can't fold it directly.
if (!IntType) {
@@ -363,7 +368,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
// and then bitcast the result. This can be useful for union cases. Note
// that address spaces don't matter here since this doesn't result in an
// actual new load.
- const Type *MapTy;
+ Type *MapTy;
if (LoadTy->isFloatTy())
MapTy = Type::getInt32PtrTy(C->getContext());
else if (LoadTy->isDoubleTy())
@@ -443,7 +448,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
std::string Str;
if (TD && GetConstantStringInfo(CE, Str) && !Str.empty()) {
unsigned StrLen = Str.length();
- const Type *Ty = cast<PointerType>(CE->getType())->getElementType();
+ Type *Ty = cast<PointerType>(CE->getType())->getElementType();
unsigned NumBits = Ty->getPrimitiveSizeInBits();
// Replace load with immediate integer if the result is an integer or fp
// value.
@@ -478,7 +483,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
if (GlobalVariable *GV =
dyn_cast<GlobalVariable>(GetUnderlyingObject(CE, TD))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
- const Type *ResTy = cast<PointerType>(C->getType())->getElementType();
+ Type *ResTy = cast<PointerType>(C->getType())->getElementType();
if (GV->getInitializer()->isNullValue())
return Constant::getNullValue(ResTy);
if (isa<UndefValue>(GV->getInitializer()))
@@ -536,19 +541,18 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
/// CastGEPIndices - If array indices are not pointer-sized integers,
/// explicitly cast them so that they aren't implicitly cast by the
/// getelementptr.
-static Constant *CastGEPIndices(Constant *const *Ops, unsigned NumOps,
- const Type *ResultTy,
+static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
+ Type *ResultTy,
const TargetData *TD) {
if (!TD) return 0;
- const Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext());
bool Any = false;
SmallVector<Constant*, 32> NewIdxs;
- for (unsigned i = 1; i != NumOps; ++i) {
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
if ((i == 1 ||
!isa<StructType>(GetElementPtrInst::getIndexedType(Ops[0]->getType(),
- reinterpret_cast<Value *const *>(Ops+1),
- i-1))) &&
+ Ops.slice(1, i-1)))) &&
Ops[i]->getType() != IntPtrTy) {
Any = true;
NewIdxs.push_back(ConstantExpr::getCast(CastInst::getCastOpcode(Ops[i],
@@ -562,7 +566,7 @@ static Constant *CastGEPIndices(Constant *const *Ops, unsigned NumOps,
if (!Any) return 0;
Constant *C =
- ConstantExpr::getGetElementPtr(Ops[0], &NewIdxs[0], NewIdxs.size());
+ ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD))
C = Folded;
@@ -571,23 +575,23 @@ static Constant *CastGEPIndices(Constant *const *Ops, unsigned NumOps,
/// SymbolicallyEvaluateGEP - If we can symbolically evaluate the specified GEP
/// constant expression, do so.
-static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
- const Type *ResultTy,
+static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
+ Type *ResultTy,
const TargetData *TD) {
Constant *Ptr = Ops[0];
if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized())
return 0;
- const Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext());
// If this is a constant expr gep that is effectively computing an
// "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
- for (unsigned i = 1; i != NumOps; ++i)
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i)
if (!isa<ConstantInt>(Ops[i])) {
// If this is "gep i8* Ptr, (sub 0, V)", fold this as:
// "inttoptr (sub (ptrtoint Ptr), V)"
- if (NumOps == 2 &&
+ if (Ops.size() == 2 &&
cast<PointerType>(ResultTy)->getElementType()->isIntegerTy(8)) {
ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[1]);
assert((CE == 0 || CE->getType() == IntPtrTy) &&
@@ -606,9 +610,10 @@ static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
}
unsigned BitWidth = TD->getTypeSizeInBits(IntPtrTy);
- APInt Offset = APInt(BitWidth,
- TD->getIndexedOffset(Ptr->getType(),
- (Value**)Ops+1, NumOps-1));
+ APInt Offset =
+ APInt(BitWidth, TD->getIndexedOffset(Ptr->getType(),
+ makeArrayRef((Value **)Ops.data() + 1,
+ Ops.size() - 1)));
Ptr = cast<Constant>(Ptr->stripPointerCasts());
// If this is a GEP of a GEP, fold it all into a single GEP.
@@ -627,9 +632,7 @@ static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
Ptr = cast<Constant>(GEP->getOperand(0));
Offset += APInt(BitWidth,
- TD->getIndexedOffset(Ptr->getType(),
- (Value**)NestedOps.data(),
- NestedOps.size()));
+ TD->getIndexedOffset(Ptr->getType(), NestedOps));
Ptr = cast<Constant>(Ptr->stripPointerCasts());
}
@@ -649,10 +652,10 @@ static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
// we eliminate over-indexing of the notional static type array bounds.
// This makes it easy to determine if the getelementptr is "inbounds".
// Also, this helps GlobalOpt do SROA on GlobalVariables.
- const Type *Ty = Ptr->getType();
+ Type *Ty = Ptr->getType();
SmallVector<Constant*, 32> NewIdxs;
do {
- if (const SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
+ if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
if (ATy->isPointerTy()) {
// The only pointer indexing we'll do is on the first index of the GEP.
if (!NewIdxs.empty())
@@ -665,7 +668,7 @@ static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
// Determine which element of the array the offset points into.
APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType()));
- const IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+ IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext());
if (ElemSize == 0)
// The element size is 0. This may be [0 x Ty]*, so just use a zero
// index for this level and proceed to the next level to see if it can
@@ -679,7 +682,7 @@ static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
NewIdxs.push_back(ConstantInt::get(IntPtrTy, NewIdx));
}
Ty = ATy->getElementType();
- } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
// Determine which field of the struct the offset points into. The
// getZExtValue is at least as safe as the StructLayout API because we
// know the offset is within the struct at this point.
@@ -703,7 +706,7 @@ static Constant *SymbolicallyEvaluateGEP(Constant *const *Ops, unsigned NumOps,
// Create a GEP.
Constant *C =
- ConstantExpr::getGetElementPtr(Ptr, &NewIdxs[0], NewIdxs.size());
+ ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
assert(cast<PointerType>(C->getType())->getElementType() == Ty &&
"Computed GetElementPtr has unexpected type!");
@@ -778,8 +781,7 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, const TargetData *TD) {
cast<Constant>(EVI->getAggregateOperand()),
EVI->getIndices());
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
- Ops.data(), Ops.size(), TD);
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD);
}
/// ConstantFoldConstantExpression - Attempt to fold the constant expression
@@ -800,8 +802,7 @@ Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
if (CE->isCompare())
return ConstantFoldCompareInstOperands(CE->getPredicate(), Ops[0], Ops[1],
TD);
- return ConstantFoldInstOperands(CE->getOpcode(), CE->getType(),
- Ops.data(), Ops.size(), TD);
+ return ConstantFoldInstOperands(CE->getOpcode(), CE->getType(), Ops, TD);
}
/// ConstantFoldInstOperands - Attempt to constant fold an instruction with the
@@ -814,8 +815,8 @@ Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
/// information, due to only being passed an opcode and operands. Constant
/// folding using this function strips this information.
///
-Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, const Type *DestTy,
- Constant* const* Ops, unsigned NumOps,
+Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
+ ArrayRef<Constant *> Ops,
const TargetData *TD) {
// Handle easy binops first.
if (Instruction::isBinaryOp(Opcode)) {
@@ -831,9 +832,9 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, const Type *DestTy,
case Instruction::ICmp:
case Instruction::FCmp: assert(0 && "Invalid for compares");
case Instruction::Call:
- if (Function *F = dyn_cast<Function>(Ops[NumOps - 1]))
+ if (Function *F = dyn_cast<Function>(Ops.back()))
if (canConstantFoldCallTo(F))
- return ConstantFoldCall(F, Ops, NumOps - 1);
+ return ConstantFoldCall(F, Ops.slice(0, Ops.size() - 1));
return 0;
case Instruction::PtrToInt:
// If the input is a inttoptr, eliminate the pair. This requires knowing
@@ -887,12 +888,12 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, const Type *DestTy,
case Instruction::ShuffleVector:
return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
case Instruction::GetElementPtr:
- if (Constant *C = CastGEPIndices(Ops, NumOps, DestTy, TD))
+ if (Constant *C = CastGEPIndices(Ops, DestTy, TD))
return C;
- if (Constant *C = SymbolicallyEvaluateGEP(Ops, NumOps, DestTy, TD))
+ if (Constant *C = SymbolicallyEvaluateGEP(Ops, DestTy, TD))
return C;
- return ConstantExpr::getGetElementPtr(Ops[0], Ops+1, NumOps-1);
+ return ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1));
}
}
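Several entry points in this file move from (pointer, length) pairs to ArrayRef<Constant *>, which binds implicitly to C arrays, SmallVectors, and sub-ranges. A minimal illustration of the call-site ergonomics (helper names assumed):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Constant.h"

    using namespace llvm;

    static unsigned countOps(ArrayRef<Constant *> Ops) { return Ops.size(); }

    static void demo(Constant *A, Constant *B) {
      Constant *Arr[] = { A, B };
      SmallVector<Constant *, 4> Vec(Arr, Arr + 2);
      countOps(Arr);                                // binds to a C array
      countOps(Vec);                                // binds to a SmallVector
      countOps(ArrayRef<Constant *>(Arr).slice(1)); // a sub-range, as the
    }                                               // patch's .slice() calls do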
@@ -912,7 +913,7 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
// around to know if bit truncation is happening.
if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops0)) {
if (TD && Ops1->isNullValue()) {
- const Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
if (CE0->getOpcode() == Instruction::IntToPtr) {
// Convert the integer value to the right size to ensure we get the
// proper extension or truncation.
@@ -934,7 +935,7 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
if (TD && CE0->getOpcode() == CE1->getOpcode()) {
- const Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
if (CE0->getOpcode() == Instruction::IntToPtr) {
// Convert the integer value to the right size to ensure we get the
@@ -967,7 +968,7 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
unsigned OpC =
Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
Constant *Ops[] = { LHS, RHS };
- return ConstantFoldInstOperands(OpC, LHS->getType(), Ops, 2, TD);
+ return ConstantFoldInstOperands(OpC, LHS->getType(), Ops, TD);
}
}
@@ -987,7 +988,7 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
// addressing...
gep_type_iterator I = gep_type_begin(CE), E = gep_type_end(CE);
for (++I; I != E; ++I)
- if (const StructType *STy = dyn_cast<StructType>(*I)) {
+ if (StructType *STy = dyn_cast<StructType>(*I)) {
ConstantInt *CU = cast<ConstantInt>(I.getOperand());
assert(CU->getZExtValue() < STy->getNumElements() &&
"Struct index out of range!");
@@ -1002,7 +1003,7 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
return 0;
}
} else if (ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand())) {
- if (const ArrayType *ATy = dyn_cast<ArrayType>(*I)) {
+ if (ArrayType *ATy = dyn_cast<ArrayType>(*I)) {
if (CI->getZExtValue() >= ATy->getNumElements())
return 0;
if (ConstantArray *CA = dyn_cast<ConstantArray>(C))
@@ -1013,7 +1014,7 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
C = UndefValue::get(ATy->getElementType());
else
return 0;
- } else if (const VectorType *VTy = dyn_cast<VectorType>(*I)) {
+ } else if (VectorType *VTy = dyn_cast<VectorType>(*I)) {
if (CI->getZExtValue() >= VTy->getNumElements())
return 0;
if (ConstantVector *CP = dyn_cast<ConstantVector>(C))
@@ -1101,7 +1102,7 @@ llvm::canConstantFoldCallTo(const Function *F) {
}
static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
- const Type *Ty) {
+ Type *Ty) {
sys::llvm_fenv_clearexcept();
V = NativeFP(V);
if (sys::llvm_fenv_testexcept()) {
@@ -1118,7 +1119,7 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
}
static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
- double V, double W, const Type *Ty) {
+ double V, double W, Type *Ty) {
sys::llvm_fenv_clearexcept();
V = NativeFP(V, W);
if (sys::llvm_fenv_testexcept()) {
@@ -1143,7 +1144,7 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
/// performed, otherwise returns the Constant value resulting from the
/// conversion.
static Constant *ConstantFoldConvertToInt(ConstantFP *Op, bool roundTowardZero,
- const Type *Ty) {
+ Type *Ty) {
assert(Op && "Called with NULL operand");
APFloat Val(Op->getValueAPF());
@@ -1167,13 +1168,12 @@ static Constant *ConstantFoldConvertToInt(ConstantFP *Op, bool roundTowardZero,
/// ConstantFoldCall - Attempt to constant fold a call to the specified function
/// with the specified arguments, returning null if unsuccessful.
Constant *
-llvm::ConstantFoldCall(Function *F,
- Constant *const *Operands, unsigned NumOperands) {
+llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands) {
if (!F->hasName()) return 0;
StringRef Name = F->getName();
- const Type *Ty = F->getReturnType();
- if (NumOperands == 1) {
+ Type *Ty = F->getReturnType();
+ if (Operands.size() == 1) {
if (ConstantFP *Op = dyn_cast<ConstantFP>(Operands[0])) {
if (F->getIntrinsicID() == Intrinsic::convert_to_fp16) {
APFloat Val(Op->getValueAPF());
@@ -1327,7 +1327,7 @@ llvm::ConstantFoldCall(Function *F,
return 0;
}
- if (NumOperands == 2) {
+ if (Operands.size() == 2) {
if (ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
if (!Ty->isFloatTy() && !Ty->isDoubleTy())
return 0;
diff --git a/lib/Analysis/DIBuilder.cpp b/lib/Analysis/DIBuilder.cpp
index ac5eeeb..bfa429d 100644
--- a/lib/Analysis/DIBuilder.cpp
+++ b/lib/Analysis/DIBuilder.cpp
@@ -29,14 +29,74 @@ static Constant *GetTagConstant(LLVMContext &VMContext, unsigned Tag) {
}
DIBuilder::DIBuilder(Module &m)
- : M(m), VMContext(M.getContext()), TheCU(0), DeclareFn(0), ValueFn(0) {}
+ : M(m), VMContext(M.getContext()), TheCU(0), TempEnumTypes(0),
+ TempRetainTypes(0), TempSubprograms(0), TempGVs(0), DeclareFn(0),
+ ValueFn(0)
+{}
+
+/// finalize - Construct any deferred debug info descriptors.
+void DIBuilder::finalize() {
+ DIArray Enums = getOrCreateArray(AllEnumTypes);
+ DIType(TempEnumTypes).replaceAllUsesWith(Enums);
+
+ DIArray RetainTypes = getOrCreateArray(AllRetainTypes);
+ DIType(TempRetainTypes).replaceAllUsesWith(RetainTypes);
+
+ DIArray SPs = getOrCreateArray(AllSubprograms);
+ DIType(TempSubprograms).replaceAllUsesWith(SPs);
+ for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+ DISubprogram SP(SPs.getElement(i));
+ if (NamedMDNode *NMD = getFnSpecificMDNode(M, SP)) {
+ SmallVector<Value *, 4> Variables;
+ for (unsigned ii = 0, ee = NMD->getNumOperands(); ii != ee; ++ii)
+ Variables.push_back(NMD->getOperand(ii));
+ if (MDNode *Temp = SP.getVariablesNodes()) {
+ DIArray AV = getOrCreateArray(Variables);
+ DIType(Temp).replaceAllUsesWith(AV);
+ }
+ NMD->eraseFromParent();
+ }
+ }
+
+ DIArray GVs = getOrCreateArray(AllGVs);
+ DIType(TempGVs).replaceAllUsesWith(GVs);
+}
+
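finalize() depends on the temporary-MDNode idiom: a placeholder node is created up front, referenced from the compile unit's holder nodes, and replaced once the real list is known. A reduced, hedged sketch of that idiom in isolation (function name assumed):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/Metadata.h"

    using namespace llvm;

    // Illustrative helper: publish a metadata list whose contents are only
    // known later. Temporary nodes are not uniqued, so they may be RAUW'd.
    MDNode *buildListLater(LLVMContext &Ctx, ArrayRef<Value *> FinalElts) {
      MDNode *Temp = MDNode::getTemporary(Ctx, ArrayRef<Value *>());
      Value *HolderElts[] = { Temp };
      MDNode *Holder = MDNode::get(Ctx, HolderElts); // e.g. hangs off the CU
      (void)Holder;
      MDNode *Real = MDNode::get(Ctx, FinalElts);
      Temp->replaceAllUsesWith(Real);   // patch every reference to the stub
      MDNode::deleteTemporary(Temp);    // temporaries need manual deletion
      return Real;
    }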
+/// getNonCompileUnitScope - If N is a compile unit, return NULL; otherwise
+/// return N.
+static MDNode *getNonCompileUnitScope(MDNode *N) {
+ if (DIDescriptor(N).isCompileUnit())
+ return NULL;
+ return N;
+}
/// createCompileUnit - A CompileUnit provides an anchor for all debugging
/// information generated during this instance of compilation.
-void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
- StringRef Directory, StringRef Producer,
- bool isOptimized, StringRef Flags,
+void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
+ StringRef Directory, StringRef Producer,
+ bool isOptimized, StringRef Flags,
unsigned RunTimeVer) {
+ assert(Lang <= dwarf::DW_LANG_D && Lang >= dwarf::DW_LANG_C89 &&
+        "Invalid Language tag");
+ assert(!Filename.empty() &&
+        "Unable to create compile unit without filename");
+ Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
+ TempEnumTypes = MDNode::getTemporary(VMContext, TElts);
+ Value *THElts[] = { TempEnumTypes };
+ MDNode *EnumHolder = MDNode::get(VMContext, THElts);
+
+ TempRetainTypes = MDNode::getTemporary(VMContext, TElts);
+ Value *TRElts[] = { TempRetainTypes };
+ MDNode *RetainHolder = MDNode::get(VMContext, TRElts);
+
+ TempSubprograms = MDNode::getTemporary(VMContext, TElts);
+ Value *TSElts[] = { TempSubprograms };
+ MDNode *SPHolder = MDNode::get(VMContext, TSElts);
+
+ TempGVs = MDNode::getTemporary(VMContext, TElts);
+ Value *TVElts[] = { TempGVs };
+ MDNode *GVHolder = MDNode::get(VMContext, TVElts);
+
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_compile_unit),
llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -48,7 +108,11 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
ConstantInt::get(Type::getInt1Ty(VMContext), true), // isMain
ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
MDString::get(VMContext, Flags),
- ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer)
+ ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeVer),
+ EnumHolder,
+ RetainHolder,
+ SPHolder,
+ GVHolder
};
TheCU = DICompileUnit(MDNode::get(VMContext, Elts));
@@ -61,17 +125,19 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
/// for a file.
DIFile DIBuilder::createFile(StringRef Filename, StringRef Directory) {
assert(TheCU && "Unable to create DW_TAG_file_type without CompileUnit");
+ assert(!Filename.empty() && "Unable to create file without name");
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_file_type),
MDString::get(VMContext, Filename),
MDString::get(VMContext, Directory),
- TheCU
+ NULL // TheCU
};
return DIFile(MDNode::get(VMContext, Elts));
}
/// createEnumerator - Create a single enumerator value.
DIEnumerator DIBuilder::createEnumerator(StringRef Name, uint64_t Val) {
+ assert(!Name.empty() && "Unable to create enumerator without name");
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_enumerator),
MDString::get(VMContext, Name),
@@ -80,16 +146,37 @@ DIEnumerator DIBuilder::createEnumerator(StringRef Name, uint64_t Val) {
return DIEnumerator(MDNode::get(VMContext, Elts));
}
-/// createBasicType - Create debugging information entry for a basic
+/// createNullPtrType - Create C++0x nullptr type.
+DIType DIBuilder::createNullPtrType(StringRef Name) {
+ assert(!Name.empty() && "Unable to create type without name");
+ // nullptr is encoded in DIBasicType format. Line number, filename,
+ // size, alignment, offset and flags are always empty here.
+ Value *Elts[] = {
+ GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_type),
+ NULL, //TheCU,
+ MDString::get(VMContext, Name),
+ NULL, // Filename
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+ ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Encoding
+ };
+ return DIType(MDNode::get(VMContext, Elts));
+}
+
+/// createBasicType - Create debugging information entry for a basic
/// type, e.g. 'char'.
-DIType DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
+DIType DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
uint64_t AlignInBits,
unsigned Encoding) {
+ assert(!Name.empty() && "Unable to create type without name");
// Basic types are encoded in DIBasicType format. Line number, filename,
// offset and flags are always empty here.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_base_type),
- TheCU,
+ NULL, //TheCU,
MDString::get(VMContext, Name),
NULL, // Filename
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
@@ -108,7 +195,7 @@ DIType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
// Qualified types are encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, Tag),
- TheCU,
+ NULL, //TheCU,
MDString::get(VMContext, StringRef()), // Empty name.
NULL, // Filename
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
@@ -127,7 +214,7 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
// Pointer types are encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_pointer_type),
- TheCU,
+ NULL, //TheCU,
MDString::get(VMContext, Name),
NULL, // Filename
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
@@ -142,10 +229,11 @@ DIType DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
/// createReferenceType - Create debugging information entry for a reference.
DIType DIBuilder::createReferenceType(DIType RTy) {
+ assert(RTy.Verify() && "Unable to create reference type");
// References are encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_reference_type),
- TheCU,
+ NULL, // TheCU,
NULL, // Name
NULL, // Filename
ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
@@ -165,7 +253,7 @@ DIType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
assert(Ty.Verify() && "Invalid typedef type!");
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
@@ -199,9 +287,10 @@ DIType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
}
/// createInheritance - Create debugging information entry to establish
-/// inheritnace relationship between two types.
-DIType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
+/// inheritance relationship between two types.
+DIType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
uint64_t BaseOffset, unsigned Flags) {
+ assert(Ty.Verify() && "Unable to create inheritance");
// TAG_inheritance is encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_inheritance),
@@ -219,15 +308,15 @@ DIType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
}
/// createMemberType - Create debugging information entry for a member.
-DIType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
- DIFile File, unsigned LineNumber,
+DIType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
+ DIFile File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
- uint64_t OffsetInBits, unsigned Flags,
+ uint64_t OffsetInBits, unsigned Flags,
DIType Ty) {
// TAG_member is encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_member),
- Scope,
+ getNonCompileUnitScope(Scope),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
@@ -242,17 +331,17 @@ DIType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
/// createObjCIVar - Create debugging information entry for Objective-C
/// instance variable.
-DIType DIBuilder::createObjCIVar(StringRef Name,
- DIFile File, unsigned LineNumber,
+DIType DIBuilder::createObjCIVar(StringRef Name,
+ DIFile File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
- uint64_t OffsetInBits, unsigned Flags,
+ uint64_t OffsetInBits, unsigned Flags,
DIType Ty, StringRef PropertyName,
StringRef GetterName, StringRef SetterName,
unsigned PropertyAttributes) {
// TAG_member is encoded in DIDerivedType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_member),
- File, // Or TheCU ? Ty ?
+ getNonCompileUnitScope(File),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
@@ -270,8 +359,8 @@ DIType DIBuilder::createObjCIVar(StringRef Name,
}
/// createClassType - Create debugging information entry for a class.
-DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
- DIFile File, unsigned LineNumber,
+DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
+ DIFile File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
uint64_t OffsetInBits, unsigned Flags,
DIType DerivedFrom, DIArray Elements,
@@ -279,7 +368,7 @@ DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
// TAG_class_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
@@ -298,13 +387,13 @@ DIType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
/// createTemplateTypeParameter - Create debugging information for template
/// type parameter.
-DITemplateTypeParameter
+DITemplateTypeParameter
DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
DIType Ty, MDNode *File, unsigned LineNo,
unsigned ColumnNo) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_template_type_parameter),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
Ty,
File,
@@ -316,14 +405,14 @@ DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
/// createTemplateValueParameter - Create debugging information for template
/// value parameter.
-DITemplateValueParameter
+DITemplateValueParameter
DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
DIType Ty, uint64_t Val,
MDNode *File, unsigned LineNo,
unsigned ColumnNo) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_template_value_parameter),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
Ty,
ConstantInt::get(Type::getInt64Ty(VMContext), Val),
@@ -335,15 +424,15 @@ DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
}
/// createStructType - Create debugging information entry for a struct.
-DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name,
- DIFile File, unsigned LineNumber,
+DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name,
+ DIFile File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
- unsigned Flags, DIArray Elements,
+ unsigned Flags, DIArray Elements,
unsigned RunTimeLang) {
// TAG_structure_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_structure_type),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
@@ -360,7 +449,7 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name,
}
/// createUnionType - Create debugging information entry for a union.
-DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
+DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
DIFile File,
unsigned LineNumber, uint64_t SizeInBits,
uint64_t AlignInBits, unsigned Flags,
@@ -368,7 +457,7 @@ DIType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
// TAG_union_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_union_type),
- Scope,
+ getNonCompileUnitScope(Scope),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
@@ -389,9 +478,9 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
// TAG_subroutine_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type),
- File,
+ llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
MDString::get(VMContext, ""),
- File,
+ llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
ConstantInt::get(Type::getInt64Ty(VMContext), 0),
ConstantInt::get(Type::getInt64Ty(VMContext), 0),
@@ -405,16 +494,17 @@ DIType DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
return DIType(MDNode::get(VMContext, Elts));
}
-/// createEnumerationType - Create debugging information entry for an
+/// createEnumerationType - Create debugging information entry for an
/// enumeration.
-DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
- DIFile File, unsigned LineNumber,
- uint64_t SizeInBits,
- uint64_t AlignInBits, DIArray Elements) {
+DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
+ DIFile File, unsigned LineNumber,
+ uint64_t SizeInBits,
+ uint64_t AlignInBits,
+ DIArray Elements) {
// TAG_enumeration_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
- Scope,
+ getNonCompileUnitScope(Scope),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
@@ -428,20 +518,19 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
};
MDNode *Node = MDNode::get(VMContext, Elts);
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.enum");
- NMD->addOperand(Node);
+ AllEnumTypes.push_back(Node);
return DIType(Node);
}
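A minimal usage sketch of the path above; illustrative only, with hypothetical names and values, and DIB assumed to be a DIBuilder whose createCompileUnit has already run:

  DIFile File = DIB.createFile("color.c", "/src");
  Value *Enums[] = {
    DIB.createEnumerator("Red", 0),    // DIEnumerator converts to Value*
    DIB.createEnumerator("Green", 1)
  };
  DIType ColorTy = DIB.createEnumerationType(
      File, "Color", File, /*LineNumber=*/1,
      /*SizeInBits=*/32, /*AlignInBits=*/32, DIB.getOrCreateArray(Enums));
  // With this patch the node lands in AllEnumTypes, presumably to be
  // attached to the compile unit by finalize() instead of llvm.dbg.enum.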
/// createArrayType - Create debugging information entry for an array.
-DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
+DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
DIType Ty, DIArray Subscripts) {
// TAG_array_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
- TheCU,
+ NULL, //TheCU,
MDString::get(VMContext, ""),
- TheCU,
+ NULL, //TheCU,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
ConstantInt::get(Type::getInt64Ty(VMContext), Size),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
@@ -456,14 +545,14 @@ DIType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
}
/// createVectorType - Create debugging information entry for a vector.
-DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
+DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
DIType Ty, DIArray Subscripts) {
// TAG_vector_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_vector_type),
- TheCU,
+ NULL, //TheCU,
MDString::get(VMContext, ""),
- TheCU,
+ NULL, //TheCU,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
ConstantInt::get(Type::getInt64Ty(VMContext), Size),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
@@ -501,18 +590,17 @@ DIType DIBuilder::createArtificialType(DIType Ty) {
return DIType(MDNode::get(VMContext, Elts));
}
-/// retainType - Retain DIType in a module even if it is not referenced
+/// retainType - Retain DIType in a module even if it is not referenced
/// through debug info anchors.
void DIBuilder::retainType(DIType T) {
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.ty");
- NMD->addOperand(T);
+ AllRetainTypes.push_back(T);
}
/// createUnspecifiedParameter - Create an unspecified parameter descriptor
/// for the subroutine type.
DIDescriptor DIBuilder::createUnspecifiedParameter() {
- Value *Elts[] = {
- GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_parameters)
+ Value *Elts[] = {
+ GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_parameters)
};
return DIDescriptor(MDNode::get(VMContext, Elts));
}
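As a hedged sketch, the node pairs with createSubroutineType to model varargs; IntTy, CharPtrTy, File and DIB are assumed to exist from earlier calls:

  Value *Params[] = {
    IntTy,                              // first slot is the return type
    CharPtrTy,                          // const char* argument
    DIB.createUnspecifiedParameter()    // the trailing "..."
  };
  DIType PrintfTy =
      DIB.createSubroutineType(File, DIB.getOrCreateArray(Params));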
@@ -532,7 +620,7 @@ DIType DIBuilder::createTemporaryType(DIFile F) {
// use here as long as DIType accepts it.
Value *Elts[] = {
GetTagConstant(VMContext, DW_TAG_base_type),
- F.getCompileUnit(),
+ TheCU,
NULL,
F
};
@@ -563,12 +651,12 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Hi) {
/// createGlobalVariable - Create a new descriptor for the specified global.
DIGlobalVariable DIBuilder::
-createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
+createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
DIType Ty, bool isLocalToUnit, llvm::Value *Val) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_variable),
llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
- TheCU,
+ NULL, // TheCU,
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
@@ -580,22 +668,20 @@ createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
Val
};
MDNode *Node = MDNode::get(VMContext, Elts);
- // Create a named metadata so that we do not lose this mdnode.
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
- NMD->addOperand(Node);
+ AllGVs.push_back(Node);
return DIGlobalVariable(Node);
}
/// createStaticVariable - Create a new descriptor for the specified static
/// variable.
DIGlobalVariable DIBuilder::
-createStaticVariable(DIDescriptor Context, StringRef Name,
- StringRef LinkageName, DIFile F, unsigned LineNumber,
+createStaticVariable(DIDescriptor Context, StringRef Name,
+ StringRef LinkageName, DIFile F, unsigned LineNumber,
DIType Ty, bool isLocalToUnit, llvm::Value *Val) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_variable),
llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
MDString::get(VMContext, LinkageName),
@@ -607,26 +693,25 @@ createStaticVariable(DIDescriptor Context, StringRef Name,
Val
};
MDNode *Node = MDNode::get(VMContext, Elts);
- // Create a named metadata so that we do not lose this mdnode.
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
- NMD->addOperand(Node);
+ AllGVs.push_back(Node);
return DIGlobalVariable(Node);
}
/// createVariable - Create a new descriptor for the specified variable.
DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
StringRef Name, DIFile File,
- unsigned LineNo, DIType Ty,
+ unsigned LineNo, DIType Ty,
bool AlwaysPreserve, unsigned Flags,
unsigned ArgNo) {
Value *Elts[] = {
GetTagConstant(VMContext, Tag),
- Scope,
+ getNonCompileUnitScope(Scope),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))),
Ty,
- ConstantInt::get(Type::getInt32Ty(VMContext), Flags)
+ ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+ Constant::getNullValue(Type::getInt32Ty(VMContext)),
};
MDNode *Node = MDNode::get(VMContext, Elts);
if (AlwaysPreserve) {
@@ -634,13 +719,7 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
// to preserve variable info in such a situation, then stash it in a
// named mdnode.
DISubprogram Fn(getDISubprogram(Scope));
- StringRef FName = "fn";
- if (Fn.getFunction())
- FName = Fn.getFunction()->getName();
- char One = '\1';
- if (FName.startswith(StringRef(&One, 1)))
- FName = FName.substr(1);
- NamedMDNode *FnLocals = getOrInsertFnSpecificMDNode(M, FName);
+ NamedMDNode *FnLocals = getOrInsertFnSpecificMDNode(M, Fn);
FnLocals->addOperand(Node);
}
return DIVariable(Node);
@@ -655,12 +734,14 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
unsigned ArgNo) {
SmallVector<Value *, 15> Elts;
Elts.push_back(GetTagConstant(VMContext, Tag));
- Elts.push_back(Scope);
+ Elts.push_back(getNonCompileUnitScope(Scope));
Elts.push_back(MDString::get(VMContext, Name));
Elts.push_back(F);
- Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext), (LineNo | (ArgNo << 24))));
+ Elts.push_back(ConstantInt::get(Type::getInt32Ty(VMContext),
+ (LineNo | (ArgNo << 24))));
Elts.push_back(Ty);
Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)));
+ Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)));
Elts.append(Addr.begin(), Addr.end());
return DIVariable(MDNode::get(VMContext, Elts));
@@ -677,10 +758,15 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
Function *Fn,
MDNode *TParams,
MDNode *Decl) {
+ Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
+ MDNode *Temp = MDNode::getTemporary(VMContext, TElts);
+ Value *TVElts[] = { Temp };
+ MDNode *THolder = MDNode::get(VMContext, TVElts);
+
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
MDString::get(VMContext, LinkageName),
@@ -696,13 +782,13 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
Fn,
TParams,
- Decl
+ Decl,
+ THolder
};
MDNode *Node = MDNode::get(VMContext, Elts);
// Stash the subprogram so that we do not lose this mdnode.
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
- NMD->addOperand(Node);
+ AllSubprograms.push_back(Node);
return DISubprogram(Node);
}
@@ -720,10 +806,15 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
bool isOptimized,
Function *Fn,
MDNode *TParam) {
+ Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
+ MDNode *Temp = MDNode::getTemporary(VMContext, TElts);
+ Value *TVElts[] = { Temp };
+ MDNode *THolder = MDNode::get(VMContext, TVElts);
+
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
- Context,
+ getNonCompileUnitScope(Context),
MDString::get(VMContext, Name),
MDString::get(VMContext, Name),
MDString::get(VMContext, LinkageName),
@@ -739,12 +830,10 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
Fn,
TParam,
+ llvm::Constant::getNullValue(Type::getInt32Ty(VMContext)),
+ THolder
};
MDNode *Node = MDNode::get(VMContext, Elts);
-
- // Create a named metadata so that we do not lose this mdnode.
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
- NMD->addOperand(Node);
return DISubprogram(Node);
}
@@ -754,7 +843,7 @@ DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name,
DIFile File, unsigned LineNo) {
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_namespace),
- Scope,
+ getNonCompileUnitScope(Scope),
MDString::get(VMContext, Name),
File,
ConstantInt::get(Type::getInt32Ty(VMContext), LineNo)
@@ -762,13 +851,25 @@ DINameSpace DIBuilder::createNameSpace(DIDescriptor Scope, StringRef Name,
return DINameSpace(MDNode::get(VMContext, Elts));
}
+/// createLexicalBlockFile - This creates a new MDNode that encapsulates
+/// an existing scope with a new filename.
+DILexicalBlockFile DIBuilder::createLexicalBlockFile(DIDescriptor Scope,
+ DIFile File) {
+ Value *Elts[] = {
+ GetTagConstant(VMContext, dwarf::DW_TAG_lexical_block),
+ Scope,
+ File
+ };
+ return DILexicalBlockFile(MDNode::get(VMContext, Elts));
+}
+
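A sketch of the intended use, with hypothetical names; CurrentScope is assumed to be a DIDescriptor for the enclosing block or subprogram:

  // Code textually pulled in from another file keeps its own file name.
  DIFile IncludedFile = DIB.createFile("impl.inc", "/src");
  DILexicalBlockFile FileScope =
      DIB.createLexicalBlockFile(CurrentScope, IncludedFile);
  // The node reuses DW_TAG_lexical_block but carries exactly 3 operands,
  // the shape DIDescriptor::isLexicalBlockFile() tests for further down.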
DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
unsigned Line, unsigned Col) {
// Defeat MDNode uniquing for lexical blocks by using a unique id.
static unsigned int unique_id = 0;
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_lexical_block),
- Scope,
+ getNonCompileUnitScope(Scope),
ConstantInt::get(Type::getInt32Ty(VMContext), Line),
ConstantInt::get(Type::getInt32Ty(VMContext), Col),
File,
@@ -836,4 +937,3 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
VarInfo };
return CallInst::Create(ValueFn, Args, "", InsertAtEnd);
}
-
diff --git a/lib/Analysis/DbgInfoPrinter.cpp b/lib/Analysis/DbgInfoPrinter.cpp
index b23c351..cd832ab 100644
--- a/lib/Analysis/DbgInfoPrinter.cpp
+++ b/lib/Analysis/DbgInfoPrinter.cpp
@@ -171,7 +171,7 @@ static bool getLocationInfo(const Value *V, std::string &DisplayName,
void PrintDbgInfo::printVariableDeclaration(const Value *V) {
std::string DisplayName, File, Directory, Type;
- unsigned LineNo;
+ unsigned LineNo = 0;
if (!getLocationInfo(V, DisplayName, Type, LineNo, File, Directory))
return;
diff --git a/lib/Analysis/DebugInfo.cpp b/lib/Analysis/DebugInfo.cpp
index b42e946..44457d3 100644
--- a/lib/Analysis/DebugInfo.cpp
+++ b/lib/Analysis/DebugInfo.cpp
@@ -39,6 +39,9 @@ DIDescriptor::DIDescriptor(const DIFile F) : DbgNode(F.DbgNode) {
DIDescriptor::DIDescriptor(const DISubprogram F) : DbgNode(F.DbgNode) {
}
+DIDescriptor::DIDescriptor(const DILexicalBlockFile F) : DbgNode(F.DbgNode) {
+}
+
DIDescriptor::DIDescriptor(const DILexicalBlock F) : DbgNode(F.DbgNode) {
}
@@ -111,9 +114,17 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const {
unsigned DIVariable::getNumAddrElements() const {
if (getVersion() <= llvm::LLVMDebugVersion8)
return DbgNode->getNumOperands()-6;
- return DbgNode->getNumOperands()-7;
+ if (getVersion() == llvm::LLVMDebugVersion9)
+ return DbgNode->getNumOperands()-7;
+ return DbgNode->getNumOperands()-8;
}
+/// getInlinedAt - If this variable is inlined then return inline location.
+MDNode *DIVariable::getInlinedAt() const {
+ if (getVersion() <= llvm::LLVMDebugVersion9)
+ return NULL;
+ return dyn_cast_or_null<MDNode>(DbgNode->getOperand(7));
+}
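For orientation, the version-10 DIVariable operand layout implied by these two accessors appears to be (field names descriptive, not from the source):

  // !{ tag, scope, name, file, line|(arg<<24), type, flags,
  //    inlinedAt, addr-element 0, addr-element 1, ... }
  // so getNumAddrElements() subtracts 8 and getInlinedAt() reads operand 7.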
//===----------------------------------------------------------------------===//
// Predicates
@@ -122,7 +133,14 @@ unsigned DIVariable::getNumAddrElements() const {
/// isBasicType - Return true if the specified tag is legal for
/// DIBasicType.
bool DIDescriptor::isBasicType() const {
- return DbgNode && getTag() == dwarf::DW_TAG_base_type;
+ if (!DbgNode) return false;
+ switch (getTag()) {
+ case dwarf::DW_TAG_base_type:
+ case dwarf::DW_TAG_unspecified_type:
+ return true;
+ default:
+ return false;
+ }
}
/// isDerivedType - Return true if the specified tag is legal for DIDerivedType.
@@ -248,9 +266,17 @@ bool DIDescriptor::isNameSpace() const {
return DbgNode && getTag() == dwarf::DW_TAG_namespace;
}
+/// isLexicalBlockFile - Return true if the specified descriptor is a
+/// lexical block with an extra file.
+bool DIDescriptor::isLexicalBlockFile() const {
+ return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
+ (DbgNode->getNumOperands() == 3);
+}
+
/// isLexicalBlock - Return true if the specified tag is DW_TAG_lexical_block.
bool DIDescriptor::isLexicalBlock() const {
- return DbgNode && getTag() == dwarf::DW_TAG_lexical_block;
+ return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
+ (DbgNode->getNumOperands() > 3);
}
/// isSubrange - Return true if the specified tag is DW_TAG_subrange_type.
@@ -320,6 +346,22 @@ void DIType::replaceAllUsesWith(MDNode *D) {
}
}
+/// isUnsignedDIType - Return true if type encoding is unsigned.
+bool DIType::isUnsignedDIType() {
+ DIDerivedType DTy(DbgNode);
+ if (DTy.Verify())
+ return DTy.getTypeDerivedFrom().isUnsignedDIType();
+
+ DIBasicType BTy(DbgNode);
+ if (BTy.Verify()) {
+ unsigned Encoding = BTy.getEncoding();
+ if (Encoding == dwarf::DW_ATE_unsigned ||
+ Encoding == dwarf::DW_ATE_unsigned_char)
+ return true;
+ }
+ return false;
+}
+
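A small sketch of the predicate on basic types; DIB and the bit sizes are hypothetical, and derived types simply recurse until a basic type is reached:

  DIType UIntTy = DIB.createBasicType("unsigned int", 32, 32,
                                      dwarf::DW_ATE_unsigned);
  DIType IntTy  = DIB.createBasicType("int", 32, 32,
                                      dwarf::DW_ATE_signed);
  assert(UIntTy.isUnsignedDIType());   // DW_ATE_unsigned -> true
  assert(!IntTy.isUnsignedDIType());   // DW_ATE_signed   -> false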
/// Verify - Verify that a compile unit is well formed.
bool DICompileUnit::Verify() const {
if (!DbgNode)
@@ -335,7 +377,7 @@ bool DICompileUnit::Verify() const {
bool DIType::Verify() const {
if (!DbgNode)
return false;
- if (!getContext().Verify())
+ if (getContext() && !getContext().Verify())
return false;
unsigned Tag = getTag();
if (!isBasicType() && Tag != dwarf::DW_TAG_const_type &&
@@ -343,6 +385,7 @@ bool DIType::Verify() const {
Tag != dwarf::DW_TAG_reference_type && Tag != dwarf::DW_TAG_restrict_type
&& Tag != dwarf::DW_TAG_vector_type && Tag != dwarf::DW_TAG_array_type
&& Tag != dwarf::DW_TAG_enumeration_type
+ && Tag != dwarf::DW_TAG_subroutine_type
&& getFilename().empty())
return false;
return true;
@@ -362,12 +405,9 @@ bool DIDerivedType::Verify() const {
bool DICompositeType::Verify() const {
if (!DbgNode)
return false;
- if (!getContext().Verify())
+ if (getContext() && !getContext().Verify())
return false;
- DICompileUnit CU = getCompileUnit();
- if (!CU.Verify())
- return false;
return true;
}
@@ -376,11 +416,7 @@ bool DISubprogram::Verify() const {
if (!DbgNode)
return false;
- if (!getContext().Verify())
- return false;
-
- DICompileUnit CU = getCompileUnit();
- if (!CU.Verify())
+ if (getContext() && !getContext().Verify())
return false;
DICompositeType Ty = getType();
@@ -397,11 +433,7 @@ bool DIGlobalVariable::Verify() const {
if (getDisplayName().empty())
return false;
- if (!getContext().Verify())
- return false;
-
- DICompileUnit CU = getCompileUnit();
- if (!CU.Verify())
+ if (getContext() && !getContext().Verify())
return false;
DIType Ty = getType();
@@ -419,10 +451,7 @@ bool DIVariable::Verify() const {
if (!DbgNode)
return false;
- if (!getContext().Verify())
- return false;
-
- if (!getCompileUnit().Verify())
+ if (getContext() && !getContext().Verify())
return false;
DIType Ty = getType();
@@ -446,8 +475,6 @@ bool DINameSpace::Verify() const {
return false;
if (getName().empty())
return false;
- if (!getCompileUnit().Verify())
- return false;
return true;
}
@@ -504,9 +531,28 @@ unsigned DISubprogram::isOptimized() const {
return 0;
}
+MDNode *DISubprogram::getVariablesNodes() const {
+ if (!DbgNode || DbgNode->getNumOperands() <= 19)
+ return NULL;
+ if (MDNode *Temp = dyn_cast_or_null<MDNode>(DbgNode->getOperand(19)))
+ return dyn_cast_or_null<MDNode>(Temp->getOperand(0));
+ return NULL;
+}
+
+DIArray DISubprogram::getVariables() const {
+ if (!DbgNode || DbgNode->getNumOperands() <= 19)
+ return DIArray();
+ if (MDNode *T = dyn_cast_or_null<MDNode>(DbgNode->getOperand(19)))
+ if (MDNode *A = dyn_cast_or_null<MDNode>(T->getOperand(0)))
+ return DIArray(A);
+ return DIArray();
+}
+
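A sketch of a consumer loop; SP is assumed to be a DISubprogram built by a current-version DIBuilder, and dbgs() to be in scope via llvm/Support/Debug.h:

  DIArray Vars = SP.getVariables();
  for (unsigned i = 0, e = Vars.getNumElements(); i != e; ++i) {
    DIVariable DV(Vars.getElement(i));
    if (DV.Verify())
      dbgs() << DV.getName() << '\n';   // each local or argument variable
  }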
StringRef DIScope::getFilename() const {
if (!DbgNode)
return StringRef();
+ if (isLexicalBlockFile())
+ return DILexicalBlockFile(DbgNode).getFilename();
if (isLexicalBlock())
return DILexicalBlock(DbgNode).getFilename();
if (isSubprogram())
@@ -526,6 +572,8 @@ StringRef DIScope::getFilename() const {
StringRef DIScope::getDirectory() const {
if (!DbgNode)
return StringRef();
+ if (isLexicalBlockFile())
+ return DILexicalBlockFile(DbgNode).getDirectory();
if (isLexicalBlock())
return DILexicalBlock(DbgNode).getDirectory();
if (isSubprogram())
@@ -542,6 +590,47 @@ StringRef DIScope::getDirectory() const {
return StringRef();
}
+DIArray DICompileUnit::getEnumTypes() const {
+ if (!DbgNode || DbgNode->getNumOperands() < 14)
+ return DIArray();
+
+ if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(10)))
+ if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0)))
+ return DIArray(A);
+ return DIArray();
+}
+
+DIArray DICompileUnit::getRetainedTypes() const {
+ if (!DbgNode || DbgNode->getNumOperands() < 14)
+ return DIArray();
+
+ if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(11)))
+ if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0)))
+ return DIArray(A);
+ return DIArray();
+}
+
+DIArray DICompileUnit::getSubprograms() const {
+ if (!DbgNode || DbgNode->getNumOperands() < 14)
+ return DIArray();
+
+ if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(12)))
+ if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0)))
+ return DIArray(A);
+ return DIArray();
+}
+
+
+DIArray DICompileUnit::getGlobalVariables() const {
+ if (!DbgNode || DbgNode->getNumOperands() < 14)
+ return DIArray();
+
+ if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(13)))
+ if (MDNode *A = dyn_cast_or_null<MDNode>(N->getOperand(0)))
+ return DIArray(A);
+ return DIArray();
+}
+
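Taken together, these accessors let a pass start from the single llvm.dbg.cu anchor rather than the old llvm.dbg.sp/gv/enum nodes; a hedged sketch, with M an llvm::Module:

  if (NamedMDNode *CUs = M.getNamedMetadata("llvm.dbg.cu"))
    for (unsigned i = 0, e = CUs->getNumOperands(); i != e; ++i) {
      DICompileUnit CU(CUs->getOperand(i));
      dbgs() << CU.getFilename() << ": "
             << CU.getSubprograms().getNumElements() << " subprograms, "
             << CU.getGlobalVariables().getNumElements() << " globals\n";
    }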
//===----------------------------------------------------------------------===//
// DIDescriptor: dump routines for all descriptors.
//===----------------------------------------------------------------------===//
@@ -573,7 +662,6 @@ void DIType::print(raw_ostream &OS) const {
OS << " [" << dwarf::TagString(Tag) << "] ";
// TODO : Print context
- getCompileUnit().print(OS);
OS << " ["
<< "line " << getLineNumber() << ", "
<< getSizeInBits() << " bits, "
@@ -629,7 +717,6 @@ void DISubprogram::print(raw_ostream &OS) const {
OS << " [" << dwarf::TagString(Tag) << "] ";
// TODO : Print context
- getCompileUnit().print(OS);
OS << " [" << getLineNumber() << "] ";
if (isLocalToUnit())
@@ -652,7 +739,6 @@ void DIGlobalVariable::print(raw_ostream &OS) const {
OS << " [" << dwarf::TagString(Tag) << "] ";
// TODO : Print context
- getCompileUnit().print(OS);
OS << " [" << getLineNumber() << "] ";
if (isLocalToUnit())
@@ -666,13 +752,48 @@ void DIGlobalVariable::print(raw_ostream &OS) const {
OS << "]\n";
}
+static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
+ const LLVMContext &Ctx) {
+ if (!DL.isUnknown()) { // Print source line info.
+ DIScope Scope(DL.getScope(Ctx));
+ // Omit the directory, because it's likely to be long and uninteresting.
+ if (Scope.Verify())
+ CommentOS << Scope.getFilename();
+ else
+ CommentOS << "<unknown>";
+ CommentOS << ':' << DL.getLine();
+ if (DL.getCol() != 0)
+ CommentOS << ':' << DL.getCol();
+ DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(DL.getInlinedAt(Ctx));
+ if (!InlinedAtDL.isUnknown()) {
+ CommentOS << " @[ ";
+ printDebugLoc(InlinedAtDL, CommentOS, Ctx);
+ CommentOS << " ]";
+ }
+ }
+}
+
+void DIVariable::printExtendedName(raw_ostream &OS) const {
+ const LLVMContext &Ctx = DbgNode->getContext();
+ StringRef Res = getName();
+ if (!Res.empty())
+ OS << Res << "," << getLineNumber();
+ if (MDNode *InlinedAt = getInlinedAt()) {
+ DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt);
+ if (!InlinedAtDL.isUnknown()) {
+ OS << " @[";
+ printDebugLoc(InlinedAtDL, OS, Ctx);
+ OS << "]";
+ }
+ }
+}
+
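Sketch of the emitted form; the variable and locations are hypothetical, and DV is assumed to be a DIVariable with an inlined-at location:

  std::string Buf;
  raw_string_ostream OS(Buf);
  DV.printExtendedName(OS);            // e.g. "x,4 @[ bar.c:10:3 ]"
  dbgs() << OS.str() << '\n';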
/// print - Print variable.
void DIVariable::print(raw_ostream &OS) const {
StringRef Res = getName();
if (!Res.empty())
OS << " [" << Res << "] ";
- getCompileUnit().print(OS);
OS << " [" << getLineNumber() << "] ";
getType().print(OS);
OS << "\n";
@@ -744,22 +865,61 @@ static void fixupObjcLikeName(StringRef Str, SmallVectorImpl<char> &Out) {
/// getFnSpecificMDNode - Return a NamedMDNode, if available, that is
/// suitable to hold function specific information.
-NamedMDNode *llvm::getFnSpecificMDNode(const Module &M, StringRef FuncName) {
+NamedMDNode *llvm::getFnSpecificMDNode(const Module &M, DISubprogram Fn) {
SmallString<32> Name = StringRef("llvm.dbg.lv.");
- fixupObjcLikeName(FuncName, Name);
-
+ StringRef FName = "fn";
+ if (Fn.getFunction())
+ FName = Fn.getFunction()->getName();
+ else
+ FName = Fn.getName();
+ char One = '\1';
+ if (FName.startswith(StringRef(&One, 1)))
+ FName = FName.substr(1);
+ fixupObjcLikeName(FName, Name);
return M.getNamedMetadata(Name.str());
}
/// getOrInsertFnSpecificMDNode - Return a NamedMDNode that is suitable
/// to hold function specific information.
-NamedMDNode *llvm::getOrInsertFnSpecificMDNode(Module &M, StringRef FuncName) {
+NamedMDNode *llvm::getOrInsertFnSpecificMDNode(Module &M, DISubprogram Fn) {
SmallString<32> Name = StringRef("llvm.dbg.lv.");
- fixupObjcLikeName(FuncName, Name);
-
+ StringRef FName = "fn";
+ if (Fn.getFunction())
+ FName = Fn.getFunction()->getName();
+ else
+ FName = Fn.getName();
+ char One = '\1';
+ if (FName.startswith(StringRef(&One, 1)))
+ FName = FName.substr(1);
+ fixupObjcLikeName(FName, Name);
+
return M.getOrInsertNamedMetadata(Name.str());
}
+/// createInlinedVariable - Create a new inlined variable based on the
+/// current variable.
+/// @param DV           Current variable.
+/// @param InlinedScope Location at which the current variable is inlined.
+DIVariable llvm::createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
+ LLVMContext &VMContext) {
+ SmallVector<Value *, 16> Elts;
+ // Replace operand 7 with the inlined scope.
+ for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
+ i == 7 ? Elts.push_back(InlinedScope) :
+ Elts.push_back(DV->getOperand(i));
+ return DIVariable(MDNode::get(VMContext, Elts));
+}
+
+/// cleanseInlinedVariable - Remove inlined scope from the variable.
+DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
+ SmallVector<Value *, 16> Elts;
+ // Replace operand 7 (the inlined scope) with a null value.
+ for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
+ i == 7 ?
+ Elts.push_back(llvm::Constant::getNullValue(Type::getInt32Ty(VMContext))):
+ Elts.push_back(DV->getOperand(i));
+ return DIVariable(MDNode::get(VMContext, Elts));
+}
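A round-trip sketch; DV is assumed to be the MDNode of an existing current-version DIVariable, InlinedAt an inline-location MDNode, and Ctx the owning LLVMContext:

  DIVariable Inlined = createInlinedVariable(DV, InlinedAt, Ctx);
  assert(Inlined.getInlinedAt() == InlinedAt);   // operand 7 now set
  DIVariable Plain = cleanseInlinedVariable(Inlined, Ctx);
  assert(Plain.getInlinedAt() == 0);             // operand 7 nulled again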
//===----------------------------------------------------------------------===//
// DebugInfoFinder implementations.
@@ -767,6 +927,10 @@ NamedMDNode *llvm::getOrInsertFnSpecificMDNode(Module &M, StringRef FuncName) {
/// processModule - Process entire module and collect debug info.
void DebugInfoFinder::processModule(Module &M) {
+ if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu"))
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i)
+ addCompileUnit(DICompileUnit(CU_Nodes->getOperand(i)));
+
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
for (Function::iterator FI = (*I).begin(), FE = (*I).end(); FI != FE; ++FI)
for (BasicBlock::iterator BI = (*FI).begin(), BE = (*FI).end(); BI != BE;
@@ -785,6 +949,10 @@ void DebugInfoFinder::processModule(Module &M) {
addCompileUnit(DICompileUnit(Scope));
else if (Scope.isSubprogram())
processSubprogram(DISubprogram(Scope));
+ else if (Scope.isLexicalBlockFile()) {
+ DILexicalBlockFile DBF = DILexicalBlockFile(Scope);
+ processLexicalBlock(DILexicalBlock(DBF.getScope()));
+ }
else if (Scope.isLexicalBlock())
processLexicalBlock(DILexicalBlock(Scope));
@@ -796,7 +964,8 @@ void DebugInfoFinder::processModule(Module &M) {
for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
DIGlobalVariable DIG(cast<MDNode>(NMD->getOperand(i)));
if (addGlobalVariable(DIG)) {
- addCompileUnit(DIG.getCompileUnit());
+ if (DIG.getVersion() <= LLVMDebugVersion10)
+ addCompileUnit(DIG.getCompileUnit());
processType(DIG.getType());
}
}
@@ -817,6 +986,10 @@ void DebugInfoFinder::processLocation(DILocation Loc) {
processSubprogram(DISubprogram(S));
else if (S.isLexicalBlock())
processLexicalBlock(DILexicalBlock(S));
+ else if (S.isLexicalBlockFile()) {
+ DILexicalBlockFile DBF = DILexicalBlockFile(S);
+ processLexicalBlock(DILexicalBlock(DBF.getScope()));
+ }
processLocation(Loc.getOrigLocation());
}
@@ -824,8 +997,8 @@ void DebugInfoFinder::processLocation(DILocation Loc) {
void DebugInfoFinder::processType(DIType DT) {
if (!addType(DT))
return;
-
- addCompileUnit(DT.getCompileUnit());
+ if (DT.getVersion() <= LLVMDebugVersion10)
+ addCompileUnit(DT.getCompileUnit());
if (DT.isCompositeType()) {
DICompositeType DCT(DT);
processType(DCT.getTypeDerivedFrom());
@@ -848,6 +1021,10 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
DIScope Context = LB.getContext();
if (Context.isLexicalBlock())
return processLexicalBlock(DILexicalBlock(Context));
+ else if (Context.isLexicalBlockFile()) {
+ DILexicalBlockFile DBF = DILexicalBlockFile(Context);
+ return processLexicalBlock(DILexicalBlock(DBF.getScope()));
+ }
else
return processSubprogram(DISubprogram(Context));
}
@@ -856,7 +1033,8 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
void DebugInfoFinder::processSubprogram(DISubprogram SP) {
if (!addSubprogram(SP))
return;
- addCompileUnit(SP.getCompileUnit());
+ if (SP.getVersion() <= LLVMDebugVersion10)
+ addCompileUnit(SP.getCompileUnit());
processType(SP.getType());
}
@@ -871,8 +1049,8 @@ void DebugInfoFinder::processDeclare(DbgDeclareInst *DDI) {
if (!NodesSeen.insert(DV))
return;
-
- addCompileUnit(DIVariable(N).getCompileUnit());
+ if (DIVariable(N).getVersion() <= LLVMDebugVersion10)
+ addCompileUnit(DIVariable(N).getCompileUnit());
processType(DIVariable(N).getType());
}
@@ -930,6 +1108,9 @@ DISubprogram llvm::getDISubprogram(const MDNode *Scope) {
if (D.isSubprogram())
return DISubprogram(Scope);
+ if (D.isLexicalBlockFile())
+ return getDISubprogram(DILexicalBlockFile(Scope).getContext());
+
if (D.isLexicalBlock())
return getDISubprogram(DILexicalBlock(Scope).getContext());
@@ -946,3 +1127,17 @@ DICompositeType llvm::getDICompositeType(DIType T) {
return DICompositeType();
}
+
+/// isSubprogramContext - Return true if Context is either a subprogram
+/// or another context nested inside a subprogram.
+bool llvm::isSubprogramContext(const MDNode *Context) {
+ if (!Context)
+ return false;
+ DIDescriptor D(Context);
+ if (D.isSubprogram())
+ return true;
+ if (D.isType())
+ return isSubprogramContext(DIType(Context).getContext());
+ return false;
+}
+
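A tiny sketch of the distinction this draws; LocalStructTy is a hypothetical MDNode for a struct defined inside a function:

  bool InFn = isSubprogramContext(DIType(LocalStructTy).getContext());
  // InFn is true: the context chain walks struct -> enclosing subprogram.
  // For a file-scope type the chain ends at the compile unit, giving false.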
diff --git a/lib/Analysis/IPA/CMakeLists.txt b/lib/Analysis/IPA/CMakeLists.txt
index 8ffef29..eae83fd 100644
--- a/lib/Analysis/IPA/CMakeLists.txt
+++ b/lib/Analysis/IPA/CMakeLists.txt
@@ -5,3 +5,9 @@ add_llvm_library(LLVMipa
GlobalsModRef.cpp
IPA.cpp
)
+
+add_llvm_library_dependencies(LLVMipa
+ LLVMAnalysis
+ LLVMCore
+ LLVMSupport
+ )
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index 659ffab..963da75 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -44,8 +44,8 @@ namespace {
class CGPassManager : public ModulePass, public PMDataManager {
public:
static char ID;
- explicit CGPassManager(int Depth)
- : ModulePass(ID), PMDataManager(Depth) { }
+ explicit CGPassManager()
+ : ModulePass(ID), PMDataManager() { }
/// run - Execute all of the passes scheduled for execution. Keep track of
/// whether any of the passes modifies the module, and if so, return true.
@@ -350,6 +350,7 @@ bool CGPassManager::RefreshCallGraph(CallGraphSCC &CurSCC,
dbgs() << "CGSCCPASSMGR: SCC Refresh didn't change call graph.\n";
}
);
+ (void)MadeChange;
return DevirtualizedCall;
}
@@ -542,7 +543,7 @@ void CallGraphSCCPass::assignPassManager(PMStack &PMS,
PMDataManager *PMD = PMS.top();
// [1] Create new Call Graph Pass Manager
- CGP = new CGPassManager(PMD->getDepth() + 1);
+ CGP = new CGPassManager();
// [2] Set up new manager's top level manager
PMTopLevelManager *TPM = PMD->getTopLevelManager();
diff --git a/lib/Analysis/IPA/FindUsedTypes.cpp b/lib/Analysis/IPA/FindUsedTypes.cpp
index 6535786..e9df3ca 100644
--- a/lib/Analysis/IPA/FindUsedTypes.cpp
+++ b/lib/Analysis/IPA/FindUsedTypes.cpp
@@ -29,7 +29,7 @@ INITIALIZE_PASS(FindUsedTypes, "print-used-types",
// IncorporateType - Incorporate one type and all of its subtypes into the
// collection of used types.
//
-void FindUsedTypes::IncorporateType(const Type *Ty) {
+void FindUsedTypes::IncorporateType(Type *Ty) {
// If ty doesn't already exist in the used types map, add it now, otherwise
// return.
if (!UsedTypes.insert(Ty)) return; // Already contains Ty.
@@ -94,7 +94,7 @@ bool FindUsedTypes::runOnModule(Module &m) {
//
void FindUsedTypes::print(raw_ostream &OS, const Module *M) const {
OS << "Types in use by this module:\n";
- for (SetVector<const Type *>::const_iterator I = UsedTypes.begin(),
+ for (SetVector<Type *>::const_iterator I = UsedTypes.begin(),
E = UsedTypes.end(); I != E; ++I) {
OS << " " << **I << '\n';
}
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index e5f0a77..d0ca892 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -146,7 +146,8 @@ bool IVUsers::AddUsersIfInteresting(Instruction *I) {
ISE, User, I,
NewUse.PostIncLoops,
*SE, *DT);
- DEBUG(dbgs() << " NORMALIZED TO: " << *ISE << '\n');
+ DEBUG(if (SE->getSCEV(I) != ISE)
+ dbgs() << " NORMALIZED TO: " << *ISE << '\n');
}
}
return true;
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index efde598..e12e322 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -15,6 +15,7 @@
#include "llvm/Support/CallSite.h"
#include "llvm/CallingConv.h"
#include "llvm/IntrinsicInst.h"
+#include "llvm/Target/TargetData.h"
#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
@@ -24,13 +25,13 @@ using namespace llvm;
/// TODO: Perhaps calls like memcpy, strcpy, etc?
bool llvm::callIsSmall(const Function *F) {
if (!F) return false;
-
+
if (F->hasLocalLinkage()) return false;
-
+
if (!F->hasName()) return false;
-
+
StringRef Name = F->getName();
-
+
// These will all likely lower to a single selection DAG node.
if (Name == "copysign" || Name == "copysignf" || Name == "copysignl" ||
Name == "fabs" || Name == "fabsf" || Name == "fabsl" ||
@@ -38,7 +39,7 @@ bool llvm::callIsSmall(const Function *F) {
Name == "cos" || Name == "cosf" || Name == "cosl" ||
Name == "sqrt" || Name == "sqrtf" || Name == "sqrtl" )
return true;
-
+
// These are all likely to be optimized into something smaller.
if (Name == "pow" || Name == "powf" || Name == "powl" ||
Name == "exp2" || Name == "exp2l" || Name == "exp2f" ||
@@ -46,13 +47,14 @@ bool llvm::callIsSmall(const Function *F) {
Name == "round" || Name == "ffs" || Name == "ffsl" ||
Name == "abs" || Name == "labs" || Name == "llabs")
return true;
-
+
return false;
}
/// analyzeBasicBlock - Fill in the current structure with information gleaned
/// from the specified block.
-void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
+void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
+ const TargetData *TD) {
++NumBlocks;
unsigned NumInstsBeforeThisBB = NumInsts;
for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
@@ -67,8 +69,8 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
ImmutableCallSite CS(cast<Instruction>(II));
if (const Function *F = CS.getCalledFunction()) {
- // If a function is both internal and has a single use, then it is
- // extremely likely to get inlined in the future (it was probably
+ // If a function is both internal and has a single use, then it is
+ // extremely likely to get inlined in the future (it was probably
// exposed by an interleaved devirtualization pass).
if (F->hasInternalLinkage() && F->hasOneUse())
++NumInlineCandidates;
@@ -91,20 +93,25 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
++NumCalls;
}
}
-
+
if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
if (!AI->isStaticAlloca())
this->usesDynamicAlloca = true;
}
if (isa<ExtractElementInst>(II) || II->getType()->isVectorTy())
- ++NumVectorInsts;
-
+ ++NumVectorInsts;
+
if (const CastInst *CI = dyn_cast<CastInst>(II)) {
// Noop casts, including ptr <-> int, don't count.
- if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) ||
+ if (CI->isLosslessCast() || isa<IntToPtrInst>(CI) ||
isa<PtrToIntInst>(CI))
continue;
+ // trunc to a native type is free (assuming the target has compare and
+ // shift-right of the same width).
+ if (isa<TruncInst>(CI) && TD &&
+ TD->isLegalInteger(TD->getTypeSizeInBits(CI->getType())))
+ continue;
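Worked example of the new heuristic, with a hypothetical data-layout string:

  // With TargetData TD("e-n32:64"): TD.isLegalInteger(32) is true, so a
  // `trunc i64 %x to i32` is skipped as free; TD.isLegalInteger(17) is
  // false, so `trunc i64 %x to i17` still counts toward NumInsts.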
// Result of a cmp instruction is often extended (to be used by other
// cmp instructions, logical or return instructions). These are usually
// nop on most sane targets.
@@ -119,10 +126,10 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB) {
++NumInsts;
}
-
+
if (isa<ReturnInst>(BB->getTerminator()))
++NumRets;
-
+
// We never want to inline functions that contain an indirectbr. This is
// incorrect because all the blockaddress's (in static global initializers
// for example) would be referring to the original function, and this indirect
@@ -217,7 +224,7 @@ unsigned CodeMetrics::CountCodeReductionForAlloca(Value *V) {
/// analyzeFunction - Fill in the current structure with information gleaned
/// from the specified function.
-void CodeMetrics::analyzeFunction(Function *F) {
+void CodeMetrics::analyzeFunction(Function *F, const TargetData *TD) {
// If this function contains a call to setjmp or _setjmp, never inline
// it. This is a hack because we depend on the user marking their local
// variables as volatile if they are live across a setjmp call, and they
@@ -227,13 +234,14 @@ void CodeMetrics::analyzeFunction(Function *F) {
// Look at the size of the callee.
for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
- analyzeBasicBlock(&*BB);
+ analyzeBasicBlock(&*BB, TD);
}
/// analyzeFunction - Fill in the current structure with information gleaned
/// from the specified function.
-void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F) {
- Metrics.analyzeFunction(F);
+void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F,
+ const TargetData *TD) {
+ Metrics.analyzeFunction(F, TD);
// A function with exactly one return has it removed during the inlining
// process (see InlineFunction), so don't count it.
@@ -252,7 +260,7 @@ void InlineCostAnalyzer::FunctionInfo::analyzeFunction(Function *F) {
/// NeverInline - returns true if the function should never be inlined into
/// any caller
bool InlineCostAnalyzer::FunctionInfo::NeverInline() {
- return (Metrics.callsSetJmp || Metrics.isRecursive ||
+ return (Metrics.callsSetJmp || Metrics.isRecursive ||
Metrics.containsIndirectBr);
}
// getSpecializationBonus - The heuristic used to determine the per-call
@@ -263,19 +271,19 @@ int InlineCostAnalyzer::getSpecializationBonus(Function *Callee,
{
if (Callee->mayBeOverridden())
return 0;
-
+
int Bonus = 0;
// If this function uses the coldcc calling convention, prefer not to
// specialize it.
if (Callee->getCallingConv() == CallingConv::Cold)
Bonus -= InlineConstants::ColdccPenalty;
-
+
// Get information about the callee.
FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee];
-
+
// If we haven't calculated this information yet, do so now.
if (CalleeFI->Metrics.NumBlocks == 0)
- CalleeFI->analyzeFunction(Callee);
+ CalleeFI->analyzeFunction(Callee, TD);
unsigned ArgNo = 0;
unsigned i = 0;
@@ -286,7 +294,7 @@ int InlineCostAnalyzer::getSpecializationBonus(Function *Callee,
Bonus += CountBonusForConstant(I);
}
- // Calls usually take a long time, so they make the specialization gain
+ // Calls usually take a long time, so they make the specialization gain
// smaller.
Bonus -= CalleeFI->Metrics.NumCalls * InlineConstants::CallPenalty;
@@ -300,13 +308,13 @@ int InlineCostAnalyzer::getSpecializationBonus(Function *Callee,
// inlining because we decide we don't want to give a bonus for
// devirtualizing.
int InlineCostAnalyzer::ConstantFunctionBonus(CallSite CS, Constant *C) {
-
+
// This could just be NULL.
if (!C) return 0;
-
+
Function *F = dyn_cast<Function>(C);
if (!F) return 0;
-
+
int Bonus = InlineConstants::IndirectCallBonus + getInlineSize(CS, F);
return (Bonus > 0) ? 0 : Bonus;
}
@@ -355,18 +363,18 @@ int InlineCostAnalyzer::CountBonusForConstant(Value *V, Constant *C) {
Bonus += CountBonusForConstant(&Inst);
}
}
-
+
return Bonus;
}
int InlineCostAnalyzer::getInlineSize(CallSite CS, Function *Callee) {
// Get information about the callee.
FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee];
-
+
// If we haven't calculated this information yet, do so now.
if (CalleeFI->Metrics.NumBlocks == 0)
- CalleeFI->analyzeFunction(Callee);
-
+ CalleeFI->analyzeFunction(Callee, TD);
+
// InlineCost - This value measures how good of an inline candidate this call
// site is to inline. A lower inline cost makes it more likely for the call to
// be inlined. This value may go negative.
@@ -392,9 +400,9 @@ int InlineCostAnalyzer::getInlineSize(CallSite CS, Function *Callee) {
// weights calculated for the callee to determine how much will be folded
// away with this information.
else if (isa<Constant>(I))
- InlineCost -= CalleeFI->ArgumentWeights[ArgNo].ConstantWeight;
+ InlineCost -= CalleeFI->ArgumentWeights[ArgNo].ConstantWeight;
}
-
+
// Each argument passed in has a cost at both the caller and the callee
// sides. Measurements show that each argument costs about the same as an
// instruction.
@@ -408,28 +416,28 @@ int InlineCostAnalyzer::getInlineSize(CallSite CS, Function *Callee) {
// Look at the size of the callee. Each instruction counts as 5.
InlineCost += CalleeFI->Metrics.NumInsts*InlineConstants::InstrCost;
-
+
return InlineCost;
}
int InlineCostAnalyzer::getInlineBonuses(CallSite CS, Function *Callee) {
// Get information about the callee.
FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee];
-
+
// If we haven't calculated this information yet, do so now.
if (CalleeFI->Metrics.NumBlocks == 0)
- CalleeFI->analyzeFunction(Callee);
-
+ CalleeFI->analyzeFunction(Callee, TD);
+
bool isDirectCall = CS.getCalledFunction() == Callee;
Instruction *TheCall = CS.getInstruction();
int Bonus = 0;
-
+
// If there is only one call of the function, and it has internal linkage,
// make it almost guaranteed to be inlined.
//
if (Callee->hasLocalLinkage() && Callee->hasOneUse() && isDirectCall)
Bonus += InlineConstants::LastCallToStaticBonus;
-
+
// If the instruction after the call, or if the normal destination of the
// invoke is an unreachable instruction, the function is noreturn. As such,
// there is little point in inlining this.
@@ -438,12 +446,12 @@ int InlineCostAnalyzer::getInlineBonuses(CallSite CS, Function *Callee) {
Bonus += InlineConstants::NoreturnPenalty;
} else if (isa<UnreachableInst>(++BasicBlock::iterator(TheCall)))
Bonus += InlineConstants::NoreturnPenalty;
-
+
// If this function uses the coldcc calling convention, prefer not to inline
// it.
if (Callee->getCallingConv() == CallingConv::Cold)
Bonus += InlineConstants::ColdccPenalty;
-
+
// Add to the inline quality for properties that make the call valuable to
// inline. This includes factors that indicate that the result of inlining
// the function will be optimizable. Currently this just looks at arguments
@@ -455,7 +463,7 @@ int InlineCostAnalyzer::getInlineBonuses(CallSite CS, Function *Callee) {
// Compute any constant bonus due to inlining we want to give here.
if (isa<Constant>(I))
Bonus += CountBonusForConstant(FI, cast<Constant>(I));
-
+
return Bonus;
}
@@ -483,10 +491,10 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS,
// Get information about the callee.
FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee];
-
+
// If we haven't calculated this information yet, do so now.
if (CalleeFI->Metrics.NumBlocks == 0)
- CalleeFI->analyzeFunction(Callee);
+ CalleeFI->analyzeFunction(Callee, TD);
// If we should never inline this, return a huge cost.
if (CalleeFI->NeverInline())
@@ -498,15 +506,15 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS,
// requires handling setjmp somewhere else, however.
if (!Callee->isDeclaration() && Callee->hasFnAttr(Attribute::AlwaysInline))
return InlineCost::getAlways();
-
+
if (CalleeFI->Metrics.usesDynamicAlloca) {
// Get information about the caller.
FunctionInfo &CallerFI = CachedFunctionInfo[Caller];
// If we haven't calculated this information yet, do so now.
if (CallerFI.Metrics.NumBlocks == 0) {
- CallerFI.analyzeFunction(Caller);
-
+ CallerFI.analyzeFunction(Caller, TD);
+
// Recompute the CalleeFI pointer, getting Caller could have invalidated
// it.
CalleeFI = &CachedFunctionInfo[Callee];
@@ -538,16 +546,16 @@ InlineCost InlineCostAnalyzer::getSpecializationCost(Function *Callee,
// something else.
if (Callee->mayBeOverridden())
return llvm::InlineCost::getNever();
-
+
// Get information about the callee.
FunctionInfo *CalleeFI = &CachedFunctionInfo[Callee];
-
+
// If we haven't calculated this information yet, do so now.
if (CalleeFI->Metrics.NumBlocks == 0)
- CalleeFI->analyzeFunction(Callee);
+ CalleeFI->analyzeFunction(Callee, TD);
int Cost = 0;
-
+
// Look at the original size of the callee. Each instruction counts as 5.
Cost += CalleeFI->Metrics.NumInsts * InlineConstants::InstrCost;
@@ -564,13 +572,13 @@ InlineCost InlineCostAnalyzer::getSpecializationCost(Function *Callee,
// higher threshold to determine if the function call should be inlined.
float InlineCostAnalyzer::getInlineFudgeFactor(CallSite CS) {
Function *Callee = CS.getCalledFunction();
-
+
// Get information about the callee.
FunctionInfo &CalleeFI = CachedFunctionInfo[Callee];
-
+
// If we haven't calculated this information yet, do so now.
if (CalleeFI.Metrics.NumBlocks == 0)
- CalleeFI.analyzeFunction(Callee);
+ CalleeFI.analyzeFunction(Callee, TD);
float Factor = 1.0f;
// Single BB functions are often written to be inlined.
@@ -604,7 +612,7 @@ InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) {
--CallerMetrics.NumCalls;
if (Callee == 0) return;
-
+
CodeMetrics &CalleeMetrics = CachedFunctionInfo[Callee].Metrics;
// If we don't have metrics for the callee, don't recalculate them just to
@@ -614,7 +622,7 @@ InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) {
resetCachedCostInfo(Caller);
return;
}
-
+
// Since CalleeMetrics were already calculated, we know that the CallerMetrics
// reference isn't invalidated: both were in the DenseMap.
CallerMetrics.usesDynamicAlloca |= CalleeMetrics.usesDynamicAlloca;
@@ -636,7 +644,7 @@ InlineCostAnalyzer::growCachedCostInfo(Function *Caller, Function *Callee) {
CallerMetrics.NumInsts -= Callee->arg_size();
else
CallerMetrics.NumInsts = 0;
-
+
// We are not updating the argument weights. We have already determined that
// Caller is a fairly large function, so we accept the loss of precision.
}
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 8709f6b..131cc97 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -48,6 +48,26 @@ static Value *SimplifyOrInst(Value *, Value *, const TargetData *,
static Value *SimplifyXorInst(Value *, Value *, const TargetData *,
const DominatorTree *, unsigned);
+/// getFalse - For a boolean type, or a vector of boolean type, return false, or
+/// a vector with every element false, as appropriate for the type.
+static Constant *getFalse(Type *Ty) {
+ assert((Ty->isIntegerTy(1) ||
+ (Ty->isVectorTy() &&
+ cast<VectorType>(Ty)->getElementType()->isIntegerTy(1))) &&
+ "Expected i1 type or a vector of i1!");
+ return Constant::getNullValue(Ty);
+}
+
+/// getTrue - For a boolean type, or a vector of boolean type, return true, or
+/// a vector with every element true, as appropriate for the type.
+static Constant *getTrue(Type *Ty) {
+ assert((Ty->isIntegerTy(1) ||
+ (Ty->isVectorTy() &&
+ cast<VectorType>(Ty)->getElementType()->isIntegerTy(1))) &&
+ "Expected i1 type or a vector of i1!");
+ return Constant::getAllOnesValue(Ty);
+}
+
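A sketch of why the helpers exist (Ctx is an assumed LLVMContext): they extend true/false to vector-of-i1 result types, the case the removed "also works for vectors" comments below were working around:

  Type *V4I1 = VectorType::get(Type::getInt1Ty(Ctx), 4);
  Constant *AllTrue  = getTrue(V4I1);   // <4 x i1> all-ones
  Constant *AllFalse = getFalse(V4I1);  // <4 x i1> zeroinitializer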
/// ValueDominatesPHI - Does the given value dominate the specified phi node?
static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
Instruction *I = dyn_cast<Instruction>(V);
@@ -526,7 +546,7 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { CLHS, CRHS };
return ConstantFoldInstOperands(Instruction::Add, CLHS->getType(),
- Ops, 2, TD);
+ Ops, TD);
}
// Canonicalize the constant to the RHS.
@@ -595,7 +615,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { CLHS, CRHS };
return ConstantFoldInstOperands(Instruction::Sub, CLHS->getType(),
- Ops, 2, TD);
+ Ops, TD);
}
// X - undef -> undef
@@ -715,7 +735,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const TargetData *TD,
if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { CLHS, CRHS };
return ConstantFoldInstOperands(Instruction::Mul, CLHS->getType(),
- Ops, 2, TD);
+ Ops, TD);
}
// Canonicalize the constant to the RHS.
@@ -788,7 +808,7 @@ static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
if (Constant *C0 = dyn_cast<Constant>(Op0)) {
if (Constant *C1 = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { C0, C1 };
- return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, 2, TD);
+ return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, TD);
}
}
@@ -909,7 +929,7 @@ static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
if (Constant *C0 = dyn_cast<Constant>(Op0)) {
if (Constant *C1 = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { C0, C1 };
- return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, 2, TD);
+ return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, TD);
}
}
@@ -1012,7 +1032,7 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
if (Constant *C0 = dyn_cast<Constant>(Op0)) {
if (Constant *C1 = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { C0, C1 };
- return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, 2, TD);
+ return ConstantFoldInstOperands(Opcode, C0->getType(), Ops, TD);
}
}
@@ -1138,7 +1158,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const TargetData *TD,
if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { CLHS, CRHS };
return ConstantFoldInstOperands(Instruction::And, CLHS->getType(),
- Ops, 2, TD);
+ Ops, TD);
}
// Canonicalize the constant to the RHS.
@@ -1227,7 +1247,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const TargetData *TD,
if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { CLHS, CRHS };
return ConstantFoldInstOperands(Instruction::Or, CLHS->getType(),
- Ops, 2, TD);
+ Ops, TD);
}
// Canonicalize the constant to the RHS.
@@ -1321,7 +1341,7 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const TargetData *TD,
if (Constant *CRHS = dyn_cast<Constant>(Op1)) {
Constant *Ops[] = { CLHS, CRHS };
return ConstantFoldInstOperands(Instruction::Xor, CLHS->getType(),
- Ops, 2, TD);
+ Ops, TD);
}
// Canonicalize the constant to the RHS.
@@ -1372,7 +1392,7 @@ Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const TargetData *TD,
return ::SimplifyXorInst(Op0, Op1, TD, DT, RecursionLimit);
}
-static const Type *GetCompareTy(Value *Op) {
+static Type *GetCompareTy(Value *Op) {
return CmpInst::makeCmpResultType(Op->getType());
}
@@ -1413,8 +1433,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
Pred = CmpInst::getSwappedPredicate(Pred);
}
- const Type *ITy = GetCompareTy(LHS); // The return type.
- const Type *OpTy = LHS->getType(); // The operand type.
+ Type *ITy = GetCompareTy(LHS); // The return type.
+ Type *OpTy = LHS->getType(); // The operand type.
// icmp X, X -> true/false
// X icmp undef -> true/false. For example, icmp ugt %X, undef -> false
@@ -1478,48 +1498,46 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
default:
assert(false && "Unknown ICmp predicate!");
case ICmpInst::ICMP_ULT:
- // getNullValue also works for vectors, unlike getFalse.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
case ICmpInst::ICMP_UGE:
- // getAllOnesValue also works for vectors, unlike getTrue.
- return ConstantInt::getAllOnesValue(ITy);
+ return getTrue(ITy);
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULE:
if (isKnownNonZero(LHS, TD))
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
break;
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
if (isKnownNonZero(LHS, TD))
- return ConstantInt::getAllOnesValue(ITy);
+ return getTrue(ITy);
break;
case ICmpInst::ICMP_SLT:
ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
if (LHSKnownNegative)
- return ConstantInt::getAllOnesValue(ITy);
+ return getTrue(ITy);
if (LHSKnownNonNegative)
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
break;
case ICmpInst::ICMP_SLE:
ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
if (LHSKnownNegative)
- return ConstantInt::getAllOnesValue(ITy);
+ return getTrue(ITy);
if (LHSKnownNonNegative && isKnownNonZero(LHS, TD))
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
break;
case ICmpInst::ICMP_SGE:
ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
if (LHSKnownNegative)
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
if (LHSKnownNonNegative)
- return ConstantInt::getAllOnesValue(ITy);
+ return getTrue(ITy);
break;
case ICmpInst::ICMP_SGT:
ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, TD);
if (LHSKnownNegative)
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
if (LHSKnownNonNegative && isKnownNonZero(LHS, TD))
- return ConstantInt::getAllOnesValue(ITy);
+ return getTrue(ITy);
break;
}
}
@@ -1593,8 +1611,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (isa<CastInst>(LHS) && (isa<Constant>(RHS) || isa<CastInst>(RHS))) {
Instruction *LI = cast<CastInst>(LHS);
Value *SrcOp = LI->getOperand(0);
- const Type *SrcTy = SrcOp->getType();
- const Type *DstTy = LI->getType();
+ Type *SrcTy = SrcOp->getType();
+ Type *DstTy = LI->getType();
// Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input
// if the integer type is the same size as the pointer type.
@@ -1811,8 +1829,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
- // getNullValue also works for vectors, unlike getFalse.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
ComputeSignBit(LHS, KnownNonNegative, KnownNegative, TD);
@@ -1822,8 +1839,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
- // getAllOnesValue also works for vectors, unlike getTrue.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
}
}
if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
@@ -1840,8 +1856,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
case ICmpInst::ICMP_NE:
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_UGE:
- // getAllOnesValue also works for vectors, unlike getTrue.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
case ICmpInst::ICMP_SLT:
case ICmpInst::ICMP_SLE:
ComputeSignBit(RHS, KnownNonNegative, KnownNegative, TD);
@@ -1851,8 +1866,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
case ICmpInst::ICMP_EQ:
case ICmpInst::ICMP_ULT:
case ICmpInst::ICMP_ULE:
- // getNullValue also works for vectors, unlike getFalse.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
}
}
@@ -1874,7 +1888,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
return V;
break;
case Instruction::Shl: {
- bool NUW = LBO->hasNoUnsignedWrap() && LBO->hasNoUnsignedWrap();
+ bool NUW = LBO->hasNoUnsignedWrap() && RBO->hasNoUnsignedWrap();
bool NSW = LBO->hasNoSignedWrap() && RBO->hasNoSignedWrap();
if (!NUW && !NSW)
break;
@@ -1955,10 +1969,10 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
case CmpInst::ICMP_SGE:
// Always true.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
case CmpInst::ICMP_SLT:
// Always false.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
}
}
@@ -2025,10 +2039,10 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
case CmpInst::ICMP_UGE:
// Always true.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
case CmpInst::ICMP_ULT:
// Always false.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
}
}
@@ -2040,40 +2054,40 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
// max(x, ?) pred min(x, ?).
if (Pred == CmpInst::ICMP_SGE)
// Always true.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
if (Pred == CmpInst::ICMP_SLT)
// Always false.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
} else if (match(LHS, m_SMin(m_Value(A), m_Value(B))) &&
match(RHS, m_SMax(m_Value(C), m_Value(D))) &&
(A == C || A == D || B == C || B == D)) {
// min(x, ?) pred max(x, ?).
if (Pred == CmpInst::ICMP_SLE)
// Always true.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
if (Pred == CmpInst::ICMP_SGT)
// Always false.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
} else if (match(LHS, m_UMax(m_Value(A), m_Value(B))) &&
match(RHS, m_UMin(m_Value(C), m_Value(D))) &&
(A == C || A == D || B == C || B == D)) {
// max(x, ?) pred min(x, ?).
if (Pred == CmpInst::ICMP_UGE)
// Always true.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
if (Pred == CmpInst::ICMP_ULT)
// Always false.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
} else if (match(LHS, m_UMin(m_Value(A), m_Value(B))) &&
match(RHS, m_UMax(m_Value(C), m_Value(D))) &&
(A == C || A == D || B == C || B == D)) {
// min(x, ?) pred max(x, ?).
if (Pred == CmpInst::ICMP_ULE)
// Always true.
- return Constant::getAllOnesValue(ITy);
+ return getTrue(ITy);
if (Pred == CmpInst::ICMP_UGT)
// Always false.
- return Constant::getNullValue(ITy);
+ return getFalse(ITy);
}
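
A hedged illustration of the fold above, written against the 3.0-era headers;
the IRBuilder reference and the values X, A and B are hypothetical, and the
max/min operands must be spelled as icmp+select so the m_SMax/m_SMin matchers
recognize them:

    #include "llvm/Analysis/InstructionSimplify.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    Value *foldMaxGeMin(IRBuilder<> &IRB, Value *X, Value *A, Value *B) {
      // smax(x, a) and smin(x, b) in the icmp+select form the matchers expect.
      Value *Max = IRB.CreateSelect(IRB.CreateICmpSGT(X, A), X, A);
      Value *Min = IRB.CreateSelect(IRB.CreateICmpSLT(X, B), X, B);
      // Both sides share X, so "max >=s min" holds for any A and B; the
      // simplifier now answers with getTrue(i1) without creating an icmp.
      return SimplifyICmpInst(CmpInst::ICMP_SGE, Max, Min, 0, 0);
    }
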
// If the comparison is with the result of a select instruction, check whether
@@ -2219,43 +2233,71 @@ Value *llvm::SimplifySelectInst(Value *CondVal, Value *TrueVal, Value *FalseVal,
/// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can
/// fold the result. If not, this returns null.
-Value *llvm::SimplifyGEPInst(Value *const *Ops, unsigned NumOps,
+Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops,
const TargetData *TD, const DominatorTree *) {
// The type of the GEP pointer operand.
- const PointerType *PtrTy = cast<PointerType>(Ops[0]->getType());
+ PointerType *PtrTy = cast<PointerType>(Ops[0]->getType());
// getelementptr P -> P.
- if (NumOps == 1)
+ if (Ops.size() == 1)
return Ops[0];
if (isa<UndefValue>(Ops[0])) {
// Compute the (pointer) type returned by the GEP instruction.
- const Type *LastType = GetElementPtrInst::getIndexedType(PtrTy, &Ops[1],
- NumOps-1);
- const Type *GEPTy = PointerType::get(LastType, PtrTy->getAddressSpace());
+ Type *LastType = GetElementPtrInst::getIndexedType(PtrTy, Ops.slice(1));
+ Type *GEPTy = PointerType::get(LastType, PtrTy->getAddressSpace());
return UndefValue::get(GEPTy);
}
- if (NumOps == 2) {
+ if (Ops.size() == 2) {
// getelementptr P, 0 -> P.
if (ConstantInt *C = dyn_cast<ConstantInt>(Ops[1]))
if (C->isZero())
return Ops[0];
// getelementptr P, N -> P if P points to a type of zero size.
if (TD) {
- const Type *Ty = PtrTy->getElementType();
+ Type *Ty = PtrTy->getElementType();
if (Ty->isSized() && TD->getTypeAllocSize(Ty) == 0)
return Ops[0];
}
}
// Check to see if this is constant foldable.
- for (unsigned i = 0; i != NumOps; ++i)
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (!isa<Constant>(Ops[i]))
return 0;
- return ConstantExpr::getGetElementPtr(cast<Constant>(Ops[0]),
- (Constant *const*)Ops+1, NumOps-1);
+ return ConstantExpr::getGetElementPtr(cast<Constant>(Ops[0]), Ops.slice(1));
+}
+
+/// SimplifyInsertValueInst - Given operands for an InsertValueInst, see if we
+/// can fold the result. If not, this returns null.
+Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val,
+ ArrayRef<unsigned> Idxs,
+ const TargetData *,
+ const DominatorTree *) {
+ if (Constant *CAgg = dyn_cast<Constant>(Agg))
+ if (Constant *CVal = dyn_cast<Constant>(Val))
+ return ConstantFoldInsertValueInstruction(CAgg, CVal, Idxs);
+
+ // insertvalue x, undef, n -> x
+ if (match(Val, m_Undef()))
+ return Agg;
+
+ // insertvalue x, (extractvalue y, n), n
+ if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Val))
+ if (EV->getAggregateOperand()->getType() == Agg->getType() &&
+ EV->getIndices() == Idxs) {
+ // insertvalue undef, (extractvalue y, n), n -> y
+ if (match(Agg, m_Undef()))
+ return EV->getAggregateOperand();
+
+ // insertvalue y, (extractvalue y, n), n -> y
+ if (Agg == EV->getAggregateOperand())
+ return Agg;
+ }
+
+ return 0;
}
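
A hedged usage sketch for the new entry point, relying only on the signature
introduced above; Agg and EV are hypothetical values, and the fold fires only
when EV extracts index 0 from Agg itself:

    #include "llvm/Analysis/InstructionSimplify.h"
    #include "llvm/Instructions.h"
    using namespace llvm;

    Value *foldRoundTrip(Value *Agg, ExtractValueInst *EV) {
      // insertvalue %agg, (extractvalue %agg, 0), 0  -->  %agg
      unsigned Idxs[] = { 0 };
      if (Value *V = SimplifyInsertValueInst(Agg, EV, Idxs, 0, 0))
        return V;  // folded without materializing an insertvalue
      return 0;    // no fold; the caller builds the instruction itself
    }
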
/// SimplifyPHINode - See if we can fold the given phi. If not, returns null.
@@ -2328,7 +2370,7 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
if (Constant *CLHS = dyn_cast<Constant>(LHS))
if (Constant *CRHS = dyn_cast<Constant>(RHS)) {
Constant *COps[] = {CLHS, CRHS};
- return ConstantFoldInstOperands(Opcode, LHS->getType(), COps, 2, TD);
+ return ConstantFoldInstOperands(Opcode, LHS->getType(), COps, TD);
}
// If the operation is associative, try some generic simplifications.
@@ -2456,7 +2498,14 @@ Value *llvm::SimplifyInstruction(Instruction *I, const TargetData *TD,
break;
case Instruction::GetElementPtr: {
SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end());
- Result = SimplifyGEPInst(&Ops[0], Ops.size(), TD, DT);
+ Result = SimplifyGEPInst(Ops, TD, DT);
+ break;
+ }
+ case Instruction::InsertValue: {
+ InsertValueInst *IV = cast<InsertValueInst>(I);
+ Result = SimplifyInsertValueInst(IV->getAggregateOperand(),
+ IV->getInsertedValueOperand(),
+ IV->getIndices(), TD, DT);
break;
}
case Instruction::PHI:
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 6e27597..f80595c 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -630,7 +630,7 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV,
if (BB == &BB->getParent()->getEntryBlock()) {
assert(isa<Argument>(Val) && "Unknown live-in to the entry block");
if (NotNull) {
- const PointerType *PTy = cast<PointerType>(Val->getType());
+ PointerType *PTy = cast<PointerType>(Val->getType());
Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
} else {
Result.markOverdefined();
@@ -658,7 +658,7 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV,
// If we previously determined that this is a pointer that can't be null
// then return that rather than giving up entirely.
if (NotNull) {
- const PointerType *PTy = cast<PointerType>(Val->getType());
+ PointerType *PTy = cast<PointerType>(Val->getType());
Result = LVILatticeVal::getNot(ConstantPointerNull::get(PTy));
}
@@ -728,7 +728,7 @@ bool LazyValueInfoCache::solveBlockValueConstantRange(LVILatticeVal &BBLV,
ConstantRange LHSRange = LHSVal.getConstantRange();
ConstantRange RHSRange(1);
- const IntegerType *ResultTy = cast<IntegerType>(BBI->getType());
+ IntegerType *ResultTy = cast<IntegerType>(BBI->getType());
if (isa<BinaryOperator>(BBI)) {
if (ConstantInt *RHS = dyn_cast<ConstantInt>(BBI->getOperand(1))) {
RHSRange = ConstantRange(RHS->getValue());
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index 89755da..38d677d 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -71,7 +71,7 @@ namespace {
void visitCallSite(CallSite CS);
void visitMemoryReference(Instruction &I, Value *Ptr,
uint64_t Size, unsigned Align,
- const Type *Ty, unsigned Flags);
+ Type *Ty, unsigned Flags);
void visitCallInst(CallInst &I);
void visitInvokeInst(InvokeInst &I);
@@ -201,7 +201,7 @@ void Lint::visitCallSite(CallSite CS) {
"Undefined behavior: Caller and callee calling convention differ",
&I);
- const FunctionType *FT = F->getFunctionType();
+ FunctionType *FT = F->getFunctionType();
unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
Assert1(FT->isVarArg() ?
@@ -240,7 +240,7 @@ void Lint::visitCallSite(CallSite CS) {
// Check that an sret argument points to valid memory.
if (Formal->hasStructRetAttr() && Actual->getType()->isPointerTy()) {
- const Type *Ty =
+ Type *Ty =
cast<PointerType>(Formal->getType())->getElementType();
visitMemoryReference(I, Actual, AA->getTypeStoreSize(Ty),
TD ? TD->getABITypeAlignment(Ty) : 0,
@@ -364,7 +364,7 @@ void Lint::visitReturnInst(ReturnInst &I) {
// TODO: Check readnone/readonly function attributes.
void Lint::visitMemoryReference(Instruction &I,
Value *Ptr, uint64_t Size, unsigned Align,
- const Type *Ty, unsigned Flags) {
+ Type *Ty, unsigned Flags) {
// If no memory is being referenced, it doesn't matter if the pointer
// is valid.
if (Size == 0)
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index c5c676b..0e6bcbf 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -63,7 +63,7 @@ static Value *getUnderlyingObjectWithOffset(Value *V, const TargetData *TD,
return V;
SmallVector<Value*, 8> Indices(GEP->op_begin() + 1, GEP->op_end());
ByteOffset += TD->getIndexedOffset(GEP->getPointerOperandType(),
- &Indices[0], Indices.size());
+ Indices);
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
V = cast<Operator>(V)->getOperand(0);
@@ -90,7 +90,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
if (TD)
Base = getUnderlyingObjectWithOffset(V, TD, ByteOffset);
- const Type *BaseType = 0;
+ Type *BaseType = 0;
unsigned BaseAlign = 0;
if (const AllocaInst *AI = dyn_cast<AllocaInst>(Base)) {
// An alloca is safe to load from as load as it is suitably aligned.
@@ -114,7 +114,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
return true; // Loading directly from an alloca or global is OK.
// Check if the load is within the bounds of the underlying object.
- const PointerType *AddrTy = cast<PointerType>(V->getType());
+ PointerType *AddrTy = cast<PointerType>(V->getType());
uint64_t LoadSize = TD->getTypeStoreSize(AddrTy->getElementType());
if (ByteOffset + LoadSize <= TD->getTypeAllocSize(BaseType) &&
(Align == 0 || (ByteOffset % Align) == 0))
@@ -169,7 +169,7 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
// If we're using alias analysis to disambiguate get the size of *Ptr.
uint64_t AccessSize = 0;
if (AA) {
- const Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
+ Type *AccessTy = cast<PointerType>(Ptr->getType())->getElementType();
AccessSize = AA->getTypeStoreSize(AccessTy);
}
@@ -188,12 +188,16 @@ Value *llvm::FindAvailableLoadedValue(Value *Ptr, BasicBlock *ScanBB,
--ScanFrom;
// If this is a load of Ptr, the loaded value is available.
+ // (This is true even if the load is volatile or atomic, although
+ // those cases are unlikely.)
if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
if (AreEquivalentAddressValues(LI->getOperand(0), Ptr))
return LI;
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// If this is a store through Ptr, the value is available!
+ // (This is true even if the store is volatile or atomic, although
+ // those cases are unlikely.)
if (AreEquivalentAddressValues(SI->getOperand(1), Ptr))
return SI->getOperand(0);
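
For context, a hedged sketch of how this scan is usually driven; the signature
is the 3.0-era one from Loads.h, and the scan limit of 6 is an assumption:

    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/Analysis/Loads.h"
    using namespace llvm;

    Value *lookBack(LoadInst *LI, AliasAnalysis *AA) {
      // Scan backwards from LI within its own block. With the change above,
      // an earlier volatile or atomic access to the same address can now
      // satisfy the query as well.
      BasicBlock::iterator ScanFrom(LI);
      return FindAvailableLoadedValue(LI->getPointerOperand(), LI->getParent(),
                                      ScanFrom, 6, AA);
    }
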
diff --git a/lib/Analysis/LoopDependenceAnalysis.cpp b/lib/Analysis/LoopDependenceAnalysis.cpp
index c1afe8f..3997ac4 100644
--- a/lib/Analysis/LoopDependenceAnalysis.cpp
+++ b/lib/Analysis/LoopDependenceAnalysis.cpp
@@ -76,7 +76,13 @@ static void GetMemRefInstrs(const Loop *L,
}
static bool IsLoadOrStoreInst(Value *I) {
- return isa<LoadInst>(I) || isa<StoreInst>(I);
+ // Returns true if the load or store can be analyzed. Atomic and volatile
+ // operations have properties which this analysis does not understand.
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isUnordered();
+ else if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+ return false;
}
static Value *GetPointerOperand(Value *I) {
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 0583140..85aacca 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
@@ -55,12 +56,12 @@ bool Loop::isLoopInvariant(Value *V) const {
}
/// hasLoopInvariantOperands - Return true if all the operands of the
-/// specified instruction are loop invariant.
+/// specified instruction are loop invariant.
bool Loop::hasLoopInvariantOperands(Instruction *I) const {
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
if (!isLoopInvariant(I->getOperand(i)))
return false;
-
+
return true;
}
@@ -98,6 +99,9 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed,
return false;
if (I->mayReadFromMemory())
return false;
+ // The landingpad instruction is immobile.
+ if (isa<LandingPadInst>(I))
+ return false;
// Determine the insertion point, unless one was given.
if (!InsertPt) {
BasicBlock *Preheader = getLoopPreheader();
@@ -110,7 +114,7 @@ bool Loop::makeLoopInvariant(Instruction *I, bool &Changed,
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
if (!makeLoopInvariant(I->getOperand(i), Changed, InsertPt))
return false;
-
+
// Hoist.
I->moveBefore(InsertPt);
Changed = true;
@@ -383,6 +387,205 @@ void Loop::dump() const {
}
//===----------------------------------------------------------------------===//
+// UnloopUpdater implementation
+//
+
+namespace {
+/// Find the new parent loop for all blocks within the "unloop" whose last
+/// backedge has just been removed.
+class UnloopUpdater {
+ Loop *Unloop;
+ LoopInfo *LI;
+
+ LoopBlocksDFS DFS;
+
+ // Map unloop's immediate subloops to their nearest reachable parents. Nested
+ // loops within these subloops will not change parents. However, an immediate
+ // subloop's new parent will be the nearest loop reachable from either its own
+ // exits *or* any of its nested loops' exits.
+ DenseMap<Loop*, Loop*> SubloopParents;
+
+ // Flag the presence of an irreducible backedge whose destination is a block
+ // directly contained by the original unloop.
+ bool FoundIB;
+
+public:
+ UnloopUpdater(Loop *UL, LoopInfo *LInfo) :
+ Unloop(UL), LI(LInfo), DFS(UL), FoundIB(false) {}
+
+ void updateBlockParents();
+
+ void removeBlocksFromAncestors();
+
+ void updateSubloopParents();
+
+protected:
+ Loop *getNearestLoop(BasicBlock *BB, Loop *BBLoop);
+};
+} // end anonymous namespace
+
+/// updateBlockParents - Update the parent loop for all blocks that are directly
+/// contained within the original "unloop".
+void UnloopUpdater::updateBlockParents() {
+ if (Unloop->getNumBlocks()) {
+ // Perform a post order CFG traversal of all blocks within this loop,
+ // propagating the nearest loop from successors to predecessors.
+ LoopBlocksTraversal Traversal(DFS, LI);
+ for (LoopBlocksTraversal::POTIterator POI = Traversal.begin(),
+ POE = Traversal.end(); POI != POE; ++POI) {
+
+ Loop *L = LI->getLoopFor(*POI);
+ Loop *NL = getNearestLoop(*POI, L);
+
+ if (NL != L) {
+ // For reducible loops, NL is now an ancestor of Unloop.
+ assert((NL != Unloop && (!NL || NL->contains(Unloop))) &&
+ "uninitialized successor");
+ LI->changeLoopFor(*POI, NL);
+ }
+ else {
+ // Or the current block is part of a subloop, in which case its parent
+ // is unchanged.
+ assert((FoundIB || Unloop->contains(L)) && "uninitialized successor");
+ }
+ }
+ }
+ // Each irreducible loop within the unloop induces a round of iteration using
+ // the DFS result cached by Traversal.
+ bool Changed = FoundIB;
+ for (unsigned NIters = 0; Changed; ++NIters) {
+ assert(NIters < Unloop->getNumBlocks() && "runaway iterative algorithm");
+
+ // Iterate over the postorder list of blocks, propagating the nearest loop
+ // from successors to predecessors as before.
+ Changed = false;
+ for (LoopBlocksDFS::POIterator POI = DFS.beginPostorder(),
+ POE = DFS.endPostorder(); POI != POE; ++POI) {
+
+ Loop *L = LI->getLoopFor(*POI);
+ Loop *NL = getNearestLoop(*POI, L);
+ if (NL != L) {
+ assert(NL != Unloop && (!NL || NL->contains(Unloop)) &&
+ "uninitialized successor");
+ LI->changeLoopFor(*POI, NL);
+ Changed = true;
+ }
+ }
+ }
+}
+
+/// removeBlocksFromAncestors - Remove unloop's blocks from all ancestors below
+/// their new parents.
+void UnloopUpdater::removeBlocksFromAncestors() {
+ // Remove unloop's blocks from all ancestors below their new parents.
+ for (Loop::block_iterator BI = Unloop->block_begin(),
+ BE = Unloop->block_end(); BI != BE; ++BI) {
+ Loop *NewParent = LI->getLoopFor(*BI);
+ // If this block is an immediate subloop, remove all blocks (including
+ // nested subloops) from ancestors below the new parent loop.
+ // Otherwise, if this block is in a nested subloop, skip it.
+ if (SubloopParents.count(NewParent))
+ NewParent = SubloopParents[NewParent];
+ else if (Unloop->contains(NewParent))
+ continue;
+
+ // Remove blocks from former Ancestors except Unloop itself which will be
+ // deleted.
+ for (Loop *OldParent = Unloop->getParentLoop(); OldParent != NewParent;
+ OldParent = OldParent->getParentLoop()) {
+ assert(OldParent && "new loop is not an ancestor of the original");
+ OldParent->removeBlockFromLoop(*BI);
+ }
+ }
+}
+
+/// updateSubloopParents - Update the parent loop for all subloops directly
+/// nested within unloop.
+void UnloopUpdater::updateSubloopParents() {
+ while (!Unloop->empty()) {
+ Loop *Subloop = *llvm::prior(Unloop->end());
+ Unloop->removeChildLoop(llvm::prior(Unloop->end()));
+
+ assert(SubloopParents.count(Subloop) && "DFS failed to visit subloop");
+ if (SubloopParents[Subloop])
+ SubloopParents[Subloop]->addChildLoop(Subloop);
+ else
+ LI->addTopLevelLoop(Subloop);
+ }
+}
+
+/// getNearestLoop - Return the nearest parent loop among this block's
+/// successors. If a successor is a subloop header, consider its parent to be
+/// the nearest parent of the subloop's exits.
+///
+/// For subloop blocks, simply update SubloopParents and return NULL.
+Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
+
+ // Initially for blocks directly contained by Unloop, NearLoop == Unloop and
+ // is considered uninitialized.
+ Loop *NearLoop = BBLoop;
+
+ Loop *Subloop = 0;
+ if (NearLoop != Unloop && Unloop->contains(NearLoop)) {
+ Subloop = NearLoop;
+ // Find the subloop ancestor that is directly contained within Unloop.
+ while (Subloop->getParentLoop() != Unloop) {
+ Subloop = Subloop->getParentLoop();
+ assert(Subloop && "subloop is not an ancestor of the original loop");
+ }
+ // Get the current nearest parent of the Subloop exits, initially Unloop.
+ if (!SubloopParents.count(Subloop))
+ SubloopParents[Subloop] = Unloop;
+ NearLoop = SubloopParents[Subloop];
+ }
+
+ succ_iterator I = succ_begin(BB), E = succ_end(BB);
+ if (I == E) {
+ assert(!Subloop && "subloop blocks must have a successor");
+ NearLoop = 0; // unloop blocks may now exit the function.
+ }
+ for (; I != E; ++I) {
+ if (*I == BB)
+ continue; // self loops are uninteresting
+
+ Loop *L = LI->getLoopFor(*I);
+ if (L == Unloop) {
+ // This successor has not been processed. This path must lead to an
+ // irreducible backedge.
+ assert((FoundIB || !DFS.hasPostorder(*I)) && "should have seen IB");
+ FoundIB = true;
+ }
+ if (L != Unloop && Unloop->contains(L)) {
+ // Successor is in a subloop.
+ if (Subloop)
+ continue; // Branching within subloops. Ignore it.
+
+ // BB branches from the original into a subloop header.
+ assert(L->getParentLoop() == Unloop && "cannot skip into nested loops");
+
+ // Get the current nearest parent of the Subloop's exits.
+ L = SubloopParents[L];
+ // L could be Unloop if the only exit was an irreducible backedge.
+ }
+ if (L == Unloop) {
+ continue;
+ }
+ // Handle critical edges from Unloop into a sibling loop.
+ if (L && !L->contains(Unloop)) {
+ L = L->getParentLoop();
+ }
+ // Remember the nearest parent loop among successors or subloop exits.
+ if (NearLoop == Unloop || !NearLoop || NearLoop->contains(L))
+ NearLoop = L;
+ }
+ if (Subloop) {
+ SubloopParents[Subloop] = NearLoop;
+ return BBLoop;
+ }
+ return NearLoop;
+}
+
+//===----------------------------------------------------------------------===//
// LoopInfo implementation
//
bool LoopInfo::runOnFunction(Function &) {
@@ -391,6 +594,68 @@ bool LoopInfo::runOnFunction(Function &) {
return false;
}
+/// updateUnloop - The last backedge has been removed from a loop--now the
+/// "unloop". Find a new parent for the blocks contained within unloop and
+/// update the loop tree. We don't necessarily have valid dominators at this
+/// point, but LoopInfo is still valid except for the removal of this loop.
+///
+/// Note that Unloop may now be an empty loop. Calling Loop::getHeader without
+/// checking first is illegal.
+void LoopInfo::updateUnloop(Loop *Unloop) {
+
+ // First handle the special case of no parent loop to simplify the algorithm.
+ if (!Unloop->getParentLoop()) {
+ // Since Unloop had no parent, its blocks are no longer in any loop.
+ for (Loop::block_iterator I = Unloop->block_begin(),
+ E = Unloop->block_end(); I != E; ++I) {
+
+ // Don't reparent blocks in subloops.
+ if (getLoopFor(*I) != Unloop)
+ continue;
+
+ // Blocks no longer have a parent but are still referenced by Unloop until
+ // the Unloop object is deleted.
+ LI.changeLoopFor(*I, 0);
+ }
+
+ // Remove the loop from the top-level LoopInfo object.
+ for (LoopInfo::iterator I = LI.begin();; ++I) {
+ assert(I != LI.end() && "Couldn't find loop");
+ if (*I == Unloop) {
+ LI.removeLoop(I);
+ break;
+ }
+ }
+
+ // Move all of the subloops to the top-level.
+ while (!Unloop->empty())
+ LI.addTopLevelLoop(Unloop->removeChildLoop(llvm::prior(Unloop->end())));
+
+ return;
+ }
+
+ // Update the parent loop for all blocks within the loop. Blocks within
+ // subloops will not change parents.
+ UnloopUpdater Updater(Unloop, this);
+ Updater.updateBlockParents();
+
+ // Remove blocks from former ancestor loops.
+ Updater.removeBlocksFromAncestors();
+
+ // Add direct subloops as children in their new parent loop.
+ Updater.updateSubloopParents();
+
+ // Remove unloop from its parent loop.
+ Loop *ParentLoop = Unloop->getParentLoop();
+ for (Loop::iterator I = ParentLoop->begin();; ++I) {
+ assert(I != ParentLoop->end() && "Couldn't find loop");
+ if (*I == Unloop) {
+ ParentLoop->removeChildLoop(I);
+ break;
+ }
+ }
+}
+
void LoopInfo::verifyAnalysis() const {
// LoopInfo is a FunctionPass, but verifying every loop in the function
// each time verifyAnalysis is called is very expensive. The
@@ -400,12 +665,21 @@ void LoopInfo::verifyAnalysis() const {
if (!VerifyLoopInfo) return;
+ DenseSet<const Loop*> Loops;
for (iterator I = begin(), E = end(); I != E; ++I) {
assert(!(*I)->getParentLoop() && "Top-level loop has a parent!");
- (*I)->verifyLoopNest();
+ (*I)->verifyLoopNest(&Loops);
}
- // TODO: check BBMap consistency.
+ // Verify that blocks are mapped to valid loops.
+ //
+ // FIXME: With an up-to-date DFS (see LoopIterator.h) and DominatorTree, we
+ // could also verify that the blocks are still in the correct loops.
+ for (DenseMap<BasicBlock*, Loop*>::const_iterator I = LI.BBMap.begin(),
+ E = LI.BBMap.end(); I != E; ++I) {
+ assert(Loops.count(I->second) && "orphaned loop");
+ assert(I->second->contains(I->first) && "orphaned block");
+ }
}
void LoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -417,3 +691,15 @@ void LoopInfo::print(raw_ostream &OS, const Module*) const {
LI.print(OS);
}
+//===----------------------------------------------------------------------===//
+// LoopBlocksDFS implementation
+//
+
+/// Traverse the loop blocks and store the DFS result.
+/// Useful for clients that just want the final DFS result and don't need to
+/// visit blocks during the initial traversal.
+void LoopBlocksDFS::perform(LoopInfo *LI) {
+ LoopBlocksTraversal Traversal(*this, LI);
+ for (LoopBlocksTraversal::POTIterator POI = Traversal.begin(),
+ POE = Traversal.end(); POI != POE; ++POI) ;
+}
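
A hedged sketch of the intended client pattern, using only the members that
appear in this patch (the LoopBlocksDFS constructor, perform, and the
postorder iterators):

    #include "llvm/Analysis/LoopIterator.h"
    using namespace llvm;

    void visitLoopPostorder(Loop *L, LoopInfo *LI) {
      LoopBlocksDFS DFS(L);
      DFS.perform(LI);  // run the traversal once; the order is cached
      for (LoopBlocksDFS::POIterator I = DFS.beginPostorder(),
           E = DFS.endPostorder(); I != E; ++I) {
        BasicBlock *BB = *I;
        (void)BB;  // process the loop's blocks in CFG post order
      }
    }
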
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 10e3f29..5ba1f40 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -59,9 +59,9 @@ char PrintLoopPass::ID = 0;
static DebugInfoProbeInfo *TheDebugProbe;
static void createDebugInfoProbe() {
if (TheDebugProbe) return;
-
- // Constructed the first time this is called. This guarantees that the
- // object will be constructed, if -enable-debug-info-probe is set,
+
+ // Constructed the first time this is called. This guarantees that the
+ // object will be constructed, if -enable-debug-info-probe is set,
// before static globals, thus it will be destroyed before them.
static ManagedStatic<DebugInfoProbeInfo> DIP;
TheDebugProbe = &*DIP;
@@ -73,73 +73,29 @@ static void createDebugInfoProbe() {
char LPPassManager::ID = 0;
-LPPassManager::LPPassManager(int Depth)
- : FunctionPass(ID), PMDataManager(Depth) {
+LPPassManager::LPPassManager()
+ : FunctionPass(ID), PMDataManager() {
skipThisLoop = false;
redoThisLoop = false;
LI = NULL;
CurrentLoop = NULL;
}
-/// Delete loop from the loop queue and loop hierarchy (LoopInfo).
+/// Delete loop from the loop queue and loop hierarchy (LoopInfo).
void LPPassManager::deleteLoopFromQueue(Loop *L) {
- if (Loop *ParentLoop = L->getParentLoop()) { // Not a top-level loop.
- // Reparent all of the blocks in this loop. Since BBLoop had a parent,
- // they are now all in it.
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I)
- if (LI->getLoopFor(*I) == L) // Don't change blocks in subloops.
- LI->changeLoopFor(*I, ParentLoop);
-
- // Remove the loop from its parent loop.
- for (Loop::iterator I = ParentLoop->begin(), E = ParentLoop->end();;
- ++I) {
- assert(I != E && "Couldn't find loop");
- if (*I == L) {
- ParentLoop->removeChildLoop(I);
- break;
- }
- }
-
- // Move all subloops into the parent loop.
- while (!L->empty())
- ParentLoop->addChildLoop(L->removeChildLoop(L->end()-1));
- } else {
- // Reparent all of the blocks in this loop. Since BBLoop had no parent,
- // they no longer in a loop at all.
-
- for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
- // Don't change blocks in subloops.
- if (LI->getLoopFor(L->getBlocks()[i]) == L) {
- LI->removeBlock(L->getBlocks()[i]);
- --i;
- }
- }
-
- // Remove the loop from the top-level LoopInfo object.
- for (LoopInfo::iterator I = LI->begin(), E = LI->end();; ++I) {
- assert(I != E && "Couldn't find loop");
- if (*I == L) {
- LI->removeLoop(I);
- break;
- }
- }
-
- // Move all of the subloops to the top-level.
- while (!L->empty())
- LI->addTopLevelLoop(L->removeChildLoop(L->end()-1));
- }
-
- delete L;
+ LI->updateUnloop(L);
// If L is current loop then skip rest of the passes and let
// runOnFunction remove L from LQ. Otherwise, remove L from LQ now
// and continue applying other passes on CurrentLoop.
- if (CurrentLoop == L) {
+ if (CurrentLoop == L)
skipThisLoop = true;
+
+ delete L;
+
+ if (skipThisLoop)
return;
- }
for (std::deque<Loop *>::iterator I = LQ.begin(),
E = LQ.end(); I != E; ++I) {
@@ -166,10 +122,10 @@ void LPPassManager::insertLoop(Loop *L, Loop *ParentLoop) {
void LPPassManager::insertLoopIntoQueue(Loop *L) {
// Insert L into loop queue
- if (L == CurrentLoop)
+ if (L == CurrentLoop)
redoLoop(L);
else if (!L->getParentLoop())
- // This is top level loop.
+ // This is top level loop.
LQ.push_front(L);
else {
// Insert L after the parent loop.
@@ -195,9 +151,9 @@ void LPPassManager::redoLoop(Loop *L) {
/// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for
/// all loop passes.
-void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From,
+void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From,
BasicBlock *To, Loop *L) {
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
LoopPass *LP = getContainedPass(Index);
LP->cloneBasicBlockAnalysis(From, To, L);
}
@@ -206,13 +162,13 @@ void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From,
/// deleteSimpleAnalysisValue - Invoke deleteAnalysisValue hook for all passes.
void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) {
if (BasicBlock *BB = dyn_cast<BasicBlock>(V)) {
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;
++BI) {
Instruction &I = *BI;
deleteSimpleAnalysisValue(&I, L);
}
}
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
LoopPass *LP = getContainedPass(Index);
LP->deleteAnalysisValue(V, L);
}
@@ -228,7 +184,7 @@ static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) {
/// Pass Manager itself does not invalidate any analysis info.
void LPPassManager::getAnalysisUsage(AnalysisUsage &Info) const {
- // LPPassManager needs LoopInfo. In the long term LoopInfo class will
+ // LPPassManager needs LoopInfo. In the long term LoopInfo class will
// become part of LPPassManager.
Info.addRequired<LoopInfo>();
Info.setPreservesAll();
@@ -255,7 +211,7 @@ bool LPPassManager::runOnFunction(Function &F) {
for (std::deque<Loop *>::const_iterator I = LQ.begin(), E = LQ.end();
I != E; ++I) {
Loop *L = *I;
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
LoopPass *P = getContainedPass(Index);
Changed |= P->doInitialization(L, *this);
}
@@ -263,13 +219,13 @@ bool LPPassManager::runOnFunction(Function &F) {
// Walk Loops
while (!LQ.empty()) {
-
+
CurrentLoop = LQ.back();
skipThisLoop = false;
redoThisLoop = false;
// Run all passes on the current Loop.
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
LoopPass *P = getContainedPass(Index);
dumpPassInfo(P, EXECUTION_MSG, ON_LOOP_MSG,
CurrentLoop->getHeader()->getName());
@@ -319,23 +275,23 @@ bool LPPassManager::runOnFunction(Function &F) {
// Do not run other passes on this loop.
break;
}
-
+
// If the loop was deleted, release all the loop passes. This frees up
// some memory, and avoids trouble with the pass manager trying to call
// verifyAnalysis on them.
if (skipThisLoop)
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
Pass *P = getContainedPass(Index);
freePass(P, "<deleted>", ON_LOOP_MSG);
}
// Pop the loop from queue after running all passes.
LQ.pop_back();
-
+
if (redoThisLoop)
LQ.push_back(CurrentLoop);
}
-
+
// Finalization
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
LoopPass *P = getContainedPass(Index);
@@ -372,7 +328,7 @@ Pass *LoopPass::createPrinterPass(raw_ostream &O,
// LPPassManger as expected.
void LoopPass::preparePassManager(PMStack &PMS) {
- // Find LPPassManager
+ // Find LPPassManager
while (!PMS.empty() &&
PMS.top()->getPassManagerType() > PMT_LoopPassManager)
PMS.pop();
@@ -381,14 +337,14 @@ void LoopPass::preparePassManager(PMStack &PMS) {
// by other passes that are managed by LPM then do not insert
// this pass in current LPM. Use new LPPassManager.
if (PMS.top()->getPassManagerType() == PMT_LoopPassManager &&
- !PMS.top()->preserveHigherLevelAnalysis(this))
+ !PMS.top()->preserveHigherLevelAnalysis(this))
PMS.pop();
}
/// Assign pass manager to manage this pass.
void LoopPass::assignPassManager(PMStack &PMS,
PassManagerType PreferredType) {
- // Find LPPassManager
+ // Find LPPassManager
while (!PMS.empty() &&
PMS.top()->getPassManagerType() > PMT_LoopPassManager)
PMS.pop();
@@ -397,12 +353,12 @@ void LoopPass::assignPassManager(PMStack &PMS,
if (PMS.top()->getPassManagerType() == PMT_LoopPassManager)
LPPM = (LPPassManager*)PMS.top();
else {
- // Create new Loop Pass Manager if it does not exist.
+ // Create new Loop Pass Manager if it does not exist.
assert (!PMS.empty() && "Unable to create Loop Pass Manager");
PMDataManager *PMD = PMS.top();
- // [1] Create new Call Graph Pass Manager
- LPPM = new LPPassManager(PMD->getDepth() + 1);
+ // [1] Create new Loop Pass Manager
+ LPPM = new LPPassManager();
LPPM->populateInheritedAnalysis(PMS);
// [2] Set up new manager's top level manager
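
The ordering that deleteLoopFromQueue now follows (update LoopInfo first, free
the Loop object afterwards) generalizes to any transform that removes a loop's
last backedge. A hedged sketch of that contract, with a hypothetical helper
name:

    #include "llvm/Analysis/LoopInfo.h"
    using namespace llvm;

    void eraseDeadLoop(Loop *L, LoopInfo *LI) {
      // Reparent L's blocks and subloops while the Loop object is intact...
      LI->updateUnloop(L);
      // ...and only then delete it, exactly as LPPassManager does above.
      delete L;
    }
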
diff --git a/lib/Analysis/MemDepPrinter.cpp b/lib/Analysis/MemDepPrinter.cpp
index 2283db0..fde07ea 100644
--- a/lib/Analysis/MemDepPrinter.cpp
+++ b/lib/Analysis/MemDepPrinter.cpp
@@ -25,8 +25,17 @@ namespace {
struct MemDepPrinter : public FunctionPass {
const Function *F;
- typedef PointerIntPair<const Instruction *, 1> InstAndClobberFlag;
- typedef std::pair<InstAndClobberFlag, const BasicBlock *> Dep;
+ enum DepType {
+ Clobber = 0,
+ Def,
+ NonFuncLocal,
+ Unknown
+ };
+
+ static const char* DepTypeStr[];
+
+ typedef PointerIntPair<const Instruction *, 2, DepType> InstTypePair;
+ typedef std::pair<InstTypePair, const BasicBlock *> Dep;
typedef SmallSetVector<Dep, 4> DepSet;
typedef DenseMap<const Instruction *, DepSet> DepSetMap;
DepSetMap Deps;
@@ -50,6 +59,21 @@ namespace {
Deps.clear();
F = 0;
}
+
+ private:
+ static InstTypePair getInstTypePair(MemDepResult dep) {
+ if (dep.isClobber())
+ return InstTypePair(dep.getInst(), Clobber);
+ if (dep.isDef())
+ return InstTypePair(dep.getInst(), Def);
+ if (dep.isNonFuncLocal())
+ return InstTypePair(dep.getInst(), NonFuncLocal);
+ assert(dep.isUnknown() && "unexpected dependence type");
+ return InstTypePair(dep.getInst(), Unknown);
+ }
+ static InstTypePair getInstTypePair(const Instruction* inst, DepType type) {
+ return InstTypePair(inst, type);
+ }
};
}
@@ -64,6 +88,9 @@ FunctionPass *llvm::createMemDepPrinter() {
return new MemDepPrinter();
}
+const char* MemDepPrinter::DepTypeStr[]
+ = {"Clobber", "Def", "NonFuncLocal", "Unknown"};
+
bool MemDepPrinter::runOnFunction(Function &F) {
this->F = &F;
AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
@@ -79,10 +106,7 @@ bool MemDepPrinter::runOnFunction(Function &F) {
MemDepResult Res = MDA.getDependency(Inst);
if (!Res.isNonLocal()) {
- assert((Res.isUnknown() || Res.isClobber() || Res.isDef()) &&
- "Local dep should be unknown, def or clobber!");
- Deps[Inst].insert(std::make_pair(InstAndClobberFlag(Res.getInst(),
- Res.isClobber()),
+ Deps[Inst].insert(std::make_pair(getInstTypePair(Res),
static_cast<BasicBlock *>(0)));
} else if (CallSite CS = cast<Value>(Inst)) {
const MemoryDependenceAnalysis::NonLocalDepInfo &NLDI =
@@ -92,22 +116,26 @@ bool MemDepPrinter::runOnFunction(Function &F) {
for (MemoryDependenceAnalysis::NonLocalDepInfo::const_iterator
I = NLDI.begin(), E = NLDI.end(); I != E; ++I) {
const MemDepResult &Res = I->getResult();
- assert((Res.isUnknown() || Res.isClobber() || Res.isDef()) &&
- "Resolved non-local call dep should be unknown, def or "
- "clobber!");
- InstDeps.insert(std::make_pair(InstAndClobberFlag(Res.getInst(),
- Res.isClobber()),
- I->getBB()));
+ InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB()));
}
} else {
SmallVector<NonLocalDepResult, 4> NLDI;
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- // FIXME: Volatile is not handled properly here.
+ if (!LI->isUnordered()) {
+ // FIXME: Handle atomic/volatile loads.
+ Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown),
+ static_cast<BasicBlock *>(0)));
+ continue;
+ }
AliasAnalysis::Location Loc = AA.getLocation(LI);
- MDA.getNonLocalPointerDependency(Loc, !LI->isVolatile(),
- LI->getParent(), NLDI);
+ MDA.getNonLocalPointerDependency(Loc, true, LI->getParent(), NLDI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- // FIXME: Volatile is not handled properly here.
+ if (!SI->isUnordered()) {
+ // FIXME: Handle atomic/volatile stores.
+ Deps[Inst].insert(std::make_pair(getInstTypePair(0, Unknown),
+ static_cast<BasicBlock *>(0)));
+ continue;
+ }
AliasAnalysis::Location Loc = AA.getLocation(SI);
MDA.getNonLocalPointerDependency(Loc, false, SI->getParent(), NLDI);
} else if (VAArgInst *VI = dyn_cast<VAArgInst>(Inst)) {
@@ -121,11 +149,7 @@ bool MemDepPrinter::runOnFunction(Function &F) {
for (SmallVectorImpl<NonLocalDepResult>::const_iterator
I = NLDI.begin(), E = NLDI.end(); I != E; ++I) {
const MemDepResult &Res = I->getResult();
- assert(Res.isClobber() != Res.isDef() &&
- "Resolved non-local pointer dep should be def or clobber!");
- InstDeps.insert(std::make_pair(InstAndClobberFlag(Res.getInst(),
- Res.isClobber()),
- I->getBB()));
+ InstDeps.insert(std::make_pair(getInstTypePair(Res), I->getBB()));
}
}
}
@@ -146,26 +170,18 @@ void MemDepPrinter::print(raw_ostream &OS, const Module *M) const {
for (DepSet::const_iterator I = InstDeps.begin(), E = InstDeps.end();
I != E; ++I) {
const Instruction *DepInst = I->first.getPointer();
- bool isClobber = I->first.getInt();
+ DepType type = I->first.getInt();
const BasicBlock *DepBB = I->second;
OS << " ";
- if (!DepInst)
- OS << "Unknown";
- else if (isClobber)
- OS << "Clobber";
- else
- OS << " Def";
+ OS << DepTypeStr[type];
if (DepBB) {
OS << " in block ";
WriteAsOperand(OS, DepBB, /*PrintType=*/false, M);
}
if (DepInst) {
OS << " from: ";
- if (DepInst == Inst)
- OS << "<unspecified>";
- else
- DepInst->print(OS);
+ DepInst->print(OS);
}
OS << "\n";
}
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 53d4304..8d451c4 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -47,7 +47,7 @@ static bool isMallocCall(const CallInst *CI) {
// Check malloc prototype.
// FIXME: workaround for PR5130, this will be obsolete when a nobuiltin
// attribute will exist.
- const FunctionType *FTy = Callee->getFunctionType();
+ FunctionType *FTy = Callee->getFunctionType();
if (FTy->getNumParams() != 1)
return false;
return FTy->getParamType(0)->isIntegerTy(32) ||
@@ -94,12 +94,12 @@ static Value *computeArraySize(const CallInst *CI, const TargetData *TD,
return NULL;
// The size of the malloc's result type must be known to determine array size.
- const Type *T = getMallocAllocatedType(CI);
+ Type *T = getMallocAllocatedType(CI);
if (!T || !T->isSized() || !TD)
return NULL;
unsigned ElementSize = TD->getTypeAllocSize(T);
- if (const StructType *ST = dyn_cast<StructType>(T))
+ if (StructType *ST = dyn_cast<StructType>(T))
ElementSize = TD->getStructLayout(ST)->getSizeInBytes();
// If malloc call's arg can be determined to be a multiple of ElementSize,
@@ -133,10 +133,10 @@ const CallInst *llvm::isArrayMalloc(const Value *I, const TargetData *TD) {
/// 0: PointerType is the calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-const PointerType *llvm::getMallocType(const CallInst *CI) {
+PointerType *llvm::getMallocType(const CallInst *CI) {
assert(isMalloc(CI) && "getMallocType and not malloc call");
- const PointerType *MallocType = NULL;
+ PointerType *MallocType = NULL;
unsigned NumOfBitCastUses = 0;
// Determine if CallInst has a bitcast use.
@@ -164,8 +164,8 @@ const PointerType *llvm::getMallocType(const CallInst *CI) {
/// 0: PointerType is the malloc calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-const Type *llvm::getMallocAllocatedType(const CallInst *CI) {
- const PointerType *PT = getMallocType(CI);
+Type *llvm::getMallocAllocatedType(const CallInst *CI) {
+ PointerType *PT = getMallocType(CI);
return PT ? PT->getElementType() : NULL;
}
@@ -201,7 +201,7 @@ const CallInst *llvm::isFreeCall(const Value *I) {
// Check free prototype.
// FIXME: workaround for PR5130, this will be obsolete when a nobuiltin
// attribute will exist.
- const FunctionType *FTy = Callee->getFunctionType();
+ FunctionType *FTy = Callee->getFunctionType();
if (!FTy->getReturnType()->isVoidTy())
return 0;
if (FTy->getNumParams() != 1)
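
A hedged sketch of the de-constified helpers in use; isMalloc and both getters
live in MemoryBuiltins.h, and the point of dropping const is that callers can
now mutate the returned types without a cast:

    #include "llvm/Analysis/MemoryBuiltins.h"
    using namespace llvm;

    Type *allocatedTypeOf(const CallInst *CI) {
      if (!isMalloc(CI))
        return 0;
      // Null if no unique bitcast pins down the malloc'd type.
      return getMallocAllocatedType(CI);
    }
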
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index bba4482..92967c0 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -120,21 +120,27 @@ AliasAnalysis::ModRefResult GetLocation(const Instruction *Inst,
AliasAnalysis::Location &Loc,
AliasAnalysis *AA) {
if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- if (LI->isVolatile()) {
- Loc = AliasAnalysis::Location();
+ if (LI->isUnordered()) {
+ Loc = AA->getLocation(LI);
+ return AliasAnalysis::Ref;
+ } else if (LI->getOrdering() == Monotonic) {
+ Loc = AA->getLocation(LI);
return AliasAnalysis::ModRef;
}
- Loc = AA->getLocation(LI);
- return AliasAnalysis::Ref;
+ Loc = AliasAnalysis::Location();
+ return AliasAnalysis::ModRef;
}
if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->isVolatile()) {
- Loc = AliasAnalysis::Location();
+ if (SI->isUnordered()) {
+ Loc = AA->getLocation(SI);
+ return AliasAnalysis::Mod;
+ } else if (SI->getOrdering() == Monotonic) {
+ Loc = AA->getLocation(SI);
return AliasAnalysis::ModRef;
}
- Loc = AA->getLocation(SI);
- return AliasAnalysis::Mod;
+ Loc = AliasAnalysis::Location();
+ return AliasAnalysis::ModRef;
}
if (const VAArgInst *V = dyn_cast<VAArgInst>(Inst)) {
@@ -232,7 +238,7 @@ getCallSiteDependencyFrom(CallSite CS, bool isReadOnlyCall,
// unknown, otherwise it is non-local.
if (BB != &BB->getParent()->getEntryBlock())
return MemDepResult::getNonLocal();
- return MemDepResult::getUnknown();
+ return MemDepResult::getNonFuncLocal();
}
/// isLoadLoadClobberIfExtendedToFullWidth - Return true if LI is a load that
@@ -270,8 +276,8 @@ unsigned MemoryDependenceAnalysis::
getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
unsigned MemLocSize, const LoadInst *LI,
const TargetData &TD) {
- // We can only extend non-volatile integer loads.
- if (!isa<IntegerType>(LI->getType()) || LI->isVolatile()) return 0;
+ // We can only extend simple integer loads.
+ if (!isa<IntegerType>(LI->getType()) || !LI->isSimple()) return 0;
// Get the base of this load.
int64_t LIOffs = 0;
@@ -369,6 +375,11 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
// Values depend on loads if the pointers are must aliased. This means that
// a load depends on another must aliased load from the same value.
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ // Atomic loads have complications this walk does not try to model.
+ // FIXME: This is overly conservative.
+ if (!LI->isUnordered())
+ return MemDepResult::getClobber(LI);
+
AliasAnalysis::Location LoadLoc = AA->getLocation(LI);
// If we found a pointer, check if it could be the same as our pointer.
@@ -382,7 +393,7 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
// location is 1 byte at P+1). If so, return it as a load/load
// clobber result, allowing the client to decide to widen the load if
// it wants to.
- if (const IntegerType *ITy = dyn_cast<IntegerType>(LI->getType()))
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType()))
if (LI->getAlignment()*8 > ITy->getPrimitiveSizeInBits() &&
isLoadLoadClobberIfExtendedToFullWidth(MemLoc, MemLocBase,
MemLocOffset, LI, TD))
@@ -424,6 +435,11 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
}
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // Atomic stores have complications this walk does not try to model.
+ // FIXME: This is overly conservative.
+ if (!SI->isUnordered())
+ return MemDepResult::getClobber(SI);
+
// If alias analysis can tell that this store is guaranteed to not modify
// the query pointer, ignore it. Use getModRefInfo to handle cases where
// the query pointer points to constant memory etc.
@@ -483,7 +499,7 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
// unknown, otherwise it is non-local.
if (BB != &BB->getParent()->getEntryBlock())
return MemDepResult::getNonLocal();
- return MemDepResult::getUnknown();
+ return MemDepResult::getNonFuncLocal();
}
/// getDependency - Return the instruction on which a memory operation
@@ -516,7 +532,7 @@ MemDepResult MemoryDependenceAnalysis::getDependency(Instruction *QueryInst) {
if (QueryParent != &QueryParent->getParent()->getEntryBlock())
LocalCache = MemDepResult::getNonLocal();
else
- LocalCache = MemDepResult::getUnknown();
+ LocalCache = MemDepResult::getNonFuncLocal();
} else {
AliasAnalysis::Location MemLoc;
AliasAnalysis::ModRefResult MR = GetLocation(QueryInst, MemLoc, AA);
@@ -672,7 +688,7 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
// a clobber, otherwise it is unknown.
Dep = MemDepResult::getNonLocal();
} else {
- Dep = MemDepResult::getUnknown();
+ Dep = MemDepResult::getNonFuncLocal();
}
// If we had a dirty entry for the block, update it. Otherwise, just add
@@ -790,7 +806,7 @@ GetNonLocalInfoForBlock(const AliasAnalysis::Location &Loc,
// If the block has a dependency (i.e. it isn't completely transparent to
// the value), remember the reverse association because we just added it
// to Cache!
- if (Dep.isNonLocal() || Dep.isUnknown())
+ if (!Dep.isDef() && !Dep.isClobber())
return Dep;
// Keep the ReverseNonLocalPtrDeps map up to date so we can efficiently
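
With getNonFuncLocal split out of getUnknown, a client now sees four terminal
answers plus the non-local marker. A hedged sketch using only predicates that
appear in this patch:

    #include "llvm/Analysis/MemoryDependenceAnalysis.h"
    using namespace llvm;

    const char *classify(const MemDepResult &R) {
      if (R.isClobber())      return "clobbered by R.getInst()";
      if (R.isDef())          return "defined by R.getInst()";
      if (R.isNonLocal())     return "keep searching in predecessor blocks";
      if (R.isNonFuncLocal()) return "no dependence anywhere in this function";
      return "unknown: the analysis gave up";
    }
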
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index 70dcd0d..7e22ddc 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -228,7 +228,7 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
return GEP;
// Simplify the GEP to handle 'gep x, 0' -> x etc.
- if (Value *V = SimplifyGEPInst(&GEPOps[0], GEPOps.size(), TD, DT)) {
+ if (Value *V = SimplifyGEPInst(GEPOps, TD, DT)) {
for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
RemoveInstInputs(GEPOps[i], InstInputs);
@@ -407,9 +407,9 @@ InsertPHITranslatedSubExpr(Value *InVal, BasicBlock *CurBB,
}
GetElementPtrInst *Result =
- GetElementPtrInst::Create(GEPOps[0], GEPOps.begin()+1, GEPOps.end(),
- InVal->getName()+".phi.trans.insert",
- PredBB->getTerminator());
+ GetElementPtrInst::Create(GEPOps[0], makeArrayRef(GEPOps).slice(1),
+ InVal->getName()+".phi.trans.insert",
+ PredBB->getTerminator());
Result->setIsInBounds(GEP->isInBounds());
NewInsts.push_back(Result);
return Result;
diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp
index 7c584da..0e3b6e6 100644
--- a/lib/Analysis/PathNumbering.cpp
+++ b/lib/Analysis/PathNumbering.cpp
@@ -387,7 +387,7 @@ void BallLarusDag::buildNode(BLBlockNodeMap& inDag, BLNodeStack& dfsStack) {
TerminatorInst* terminator = currentNode->getBlock()->getTerminator();
if(isa<ReturnInst>(terminator) || isa<UnreachableInst>(terminator)
- || isa<UnwindInst>(terminator))
+ || isa<ResumeInst>(terminator) || isa<UnwindInst>(terminator))
addEdge(currentNode, getExit(),0);
currentNode->setColor(BallLarusNode::GRAY);
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 80eda79..3a3529b 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -27,8 +27,8 @@ using namespace llvm;
char RGPassManager::ID = 0;
-RGPassManager::RGPassManager(int Depth)
- : FunctionPass(ID), PMDataManager(Depth) {
+RGPassManager::RGPassManager()
+ : FunctionPass(ID), PMDataManager() {
skipThisRegion = false;
redoThisRegion = false;
RI = NULL;
@@ -250,7 +250,7 @@ void RegionPass::assignPassManager(PMStack &PMS,
PMDataManager *PMD = PMS.top();
// [1] Create new Region Pass Manager
- RGPM = new RGPassManager(PMD->getDepth() + 1);
+ RGPM = new RGPassManager();
RGPM->populateInheritedAnalysis(PMS);
// [2] Set up new manager's top level manager
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 025718e..e0ac56c 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -197,7 +197,7 @@ void SCEV::print(raw_ostream &OS) const {
}
case scUnknown: {
const SCEVUnknown *U = cast<SCEVUnknown>(this);
- const Type *AllocTy;
+ Type *AllocTy;
if (U->isSizeOf(AllocTy)) {
OS << "sizeof(" << *AllocTy << ")";
return;
@@ -207,7 +207,7 @@ void SCEV::print(raw_ostream &OS) const {
return;
}
- const Type *CTy;
+ Type *CTy;
Constant *FieldNo;
if (U->isOffsetOf(CTy, FieldNo)) {
OS << "offsetof(" << *CTy << ", ";
@@ -228,7 +228,7 @@ void SCEV::print(raw_ostream &OS) const {
llvm_unreachable("Unknown SCEV kind!");
}
-const Type *SCEV::getType() const {
+Type *SCEV::getType() const {
switch (getSCEVType()) {
case scConstant:
return cast<SCEVConstant>(this)->getType();
@@ -297,17 +297,17 @@ const SCEV *ScalarEvolution::getConstant(const APInt& Val) {
}
const SCEV *
-ScalarEvolution::getConstant(const Type *Ty, uint64_t V, bool isSigned) {
- const IntegerType *ITy = cast<IntegerType>(getEffectiveSCEVType(Ty));
+ScalarEvolution::getConstant(Type *Ty, uint64_t V, bool isSigned) {
+ IntegerType *ITy = cast<IntegerType>(getEffectiveSCEVType(Ty));
return getConstant(ConstantInt::get(ITy, V, isSigned));
}
SCEVCastExpr::SCEVCastExpr(const FoldingSetNodeIDRef ID,
- unsigned SCEVTy, const SCEV *op, const Type *ty)
+ unsigned SCEVTy, const SCEV *op, Type *ty)
: SCEV(ID, SCEVTy), Op(op), Ty(ty) {}
SCEVTruncateExpr::SCEVTruncateExpr(const FoldingSetNodeIDRef ID,
- const SCEV *op, const Type *ty)
+ const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scTruncate, op, ty) {
assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
@@ -315,7 +315,7 @@ SCEVTruncateExpr::SCEVTruncateExpr(const FoldingSetNodeIDRef ID,
}
SCEVZeroExtendExpr::SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID,
- const SCEV *op, const Type *ty)
+ const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scZeroExtend, op, ty) {
assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
@@ -323,7 +323,7 @@ SCEVZeroExtendExpr::SCEVZeroExtendExpr(const FoldingSetNodeIDRef ID,
}
SCEVSignExtendExpr::SCEVSignExtendExpr(const FoldingSetNodeIDRef ID,
- const SCEV *op, const Type *ty)
+ const SCEV *op, Type *ty)
: SCEVCastExpr(ID, scSignExtend, op, ty) {
assert((Op->getType()->isIntegerTy() || Op->getType()->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
@@ -354,7 +354,7 @@ void SCEVUnknown::allUsesReplacedWith(Value *New) {
setValPtr(New);
}
-bool SCEVUnknown::isSizeOf(const Type *&AllocTy) const {
+bool SCEVUnknown::isSizeOf(Type *&AllocTy) const {
if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
if (VCE->getOpcode() == Instruction::PtrToInt)
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
@@ -371,15 +371,15 @@ bool SCEVUnknown::isSizeOf(const Type *&AllocTy) const {
return false;
}
-bool SCEVUnknown::isAlignOf(const Type *&AllocTy) const {
+bool SCEVUnknown::isAlignOf(Type *&AllocTy) const {
if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
if (VCE->getOpcode() == Instruction::PtrToInt)
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
if (CE->getOpcode() == Instruction::GetElementPtr &&
CE->getOperand(0)->isNullValue()) {
- const Type *Ty =
+ Type *Ty =
cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
- if (const StructType *STy = dyn_cast<StructType>(Ty))
+ if (StructType *STy = dyn_cast<StructType>(Ty))
if (!STy->isPacked() &&
CE->getNumOperands() == 3 &&
CE->getOperand(1)->isNullValue()) {
@@ -396,7 +396,7 @@ bool SCEVUnknown::isAlignOf(const Type *&AllocTy) const {
return false;
}
-bool SCEVUnknown::isOffsetOf(const Type *&CTy, Constant *&FieldNo) const {
+bool SCEVUnknown::isOffsetOf(Type *&CTy, Constant *&FieldNo) const {
if (ConstantExpr *VCE = dyn_cast<ConstantExpr>(getValue()))
if (VCE->getOpcode() == Instruction::PtrToInt)
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(VCE->getOperand(0)))
@@ -404,7 +404,7 @@ bool SCEVUnknown::isOffsetOf(const Type *&CTy, Constant *&FieldNo) const {
CE->getNumOperands() == 3 &&
CE->getOperand(0)->isNullValue() &&
CE->getOperand(1)->isNullValue()) {
- const Type *Ty =
+ Type *Ty =
cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
// Ignore vector types here so that ScalarEvolutionExpander doesn't
// emit getelementptrs that index into vectors.
@@ -652,7 +652,7 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
/// Assume, K > 0.
static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
ScalarEvolution &SE,
- const Type* ResultTy) {
+ Type *ResultTy) {
// Handle the simplest case efficiently.
if (K == 1)
return SE.getTruncateOrZeroExtend(It, ResultTy);
@@ -742,7 +742,7 @@ static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
MultiplyFactor = MultiplyFactor.trunc(W);
// Calculate the product, at width T+W
- const IntegerType *CalculationTy = IntegerType::get(SE.getContext(),
+ IntegerType *CalculationTy = IntegerType::get(SE.getContext(),
CalculationBits);
const SCEV *Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy);
for (unsigned i = 1; i != K; ++i) {
@@ -790,7 +790,7 @@ const SCEV *SCEVAddRecExpr::evaluateAtIteration(const SCEV *It,
//===----------------------------------------------------------------------===//
const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
- const Type *Ty) {
+ Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) > getTypeSizeInBits(Ty) &&
"This is not a truncating conversion!");
assert(isSCEVable(Ty) &&
@@ -877,7 +877,7 @@ const SCEV *ScalarEvolution::getTruncateExpr(const SCEV *Op,
}
const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
- const Type *Ty) {
+ Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
@@ -954,7 +954,7 @@ const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op,
const SCEV *RecastedMaxBECount =
getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
if (MaxBECount == RecastedMaxBECount) {
- const Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
+ Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no unsigned overflow.
const SCEV *ZMul = getMulExpr(CastedMaxBECount, Step);
const SCEV *Add = getAddExpr(Start, ZMul);
@@ -1062,7 +1062,7 @@ static const SCEV *getOverflowLimitForStep(const SCEV *Step,
// result, the expression "Step + sext(PreIncAR)" is congruent with
// "sext(PostIncAR)"
static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
- const Type *Ty,
+ Type *Ty,
ScalarEvolution *SE) {
const Loop *L = AR->getLoop();
const SCEV *Start = AR->getStart();
@@ -1070,14 +1070,26 @@ static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
// Check for a simple looking step prior to loop entry.
const SCEVAddExpr *SA = dyn_cast<SCEVAddExpr>(Start);
- if (!SA || SA->getNumOperands() != 2 || SA->getOperand(0) != Step)
+ if (!SA)
+ return 0;
+
+ // Create an AddExpr for "PreStart" after subtracting Step. Full SCEV
+ // subtraction is expensive. For this purpose, perform a quick and dirty
+ // difference, by checking for Step in the operand list.
+ SmallVector<const SCEV *, 4> DiffOps;
+ for (SCEVAddExpr::op_iterator I = SA->op_begin(), E = SA->op_end();
+ I != E; ++I) {
+ if (*I != Step)
+ DiffOps.push_back(*I);
+ }
+ if (DiffOps.size() == SA->getNumOperands())
return 0;
// This is a postinc AR. Check for overflow on the preinc recurrence using the
// same three conditions that getSignExtendedExpr checks.
// 1. NSW flags on the step increment.
- const SCEV *PreStart = SA->getOperand(1);
+ const SCEV *PreStart = SE->getAddExpr(DiffOps, SA->getNoWrapFlags());
const SCEVAddRecExpr *PreAR = dyn_cast<SCEVAddRecExpr>(
SE->getAddRecExpr(PreStart, Step, L, SCEV::FlagAnyWrap));
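
As a worked instance of the quick difference: if Start is (%x + %y + Step), the
loop collects DiffOps = {%x, %y}, so PreStart becomes (%x + %y) and Start stays
congruent to (PreStart + Step); if Step never occurs among Start's operands,
DiffOps keeps every operand, the size check fails, and the function returns 0.
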
@@ -1086,7 +1098,7 @@ static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
// 2. Direct overflow check on the step operation's expression.
unsigned BitWidth = SE->getTypeSizeInBits(AR->getType());
- const Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
+ Type *WideTy = IntegerType::get(SE->getContext(), BitWidth * 2);
const SCEV *OperandExtendedStart =
SE->getAddExpr(SE->getSignExtendExpr(PreStart, WideTy),
SE->getSignExtendExpr(Step, WideTy));
@@ -1112,7 +1124,7 @@ static const SCEV *getPreStartForSignExtend(const SCEVAddRecExpr *AR,
// Get the normalized sign-extended expression for this AddRec's Start.
static const SCEV *getSignExtendAddRecStart(const SCEVAddRecExpr *AR,
- const Type *Ty,
+ Type *Ty,
ScalarEvolution *SE) {
const SCEV *PreStart = getPreStartForSignExtend(AR, Ty, SE);
if (!PreStart)
@@ -1123,7 +1135,7 @@ static const SCEV *getSignExtendAddRecStart(const SCEVAddRecExpr *AR,
}
const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
- const Type *Ty) {
+ Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
@@ -1208,7 +1220,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
const SCEV *RecastedMaxBECount =
getTruncateOrZeroExtend(CastedMaxBECount, MaxBECount->getType());
if (MaxBECount == RecastedMaxBECount) {
- const Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
+ Type *WideTy = IntegerType::get(getContext(), BitWidth * 2);
// Check whether Start+Step*MaxBECount has no signed overflow.
const SCEV *SMul = getMulExpr(CastedMaxBECount, Step);
const SCEV *Add = getAddExpr(Start, SMul);
@@ -1275,7 +1287,7 @@ const SCEV *ScalarEvolution::getSignExtendExpr(const SCEV *Op,
/// unspecified bits out to the given type.
///
const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
- const Type *Ty) {
+ Type *Ty) {
assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
"This is not an extending conversion!");
assert(isSCEVable(Ty) &&
@@ -1438,7 +1450,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
assert(!Ops.empty() && "Cannot get empty add!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
- const Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+ Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVAddExpr operand types don't match!");
@@ -1488,7 +1500,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// Okay, check to see if the same value occurs in the operand list more than
// once. If so, merge them together into an multiply expression. Since we
// sorted the list, these values are required to be adjacent.
- const Type *Ty = Ops[0]->getType();
+ Type *Ty = Ops[0]->getType();
bool FoundMatch = false;
for (unsigned i = 0, e = Ops.size(); i != e-1; ++i)
if (Ops[i] == Ops[i+1]) { // X + Y + Y --> X + Y*2
@@ -1515,8 +1527,8 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// if the contents of the resulting outer trunc fold to something simple.
for (; Idx < Ops.size() && isa<SCEVTruncateExpr>(Ops[Idx]); ++Idx) {
const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(Ops[Idx]);
- const Type *DstType = Trunc->getType();
- const Type *SrcType = Trunc->getOperand()->getType();
+ Type *DstType = Trunc->getType();
+ Type *SrcType = Trunc->getOperand()->getType();
SmallVector<const SCEV *, 8> LargeOps;
bool Ok = true;
// Check all the operands to see if they can be represented in the
@@ -1735,7 +1747,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
// If all of the other operands were loop invariant, we are done.
if (Ops.size() == 1) return NewRec;
- // Otherwise, add the folded AddRec by the non-liv parts.
+  // Otherwise, add the folded AddRec and the non-invariant parts.
for (unsigned i = 0;; ++i)
if (Ops[i] == AddRec) {
Ops[i] = NewRec;
@@ -1800,6 +1812,38 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
return S;
}
+static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) {
+ uint64_t k = i*j;
+ if (j > 1 && k / j != i) Overflow = true;
+ return k;
+}
+
+/// Compute the result of "n choose k", the binomial coefficient. If an
+/// intermediate computation overflows, Overflow will be set and the return will
+/// be garbage. Overflow is not cleared on absence of overflow.
+static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) {
+ // We use the multiplicative formula:
+ // n(n-1)(n-2)...(n-(k-1)) / k(k-1)(k-2)...1 .
+  // At each iteration, we multiply by the next term of the numerator and then
+  // divide by the next term of the denominator. This division will produce an
+ // integral result, and helps reduce the chance of overflow in the
+ // intermediate computations. However, we can still overflow even when the
+ // final result would fit.
+
+ if (n == 0 || n == k) return 1;
+ if (k > n) return 0;
+
+ if (k > n/2)
+ k = n-k;
+
+ uint64_t r = 1;
+ for (uint64_t i = 1; i <= k; ++i) {
+ r = umul_ov(r, n-(i-1), Overflow);
+ r /= i;
+ }
+ return r;
+}
+
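The two helpers above are self-contained; a standalone sketch (plain C++, separate from the patch) that reproduces them verbatim and exercises the sticky Overflow flag:

#include <cstdint>
#include <cstdio>

static uint64_t umul_ov(uint64_t i, uint64_t j, bool &Overflow) {
  uint64_t k = i*j;
  if (j > 1 && k / j != i) Overflow = true;
  return k;
}

static uint64_t Choose(uint64_t n, uint64_t k, bool &Overflow) {
  if (n == 0 || n == k) return 1;
  if (k > n) return 0;
  if (k > n/2)
    k = n-k;
  uint64_t r = 1;
  for (uint64_t i = 1; i <= k; ++i) {
    r = umul_ov(r, n-(i-1), Overflow);
    r /= i; // exact: r holds Choose(n, i) after this step (absent overflow)
  }
  return r;
}

int main() {
  bool Overflow = false;
  // C(52,5) = 2598960 fits comfortably; the flag stays clear.
  printf("C(52,5) = %llu, overflow = %d\n",
         (unsigned long long)Choose(52, 5, Overflow), Overflow);
  // C(100,50) ~ 1.0e29: an intermediate product exceeds 2^64, so the
  // sticky flag is set and the returned value is garbage by contract.
  Choose(100, 50, Overflow);
  printf("overflow after C(100,50) = %d\n", Overflow);
  return 0;
}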
/// getMulExpr - Get a canonical multiply expression, or something simpler if
/// possible.
const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
@@ -1809,7 +1853,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
assert(!Ops.empty() && "Cannot get empty mul!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
- const Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+ Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVMulExpr operand types don't match!");
@@ -1960,7 +2004,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
// If all of the other operands were loop invariant, we are done.
if (Ops.size() == 1) return NewRec;
- // Otherwise, multiply the folded AddRec by the non-liv parts.
+ // Otherwise, multiply the folded AddRec by the non-invariant parts.
for (unsigned i = 0;; ++i)
if (Ops[i] == AddRec) {
Ops[i] = NewRec;
@@ -1974,31 +2018,65 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
// multiplied together. If so, we can fold them.
for (unsigned OtherIdx = Idx+1;
OtherIdx < Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
- ++OtherIdx)
+ ++OtherIdx) {
if (AddRecLoop == cast<SCEVAddRecExpr>(Ops[OtherIdx])->getLoop()) {
- // F * G, where F = {A,+,B}<L> and G = {C,+,D}<L> -->
- // {A*C,+,F*D + G*B + B*D}<L>
+ // {A1,+,A2,+,...,+,An}<L> * {B1,+,B2,+,...,+,Bn}<L>
+ // = {x=1 in [ sum y=x..2x [ sum z=max(y-x, y-n)..min(x,n) [
+        //       choose(x, 2x-y)*choose(2x-y, x-z)*A_{y-z}*B_z
+ // ]]],+,...up to x=2n}.
+ // Note that the arguments to choose() are always integers with values
+ // known at compile time, never SCEV objects.
+ //
+ // The implementation avoids pointless extra computations when the two
+        // addrecs are of different length (mathematically, it's equivalent to
+ // an infinite stream of zeros on the right).
+ bool OpsModified = false;
for (; OtherIdx != Ops.size() && isa<SCEVAddRecExpr>(Ops[OtherIdx]);
++OtherIdx)
if (const SCEVAddRecExpr *OtherAddRec =
dyn_cast<SCEVAddRecExpr>(Ops[OtherIdx]))
if (OtherAddRec->getLoop() == AddRecLoop) {
- const SCEVAddRecExpr *F = AddRec, *G = OtherAddRec;
- const SCEV *NewStart = getMulExpr(F->getStart(), G->getStart());
- const SCEV *B = F->getStepRecurrence(*this);
- const SCEV *D = G->getStepRecurrence(*this);
- const SCEV *NewStep = getAddExpr(getMulExpr(F, D),
- getMulExpr(G, B),
- getMulExpr(B, D));
- const SCEV *NewAddRec = getAddRecExpr(NewStart, NewStep,
- F->getLoop(),
- SCEV::FlagAnyWrap);
- if (Ops.size() == 2) return NewAddRec;
- Ops[Idx] = AddRec = cast<SCEVAddRecExpr>(NewAddRec);
- Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+ bool Overflow = false;
+ Type *Ty = AddRec->getType();
+ bool LargerThan64Bits = getTypeSizeInBits(Ty) > 64;
+ SmallVector<const SCEV*, 7> AddRecOps;
+ for (int x = 0, xe = AddRec->getNumOperands() +
+ OtherAddRec->getNumOperands() - 1;
+ x != xe && !Overflow; ++x) {
+ const SCEV *Term = getConstant(Ty, 0);
+ for (int y = x, ye = 2*x+1; y != ye && !Overflow; ++y) {
+ uint64_t Coeff1 = Choose(x, 2*x - y, Overflow);
+ for (int z = std::max(y-x, y-(int)AddRec->getNumOperands()+1),
+ ze = std::min(x+1, (int)OtherAddRec->getNumOperands());
+ z < ze && !Overflow; ++z) {
+ uint64_t Coeff2 = Choose(2*x - y, x-z, Overflow);
+ uint64_t Coeff;
+ if (LargerThan64Bits)
+ Coeff = umul_ov(Coeff1, Coeff2, Overflow);
+ else
+ Coeff = Coeff1*Coeff2;
+ const SCEV *CoeffTerm = getConstant(Ty, Coeff);
+ const SCEV *Term1 = AddRec->getOperand(y-z);
+ const SCEV *Term2 = OtherAddRec->getOperand(z);
+ Term = getAddExpr(Term, getMulExpr(CoeffTerm, Term1,Term2));
+ }
+ }
+ AddRecOps.push_back(Term);
+ }
+ if (!Overflow) {
+ const SCEV *NewAddRec = getAddRecExpr(AddRecOps,
+ AddRec->getLoop(),
+ SCEV::FlagAnyWrap);
+ if (Ops.size() == 2) return NewAddRec;
+ Ops[Idx] = AddRec = cast<SCEVAddRecExpr>(NewAddRec);
+ Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
+ OpsModified = true;
+ }
}
- return getMulExpr(Ops);
+ if (OpsModified)
+ return getMulExpr(Ops);
}
+ }
// Otherwise couldn't fold anything into this recurrence. Move onto the
// next one.
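As a standalone check of the closed form this hunk implements (plain C++11, separate from the patch; mulAddRec and the use of int64_t coefficients in place of SCEV operands are illustrative): squaring the canonical induction variable {0,+,1} should give {0,+,1,+,2}, since n^2 has first differences 1,3,5,... and constant second difference 2.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t Choose(uint64_t n, uint64_t k) { // overflow handling elided
  if (k > n) return 0;
  if (k > n/2) k = n-k;
  uint64_t r = 1;
  for (uint64_t i = 1; i <= k; ++i) { r = r * (n-(i-1)); r /= i; }
  return r;
}

// The same triple loop as the patch, specialized to constant operands.
static std::vector<int64_t> mulAddRec(const std::vector<int64_t> &A,
                                      const std::vector<int64_t> &B) {
  std::vector<int64_t> Ops;
  for (int x = 0, xe = (int)A.size() + (int)B.size() - 1; x != xe; ++x) {
    int64_t Term = 0;
    for (int y = x, ye = 2*x+1; y != ye; ++y) {
      uint64_t Coeff1 = Choose(x, 2*x - y);
      for (int z = std::max(y-x, y-(int)A.size()+1),
               ze = std::min(x+1, (int)B.size()); z < ze; ++z) {
        uint64_t Coeff2 = Choose(2*x - y, x-z);
        Term += (int64_t)(Coeff1 * Coeff2) * A[y-z] * B[z];
      }
    }
    Ops.push_back(Term);
  }
  return Ops;
}

int main() {
  std::vector<int64_t> IV = {0, 1};            // {0,+,1}, i.e. the IV "n"
  std::vector<int64_t> Sq = mulAddRec(IV, IV); // n * n
  for (size_t i = 0; i != Sq.size(); ++i)
    printf("%s%lld", i ? ",+," : "{", (long long)Sq[i]);
  printf("}\n"); // prints {0,+,1,+,2}, which evaluates to n^2
  return 0;
}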
@@ -2042,21 +2120,22 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
// Determine if the division can be folded into the operands of
// its operands.
// TODO: Generalize this to non-constants by using known-bits information.
- const Type *Ty = LHS->getType();
+ Type *Ty = LHS->getType();
unsigned LZ = RHSC->getValue()->getValue().countLeadingZeros();
unsigned MaxShiftAmt = getTypeSizeInBits(Ty) - LZ - 1;
// For non-power-of-two values, effectively round the value up to the
// nearest power of two.
if (!RHSC->getValue()->getValue().isPowerOf2())
++MaxShiftAmt;
- const IntegerType *ExtTy =
+ IntegerType *ExtTy =
IntegerType::get(getContext(), getTypeSizeInBits(Ty) + MaxShiftAmt);
- // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded.
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS))
if (const SCEVConstant *Step =
- dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this)))
- if (!Step->getValue()->getValue()
- .urem(RHSC->getValue()->getValue()) &&
+ dyn_cast<SCEVConstant>(AR->getStepRecurrence(*this))) {
+ // {X,+,N}/C --> {X/C,+,N/C} if safe and N/C can be folded.
+ const APInt &StepInt = Step->getValue()->getValue();
+ const APInt &DivInt = RHSC->getValue()->getValue();
+ if (!StepInt.urem(DivInt) &&
getZeroExtendExpr(AR, ExtTy) ==
getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
getZeroExtendExpr(Step, ExtTy),
@@ -2067,6 +2146,22 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
return getAddRecExpr(Operands, AR->getLoop(),
SCEV::FlagNW);
}
+      // Get a canonical UDivExpr for a recurrence.
+      // {X,+,N}/C => {Y,+,N}/C where Y=X-(X%N). Safe when C%N=0.
+ // We can currently only fold X%N if X is constant.
+ const SCEVConstant *StartC = dyn_cast<SCEVConstant>(AR->getStart());
+ if (StartC && !DivInt.urem(StepInt) &&
+ getZeroExtendExpr(AR, ExtTy) ==
+ getAddRecExpr(getZeroExtendExpr(AR->getStart(), ExtTy),
+ getZeroExtendExpr(Step, ExtTy),
+ AR->getLoop(), SCEV::FlagAnyWrap)) {
+ const APInt &StartInt = StartC->getValue()->getValue();
+ const APInt &StartRem = StartInt.urem(StepInt);
+ if (StartRem != 0)
+ LHS = getAddRecExpr(getConstant(StartInt - StartRem), Step,
+ AR->getLoop(), SCEV::FlagNW);
+ }
+ }
// (A*B)/C --> A*(B/C) if safe and B/C can be folded.
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(LHS)) {
SmallVector<const SCEV *, 4> Operands;
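A quick numeric check of the rewrite introduced above (plain C++, separate from the patch): {X,+,N}/C == {Y,+,N}/C with Y = X - (X % N) whenever C % N == 0 and the recurrence does not wrap; here X=7, N=4, C=8, so the start canonicalizes from 7 down to 4.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t X = 7, N = 4, C = 8; // C % N == 0
  const uint64_t Y = X - (X % N);     // Y == 4
  // No wraparound in this range, matching the zero-extension precondition.
  for (uint64_t i = 0; i != 1000; ++i)
    assert((X + i*N) / C == (Y + i*N) / C);
  printf("{7,+,4}/8 == {4,+,4}/8 for the first 1000 iterations\n");
  return 0;
}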
@@ -2151,7 +2246,7 @@ ScalarEvolution::getAddRecExpr(SmallVectorImpl<const SCEV *> &Operands,
const Loop *L, SCEV::NoWrapFlags Flags) {
if (Operands.size() == 1) return Operands[0];
#ifndef NDEBUG
- const Type *ETy = getEffectiveSCEVType(Operands[0]->getType());
+ Type *ETy = getEffectiveSCEVType(Operands[0]->getType());
for (unsigned i = 1, e = Operands.size(); i != e; ++i)
assert(getEffectiveSCEVType(Operands[i]->getType()) == ETy &&
"SCEVAddRecExpr operand types don't match!");
@@ -2269,7 +2364,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
assert(!Ops.empty() && "Cannot get empty smax!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
- const Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+ Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVSMaxExpr operand types don't match!");
@@ -2373,7 +2468,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl<const SCEV *> &Ops) {
assert(!Ops.empty() && "Cannot get empty umax!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
- const Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+ Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
"SCEVUMaxExpr operand types don't match!");
@@ -2476,7 +2571,7 @@ const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
}
-const SCEV *ScalarEvolution::getSizeOfExpr(const Type *AllocTy) {
+const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy) {
// If we have TargetData, we can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
@@ -2488,20 +2583,20 @@ const SCEV *ScalarEvolution::getSizeOfExpr(const Type *AllocTy) {
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD))
C = Folded;
- const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
+ Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
return getTruncateOrZeroExtend(getSCEV(C), Ty);
}
-const SCEV *ScalarEvolution::getAlignOfExpr(const Type *AllocTy) {
+const SCEV *ScalarEvolution::getAlignOfExpr(Type *AllocTy) {
Constant *C = ConstantExpr::getAlignOf(AllocTy);
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD))
C = Folded;
- const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
+ Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
return getTruncateOrZeroExtend(getSCEV(C), Ty);
}
-const SCEV *ScalarEvolution::getOffsetOfExpr(const StructType *STy,
+const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy,
unsigned FieldNo) {
// If we have TargetData, we can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
@@ -2514,17 +2609,17 @@ const SCEV *ScalarEvolution::getOffsetOfExpr(const StructType *STy,
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD))
C = Folded;
- const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
+ Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
return getTruncateOrZeroExtend(getSCEV(C), Ty);
}
-const SCEV *ScalarEvolution::getOffsetOfExpr(const Type *CTy,
+const SCEV *ScalarEvolution::getOffsetOfExpr(Type *CTy,
Constant *FieldNo) {
Constant *C = ConstantExpr::getOffsetOf(CTy, FieldNo);
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
if (Constant *Folded = ConstantFoldConstantExpression(CE, TD))
C = Folded;
- const Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(CTy));
+ Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(CTy));
return getTruncateOrZeroExtend(getSCEV(C), Ty);
}
@@ -2558,14 +2653,14 @@ const SCEV *ScalarEvolution::getUnknown(Value *V) {
/// the SCEV framework. This primarily includes integer types, and it
/// can optionally include pointer types if the ScalarEvolution class
/// has access to target-specific information.
-bool ScalarEvolution::isSCEVable(const Type *Ty) const {
+bool ScalarEvolution::isSCEVable(Type *Ty) const {
// Integers and pointers are always SCEVable.
return Ty->isIntegerTy() || Ty->isPointerTy();
}
/// getTypeSizeInBits - Return the size in bits of the specified type,
/// for which isSCEVable must return true.
-uint64_t ScalarEvolution::getTypeSizeInBits(const Type *Ty) const {
+uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
// If we have a TargetData, use it!
@@ -2586,7 +2681,7 @@ uint64_t ScalarEvolution::getTypeSizeInBits(const Type *Ty) const {
/// the given type and which represents how SCEV will treat the given
/// type, for which isSCEVable must return true. For pointer types,
/// this is the pointer-sized integer type.
-const Type *ScalarEvolution::getEffectiveSCEVType(const Type *Ty) const {
+Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
if (Ty->isIntegerTy())
@@ -2628,7 +2723,7 @@ const SCEV *ScalarEvolution::getNegativeSCEV(const SCEV *V) {
return getConstant(
cast<ConstantInt>(ConstantExpr::getNeg(VC->getValue())));
- const Type *Ty = V->getType();
+ Type *Ty = V->getType();
Ty = getEffectiveSCEVType(Ty);
return getMulExpr(V,
getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty))));
@@ -2640,7 +2735,7 @@ const SCEV *ScalarEvolution::getNotSCEV(const SCEV *V) {
return getConstant(
cast<ConstantInt>(ConstantExpr::getNot(VC->getValue())));
- const Type *Ty = V->getType();
+ Type *Ty = V->getType();
Ty = getEffectiveSCEVType(Ty);
const SCEV *AllOnes =
getConstant(cast<ConstantInt>(Constant::getAllOnesValue(Ty)));
@@ -2664,8 +2759,8 @@ const SCEV *ScalarEvolution::getMinusSCEV(const SCEV *LHS, const SCEV *RHS,
/// input value to the specified type. If the type must be extended, it is zero
/// extended.
const SCEV *
-ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, const Type *Ty) {
- const Type *SrcTy = V->getType();
+ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, Type *Ty) {
+ Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot truncate or zero extend with non-integer arguments!");
@@ -2681,8 +2776,8 @@ ScalarEvolution::getTruncateOrZeroExtend(const SCEV *V, const Type *Ty) {
/// extended.
const SCEV *
ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
- const Type *Ty) {
- const Type *SrcTy = V->getType();
+ Type *Ty) {
+ Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot truncate or zero extend with non-integer arguments!");
@@ -2697,8 +2792,8 @@ ScalarEvolution::getTruncateOrSignExtend(const SCEV *V,
/// input value to the specified type. If the type must be extended, it is zero
/// extended. The conversion must not be narrowing.
const SCEV *
-ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, const Type *Ty) {
- const Type *SrcTy = V->getType();
+ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, Type *Ty) {
+ Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot noop or zero extend with non-integer arguments!");
@@ -2713,8 +2808,8 @@ ScalarEvolution::getNoopOrZeroExtend(const SCEV *V, const Type *Ty) {
/// input value to the specified type. If the type must be extended, it is sign
/// extended. The conversion must not be narrowing.
const SCEV *
-ScalarEvolution::getNoopOrSignExtend(const SCEV *V, const Type *Ty) {
- const Type *SrcTy = V->getType();
+ScalarEvolution::getNoopOrSignExtend(const SCEV *V, Type *Ty) {
+ Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot noop or sign extend with non-integer arguments!");
@@ -2730,8 +2825,8 @@ ScalarEvolution::getNoopOrSignExtend(const SCEV *V, const Type *Ty) {
/// it is extended with unspecified bits. The conversion must not be
/// narrowing.
const SCEV *
-ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, const Type *Ty) {
- const Type *SrcTy = V->getType();
+ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, Type *Ty) {
+ Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot noop or any extend with non-integer arguments!");
@@ -2745,8 +2840,8 @@ ScalarEvolution::getNoopOrAnyExtend(const SCEV *V, const Type *Ty) {
/// getTruncateOrNoop - Return a SCEV corresponding to a conversion of the
/// input value to the specified type. The conversion must not be widening.
const SCEV *
-ScalarEvolution::getTruncateOrNoop(const SCEV *V, const Type *Ty) {
- const Type *SrcTy = V->getType();
+ScalarEvolution::getTruncateOrNoop(const SCEV *V, Type *Ty) {
+ Type *SrcTy = V->getType();
assert((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
(Ty->isIntegerTy() || Ty->isPointerTy()) &&
"Cannot truncate or noop with non-integer arguments!");
@@ -3032,7 +3127,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
// context.
bool isInBounds = GEP->isInBounds();
- const Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
+ Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
Value *Base = GEP->getOperand(0);
// Don't attempt to analyze GEPs over unsized objects.
if (!cast<PointerType>(Base->getType())->getElementType()->isSized())
@@ -3044,7 +3139,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
I != E; ++I) {
Value *Index = *I;
// Compute the (potentially symbolic) offset in bytes for this index.
- if (const StructType *STy = dyn_cast<StructType>(*GTI++)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI++)) {
// For a struct, add the member offset.
unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
const SCEV *FieldOffset = getOffsetOfExpr(STy, FieldNo);
@@ -3244,7 +3339,7 @@ ScalarEvolution::getUnsignedRange(const SCEV *S) {
// TODO: non-affine addrec
if (AddRec->isAffine()) {
- const Type *Ty = AddRec->getType();
+ Type *Ty = AddRec->getType();
const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop());
if (!isa<SCEVCouldNotCompute>(MaxBECount) &&
getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) {
@@ -3396,7 +3491,7 @@ ScalarEvolution::getSignedRange(const SCEV *S) {
// TODO: non-affine addrec
if (AddRec->isAffine()) {
- const Type *Ty = AddRec->getType();
+ Type *Ty = AddRec->getType();
const SCEV *MaxBECount = getMaxBackedgeTakenCount(AddRec->getLoop());
if (!isa<SCEVCouldNotCompute>(MaxBECount) &&
getTypeSizeInBits(MaxBECount->getType()) <= BitWidth) {
@@ -3503,7 +3598,13 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
AddOps.push_back(Op1);
}
AddOps.push_back(getSCEV(U->getOperand(0)));
- return getAddExpr(AddOps);
+ SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap;
+ OverflowingBinaryOperator *OBO = cast<OverflowingBinaryOperator>(V);
+ if (OBO->hasNoSignedWrap())
+        Flags = setFlags(Flags, SCEV::FlagNSW);
+      if (OBO->hasNoUnsignedWrap())
+        Flags = setFlags(Flags, SCEV::FlagNUW);
+ return getAddExpr(AddOps, Flags);
}
case Instruction::Mul: {
// See the Add code above.
@@ -3601,9 +3702,9 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
LCI->getValue() == CI->getValue())
if (const SCEVZeroExtendExpr *Z =
dyn_cast<SCEVZeroExtendExpr>(getSCEV(U->getOperand(0)))) {
- const Type *UTy = U->getType();
+ Type *UTy = U->getType();
const SCEV *Z0 = Z->getOperand();
- const Type *Z0Ty = Z0->getType();
+ Type *Z0Ty = Z0->getType();
unsigned Z0TySize = getTypeSizeInBits(Z0Ty);
// If C is a low-bits mask, the zero extend is serving to
@@ -3813,6 +3914,70 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
// Iteration Count Computation Code
//
+/// getSmallConstantTripCount - Returns the maximum trip count of this loop as a
+/// normal unsigned value, if possible. Returns 0 if the trip count is unknown
+/// or not constant. Will also return 0 if the maximum trip count is very large
+/// (>= 2^32).
+unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
+ BasicBlock *ExitBlock) {
+ const SCEVConstant *ExitCount =
+ dyn_cast<SCEVConstant>(getExitCount(L, ExitBlock));
+ if (!ExitCount)
+ return 0;
+
+ ConstantInt *ExitConst = ExitCount->getValue();
+
+ // Guard against huge trip counts.
+ if (ExitConst->getValue().getActiveBits() > 32)
+ return 0;
+
+ // In case of integer overflow, this returns 0, which is correct.
+ return ((unsigned)ExitConst->getZExtValue()) + 1;
+}
+
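The wrap behavior the comment above relies on can be seen in isolation (plain C++, separate from the patch; tripCountFromBECount is an illustrative stand-in): a backedge-taken count of 2^32-1 passes the 32-bit guard, and the +1 wraps the returned trip count to 0, which is exactly the documented "unknown" result.

#include <cstdint>
#include <cstdio>

static unsigned tripCountFromBECount(uint64_t BECount) {
  if (BECount >> 32)              // same effect as getActiveBits() > 32
    return 0;                     // too large to report
  return (unsigned)BECount + 1;   // wraps to 0 iff BECount == 2^32 - 1
}

int main() {
  printf("%u\n", tripCountFromBECount(9));              // 10
  printf("%u\n", tripCountFromBECount(0xFFFFFFFFull));  // 0 (wrapped +1)
  printf("%u\n", tripCountFromBECount(1ull << 32));     // 0 (guard)
  return 0;
}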
+/// getSmallConstantTripMultiple - Returns the largest constant divisor of the
+/// trip count of this loop as a normal unsigned value, if possible. This
+/// means that the actual trip count is always a multiple of the returned
+/// value (don't forget the trip count could very well be zero as well!).
+///
+/// Returns 1 if the trip count is unknown or not guaranteed to be a
+/// multiple of a constant (which is also the case if the trip count is simply
+/// constant; use getSmallConstantTripCount for that case). It will also
+/// return 1 if the trip count is very large (>= 2^32).
+unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
+ BasicBlock *ExitBlock) {
+ const SCEV *ExitCount = getExitCount(L, ExitBlock);
+ if (ExitCount == getCouldNotCompute())
+ return 1;
+
+ // Get the trip count from the BE count by adding 1.
+ const SCEV *TCMul = getAddExpr(ExitCount,
+ getConstant(ExitCount->getType(), 1));
+ // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt
+ // to factor simple cases.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(TCMul))
+ TCMul = Mul->getOperand(0);
+
+ const SCEVConstant *MulC = dyn_cast<SCEVConstant>(TCMul);
+ if (!MulC)
+ return 1;
+
+ ConstantInt *Result = MulC->getValue();
+
+ // Guard against huge trip counts.
+ if (!Result || Result->getValue().getActiveBits() > 32)
+ return 1;
+
+ return (unsigned)Result->getZExtValue();
+}
+
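A standalone sketch of the guarantee documented above (plain C++, separate from the patch): a loop that runs 2*n iterations has backedge-taken count 2*n - 1, so count+1 can fold to the product (2 * n) and the routine can report the constant factor 2; the actual trip count is then always a multiple of that value.

#include <cassert>
#include <cstdio>

int main() {
  for (unsigned n = 1; n != 100; ++n) {
    unsigned Trips = 0;
    for (unsigned i = 0; i != 2*n; ++i) // trip count 2*n, BE count 2*n - 1
      ++Trips;
    assert(Trips % 2 == 0); // the reported multiple divides the trip count
  }
  printf("trip count is a multiple of 2 for every n\n");
  return 0;
}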
+/// getExitCount - Get the expression for the number of loop iterations for
+/// which this loop is guaranteed not to exit via ExitingBlock. Otherwise
+/// return SCEVCouldNotCompute.
+const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) {
+ return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
+}
+
/// getBackedgeTakenCount - If the specified loop has a predictable
/// backedge-taken count, return it, otherwise return a SCEVCouldNotCompute
/// object. The backedge-taken count is the number of times the loop header
@@ -3825,14 +3990,14 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
/// hasLoopInvariantBackedgeTakenCount).
///
const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L) {
- return getBackedgeTakenInfo(L).Exact;
+ return getBackedgeTakenInfo(L).getExact(this);
}
/// getMaxBackedgeTakenCount - Similar to getBackedgeTakenCount, except
/// return the least SCEV value that is known never to be less than the
/// actual backedge taken count.
const SCEV *ScalarEvolution::getMaxBackedgeTakenCount(const Loop *L) {
- return getBackedgeTakenInfo(L).Max;
+ return getBackedgeTakenInfo(L).getMax(this);
}
/// PushLoopPHIs - Push PHI nodes in the header of the given loop
@@ -3849,33 +4014,31 @@ PushLoopPHIs(const Loop *L, SmallVectorImpl<Instruction *> &Worklist) {
const ScalarEvolution::BackedgeTakenInfo &
ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
- // Initially insert a CouldNotCompute for this loop. If the insertion
+ // Initially insert an invalid entry for this loop. If the insertion
// succeeds, proceed to actually compute a backedge-taken count and
// update the value. The temporary CouldNotCompute value tells SCEV
// code elsewhere that it shouldn't attempt to request a new
// backedge-taken count, which could result in infinite recursion.
std::pair<DenseMap<const Loop *, BackedgeTakenInfo>::iterator, bool> Pair =
- BackedgeTakenCounts.insert(std::make_pair(L, getCouldNotCompute()));
+ BackedgeTakenCounts.insert(std::make_pair(L, BackedgeTakenInfo()));
if (!Pair.second)
return Pair.first->second;
- BackedgeTakenInfo Result = getCouldNotCompute();
- BackedgeTakenInfo Computed = ComputeBackedgeTakenCount(L);
- if (Computed.Exact != getCouldNotCompute()) {
- assert(isLoopInvariant(Computed.Exact, L) &&
- isLoopInvariant(Computed.Max, L) &&
+ // ComputeBackedgeTakenCount may allocate memory for its result. Inserting it
+ // into the BackedgeTakenCounts map transfers ownership. Otherwise, the result
+ // must be cleared in this scope.
+ BackedgeTakenInfo Result = ComputeBackedgeTakenCount(L);
+
+ if (Result.getExact(this) != getCouldNotCompute()) {
+ assert(isLoopInvariant(Result.getExact(this), L) &&
+ isLoopInvariant(Result.getMax(this), L) &&
"Computed backedge-taken count isn't loop invariant for loop!");
++NumTripCountsComputed;
-
- // Update the value in the map.
- Result = Computed;
- } else {
- if (Computed.Max != getCouldNotCompute())
- // Update the value in the map.
- Result = Computed;
- if (isa<PHINode>(L->getHeader()->begin()))
- // Only count loops that have phi nodes as not being computable.
- ++NumTripCountsNotComputed;
+ }
+ else if (Result.getMax(this) == getCouldNotCompute() &&
+ isa<PHINode>(L->getHeader()->begin())) {
+ // Only count loops that have phi nodes as not being computable.
+ ++NumTripCountsNotComputed;
}
// Now that we know more about the trip count for this loop, forget any
@@ -3883,7 +4046,7 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
// conservative estimates made without the benefit of trip count
// information. This is similar to the code in forgetLoop, except that
// it handles SCEVUnknown PHI nodes specially.
- if (Computed.hasAnyInfo()) {
+ if (Result.hasAnyInfo()) {
SmallVector<Instruction *, 16> Worklist;
PushLoopPHIs(L, Worklist);
@@ -3928,7 +4091,12 @@ ScalarEvolution::getBackedgeTakenInfo(const Loop *L) {
/// compute a trip count, or if the loop is deleted.
void ScalarEvolution::forgetLoop(const Loop *L) {
// Drop any stored trip count value.
- BackedgeTakenCounts.erase(L);
+ DenseMap<const Loop*, BackedgeTakenInfo>::iterator BTCPos =
+ BackedgeTakenCounts.find(L);
+ if (BTCPos != BackedgeTakenCounts.end()) {
+ BTCPos->second.clear();
+ BackedgeTakenCounts.erase(BTCPos);
+ }
// Drop information about expressions based on loop-header PHIs.
SmallVector<Instruction *, 16> Worklist;
@@ -3984,6 +4152,85 @@ void ScalarEvolution::forgetValue(Value *V) {
}
}
+/// getExact - Get the exact loop backedge taken count considering all loop
+/// exits. If all exits are computable, this is the minimum computed count.
+const SCEV *
+ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const {
+ // If any exits were not computable, the loop is not computable.
+ if (!ExitNotTaken.isCompleteList()) return SE->getCouldNotCompute();
+
+ // We need at least one computable exit.
+ if (!ExitNotTaken.ExitingBlock) return SE->getCouldNotCompute();
+ assert(ExitNotTaken.ExactNotTaken && "uninitialized not-taken info");
+
+ const SCEV *BECount = 0;
+ for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
+ ENT != 0; ENT = ENT->getNextExit()) {
+
+ assert(ENT->ExactNotTaken != SE->getCouldNotCompute() && "bad exit SCEV");
+
+ if (!BECount)
+ BECount = ENT->ExactNotTaken;
+ else
+ BECount = SE->getUMinFromMismatchedTypes(BECount, ENT->ExactNotTaken);
+ }
+ assert(BECount && "Invalid not taken count for loop exit");
+ return BECount;
+}
+
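Why the unsigned minimum is the right combination (a sketch in plain C++, separate from the patch): the loop leaves through whichever exit fires first, so with one exit allowing 5 backedges and another allowing 7, the backedge-taken count is 5.

#include <cstdio>

int main() {
  unsigned Backedges = 0;
  for (unsigned i = 0;; ++i) {
    if (i == 5) break; // exit A: fires after 5 backedges
    if (i == 7) break; // exit B: would allow 7, but A fires first
    ++Backedges;
  }
  printf("backedge-taken count = %u\n", Backedges); // 5 == umin(5, 7)
  return 0;
}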
+/// getExact - Get the exact not taken count for this loop exit.
+const SCEV *
+ScalarEvolution::BackedgeTakenInfo::getExact(BasicBlock *ExitingBlock,
+ ScalarEvolution *SE) const {
+ for (const ExitNotTakenInfo *ENT = &ExitNotTaken;
+ ENT != 0; ENT = ENT->getNextExit()) {
+
+ if (ENT->ExitingBlock == ExitingBlock)
+ return ENT->ExactNotTaken;
+ }
+ return SE->getCouldNotCompute();
+}
+
+/// getMax - Get the max backedge taken count for the loop.
+const SCEV *
+ScalarEvolution::BackedgeTakenInfo::getMax(ScalarEvolution *SE) const {
+ return Max ? Max : SE->getCouldNotCompute();
+}
+
+/// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each
+/// computable exit into a persistent ExitNotTakenInfo array.
+ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo(
+ SmallVectorImpl< std::pair<BasicBlock *, const SCEV *> > &ExitCounts,
+ bool Complete, const SCEV *MaxCount) : Max(MaxCount) {
+
+ if (!Complete)
+ ExitNotTaken.setIncomplete();
+
+ unsigned NumExits = ExitCounts.size();
+ if (NumExits == 0) return;
+
+ ExitNotTaken.ExitingBlock = ExitCounts[0].first;
+ ExitNotTaken.ExactNotTaken = ExitCounts[0].second;
+ if (NumExits == 1) return;
+
+ // Handle the rare case of multiple computable exits.
+ ExitNotTakenInfo *ENT = new ExitNotTakenInfo[NumExits-1];
+
+ ExitNotTakenInfo *PrevENT = &ExitNotTaken;
+ for (unsigned i = 1; i < NumExits; ++i, PrevENT = ENT, ++ENT) {
+ PrevENT->setNextExit(ENT);
+ ENT->ExitingBlock = ExitCounts[i].first;
+ ENT->ExactNotTaken = ExitCounts[i].second;
+ }
+}
+
+/// clear - Invalidate this result and free the ExitNotTakenInfo array.
+void ScalarEvolution::BackedgeTakenInfo::clear() {
+ ExitNotTaken.ExitingBlock = 0;
+ ExitNotTaken.ExactNotTaken = 0;
+ delete[] ExitNotTaken.getNextExit();
+}
+
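The storage scheme above, reduced to its essentials (plain C++11, separate from the patch; ExitInfo and TakenInfo are illustrative names): the first exit lives inline, the rare extra exits share one heap array threaded through the inline head, and clear() frees everything with a single delete[].

#include <cstdio>

struct ExitInfo {
  int ExitId = 0;          // stands in for ExitingBlock/ExactNotTaken
  ExitInfo *Next = nullptr;
};

struct TakenInfo {
  ExitInfo Head;           // first computable exit, stored inline
  void init(const int *Ids, unsigned N) {
    if (N == 0) return;
    Head.ExitId = Ids[0];
    if (N == 1) return;
    ExitInfo *ENT = new ExitInfo[N-1]; // rare multi-exit case
    ExitInfo *Prev = &Head;
    for (unsigned i = 1; i != N; ++i, Prev = ENT, ++ENT) {
      Prev->Next = ENT;
      ENT->ExitId = Ids[i];
    }
  }
  void clear() { delete[] Head.Next; Head.Next = nullptr; }
};

int main() {
  const int Ids[] = {10, 20, 30};
  TakenInfo TI;
  TI.init(Ids, 3);
  for (const ExitInfo *E = &TI.Head; E; E = E->Next)
    printf("exit %d\n", E->ExitId); // 10, 20, 30
  TI.clear(); // one deallocation covers the whole chain
  return 0;
}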
/// ComputeBackedgeTakenCount - Compute the number of times the backedge
/// of the specified loop will execute.
ScalarEvolution::BackedgeTakenInfo
@@ -3992,38 +4239,31 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) {
L->getExitingBlocks(ExitingBlocks);
// Examine all exits and pick the most conservative values.
- const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
- bool CouldNotComputeBECount = false;
+ bool CouldComputeBECount = true;
+ SmallVector<std::pair<BasicBlock *, const SCEV *>, 4> ExitCounts;
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- BackedgeTakenInfo NewBTI =
- ComputeBackedgeTakenCountFromExit(L, ExitingBlocks[i]);
-
- if (NewBTI.Exact == getCouldNotCompute()) {
+ ExitLimit EL = ComputeExitLimit(L, ExitingBlocks[i]);
+ if (EL.Exact == getCouldNotCompute())
// We couldn't compute an exact value for this exit, so
// we won't be able to compute an exact value for the loop.
- CouldNotComputeBECount = true;
- BECount = getCouldNotCompute();
- } else if (!CouldNotComputeBECount) {
- if (BECount == getCouldNotCompute())
- BECount = NewBTI.Exact;
- else
- BECount = getUMinFromMismatchedTypes(BECount, NewBTI.Exact);
- }
+ CouldComputeBECount = false;
+ else
+ ExitCounts.push_back(std::make_pair(ExitingBlocks[i], EL.Exact));
+
if (MaxBECount == getCouldNotCompute())
- MaxBECount = NewBTI.Max;
- else if (NewBTI.Max != getCouldNotCompute())
- MaxBECount = getUMinFromMismatchedTypes(MaxBECount, NewBTI.Max);
+ MaxBECount = EL.Max;
+ else if (EL.Max != getCouldNotCompute())
+ MaxBECount = getUMinFromMismatchedTypes(MaxBECount, EL.Max);
}
- return BackedgeTakenInfo(BECount, MaxBECount);
+ return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount);
}
-/// ComputeBackedgeTakenCountFromExit - Compute the number of times the backedge
-/// of the specified loop will execute if it exits via the specified block.
-ScalarEvolution::BackedgeTakenInfo
-ScalarEvolution::ComputeBackedgeTakenCountFromExit(const Loop *L,
- BasicBlock *ExitingBlock) {
+/// ComputeExitLimit - Compute the number of times the backedge of the specified
+/// loop will execute if it exits via the specified block.
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) {
// Okay, we've chosen an exiting block. See what condition causes us to
// exit at this block.
@@ -4081,95 +4321,91 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExit(const Loop *L,
}
// Proceed to the next level to examine the exit condition expression.
- return ComputeBackedgeTakenCountFromExitCond(L, ExitBr->getCondition(),
- ExitBr->getSuccessor(0),
- ExitBr->getSuccessor(1));
+ return ComputeExitLimitFromCond(L, ExitBr->getCondition(),
+ ExitBr->getSuccessor(0),
+ ExitBr->getSuccessor(1));
}
-/// ComputeBackedgeTakenCountFromExitCond - Compute the number of times the
+/// ComputeExitLimitFromCond - Compute the number of times the
/// backedge of the specified loop will execute if its exit condition
/// were a conditional branch of ExitCond, TBB, and FBB.
-ScalarEvolution::BackedgeTakenInfo
-ScalarEvolution::ComputeBackedgeTakenCountFromExitCond(const Loop *L,
- Value *ExitCond,
- BasicBlock *TBB,
- BasicBlock *FBB) {
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeExitLimitFromCond(const Loop *L,
+ Value *ExitCond,
+ BasicBlock *TBB,
+ BasicBlock *FBB) {
// Check if the controlling expression for this loop is an And or Or.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
if (BO->getOpcode() == Instruction::And) {
// Recurse on the operands of the and.
- BackedgeTakenInfo BTI0 =
- ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(0), TBB, FBB);
- BackedgeTakenInfo BTI1 =
- ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(1), TBB, FBB);
+ ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB);
+ ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (L->contains(TBB)) {
// Both conditions must be true for the loop to continue executing.
// Choose the less conservative count.
- if (BTI0.Exact == getCouldNotCompute() ||
- BTI1.Exact == getCouldNotCompute())
+ if (EL0.Exact == getCouldNotCompute() ||
+ EL1.Exact == getCouldNotCompute())
BECount = getCouldNotCompute();
else
- BECount = getUMinFromMismatchedTypes(BTI0.Exact, BTI1.Exact);
- if (BTI0.Max == getCouldNotCompute())
- MaxBECount = BTI1.Max;
- else if (BTI1.Max == getCouldNotCompute())
- MaxBECount = BTI0.Max;
+ BECount = getUMinFromMismatchedTypes(EL0.Exact, EL1.Exact);
+ if (EL0.Max == getCouldNotCompute())
+ MaxBECount = EL1.Max;
+ else if (EL1.Max == getCouldNotCompute())
+ MaxBECount = EL0.Max;
else
- MaxBECount = getUMinFromMismatchedTypes(BTI0.Max, BTI1.Max);
+ MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max);
} else {
// Both conditions must be true at the same time for the loop to exit.
// For now, be conservative.
assert(L->contains(FBB) && "Loop block has no successor in loop!");
- if (BTI0.Max == BTI1.Max)
- MaxBECount = BTI0.Max;
- if (BTI0.Exact == BTI1.Exact)
- BECount = BTI0.Exact;
+ if (EL0.Max == EL1.Max)
+ MaxBECount = EL0.Max;
+ if (EL0.Exact == EL1.Exact)
+ BECount = EL0.Exact;
}
- return BackedgeTakenInfo(BECount, MaxBECount);
+ return ExitLimit(BECount, MaxBECount);
}
if (BO->getOpcode() == Instruction::Or) {
// Recurse on the operands of the or.
- BackedgeTakenInfo BTI0 =
- ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(0), TBB, FBB);
- BackedgeTakenInfo BTI1 =
- ComputeBackedgeTakenCountFromExitCond(L, BO->getOperand(1), TBB, FBB);
+ ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB);
+ ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
if (L->contains(FBB)) {
// Both conditions must be false for the loop to continue executing.
// Choose the less conservative count.
- if (BTI0.Exact == getCouldNotCompute() ||
- BTI1.Exact == getCouldNotCompute())
+ if (EL0.Exact == getCouldNotCompute() ||
+ EL1.Exact == getCouldNotCompute())
BECount = getCouldNotCompute();
else
- BECount = getUMinFromMismatchedTypes(BTI0.Exact, BTI1.Exact);
- if (BTI0.Max == getCouldNotCompute())
- MaxBECount = BTI1.Max;
- else if (BTI1.Max == getCouldNotCompute())
- MaxBECount = BTI0.Max;
+ BECount = getUMinFromMismatchedTypes(EL0.Exact, EL1.Exact);
+ if (EL0.Max == getCouldNotCompute())
+ MaxBECount = EL1.Max;
+ else if (EL1.Max == getCouldNotCompute())
+ MaxBECount = EL0.Max;
else
- MaxBECount = getUMinFromMismatchedTypes(BTI0.Max, BTI1.Max);
+ MaxBECount = getUMinFromMismatchedTypes(EL0.Max, EL1.Max);
} else {
// Both conditions must be false at the same time for the loop to exit.
// For now, be conservative.
assert(L->contains(TBB) && "Loop block has no successor in loop!");
- if (BTI0.Max == BTI1.Max)
- MaxBECount = BTI0.Max;
- if (BTI0.Exact == BTI1.Exact)
- BECount = BTI0.Exact;
+ if (EL0.Max == EL1.Max)
+ MaxBECount = EL0.Max;
+ if (EL0.Exact == EL1.Exact)
+ BECount = EL0.Exact;
}
- return BackedgeTakenInfo(BECount, MaxBECount);
+ return ExitLimit(BECount, MaxBECount);
}
}
// With an icmp, it may be feasible to compute an exact backedge-taken count.
// Proceed to the next level to examine the icmp.
if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond))
- return ComputeBackedgeTakenCountFromExitCondICmp(L, ExitCondICmp, TBB, FBB);
+ return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB);
// Check for a constant condition. These are normally stripped out by
// SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to
@@ -4185,17 +4421,17 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCond(const Loop *L,
}
// If it's not an integer or pointer comparison then compute it the hard way.
- return ComputeBackedgeTakenCountExhaustively(L, ExitCond, !L->contains(TBB));
+ return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
}
-/// ComputeBackedgeTakenCountFromExitCondICmp - Compute the number of times the
+/// ComputeExitLimitFromICmp - Compute the number of times the
/// backedge of the specified loop will execute if its exit condition
/// were a conditional branch of the ICmpInst ExitCond, TBB, and FBB.
-ScalarEvolution::BackedgeTakenInfo
-ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
- ICmpInst *ExitCond,
- BasicBlock *TBB,
- BasicBlock *FBB) {
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
+ ICmpInst *ExitCond,
+ BasicBlock *TBB,
+ BasicBlock *FBB) {
// If the condition was exit on true, convert the condition to exit on false
ICmpInst::Predicate Cond;
@@ -4207,8 +4443,8 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
// Handle common loops like: for (X = "string"; *X; ++X)
if (LoadInst *LI = dyn_cast<LoadInst>(ExitCond->getOperand(0)))
if (Constant *RHS = dyn_cast<Constant>(ExitCond->getOperand(1))) {
- BackedgeTakenInfo ItCnt =
- ComputeLoadConstantCompareBackedgeTakenCount(LI, RHS, L, Cond);
+ ExitLimit ItCnt =
+ ComputeLoadConstantCompareExitLimit(LI, RHS, L, Cond);
if (ItCnt.hasAnyInfo())
return ItCnt;
}
@@ -4247,36 +4483,36 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
switch (Cond) {
case ICmpInst::ICMP_NE: { // while (X != Y)
// Convert to: while (X-Y != 0)
- BackedgeTakenInfo BTI = HowFarToZero(getMinusSCEV(LHS, RHS), L);
- if (BTI.hasAnyInfo()) return BTI;
+ ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L);
+ if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_EQ: { // while (X == Y)
// Convert to: while (X-Y == 0)
- BackedgeTakenInfo BTI = HowFarToNonZero(getMinusSCEV(LHS, RHS), L);
- if (BTI.hasAnyInfo()) return BTI;
+ ExitLimit EL = HowFarToNonZero(getMinusSCEV(LHS, RHS), L);
+ if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_SLT: {
- BackedgeTakenInfo BTI = HowManyLessThans(LHS, RHS, L, true);
- if (BTI.hasAnyInfo()) return BTI;
+ ExitLimit EL = HowManyLessThans(LHS, RHS, L, true);
+ if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_SGT: {
- BackedgeTakenInfo BTI = HowManyLessThans(getNotSCEV(LHS),
+ ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
getNotSCEV(RHS), L, true);
- if (BTI.hasAnyInfo()) return BTI;
+ if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_ULT: {
- BackedgeTakenInfo BTI = HowManyLessThans(LHS, RHS, L, false);
- if (BTI.hasAnyInfo()) return BTI;
+ ExitLimit EL = HowManyLessThans(LHS, RHS, L, false);
+ if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_UGT: {
- BackedgeTakenInfo BTI = HowManyLessThans(getNotSCEV(LHS),
+ ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
getNotSCEV(RHS), L, false);
- if (BTI.hasAnyInfo()) return BTI;
+ if (EL.hasAnyInfo()) return EL;
break;
}
default:
@@ -4290,8 +4526,7 @@ ScalarEvolution::ComputeBackedgeTakenCountFromExitCondICmp(const Loop *L,
#endif
break;
}
- return
- ComputeBackedgeTakenCountExhaustively(L, ExitCond, !L->contains(TBB));
+ return ComputeExitCountExhaustively(L, ExitCond, !L->contains(TBB));
}
static ConstantInt *
@@ -4321,10 +4556,10 @@ GetAddressedElementFromGlobal(GlobalVariable *GV,
if (Idx >= CA->getNumOperands()) return 0; // Bogus program
Init = cast<Constant>(CA->getOperand(Idx));
} else if (isa<ConstantAggregateZero>(Init)) {
- if (const StructType *STy = dyn_cast<StructType>(Init->getType())) {
+ if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
assert(Idx < STy->getNumElements() && "Bad struct index!");
Init = Constant::getNullValue(STy->getElementType(Idx));
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Init->getType())) {
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType())) {
if (Idx >= ATy->getNumElements()) return 0; // Bogus program
Init = Constant::getNullValue(ATy->getElementType());
} else {
@@ -4338,15 +4573,16 @@ GetAddressedElementFromGlobal(GlobalVariable *GV,
return Init;
}
-/// ComputeLoadConstantCompareBackedgeTakenCount - Given an exit condition of
+/// ComputeLoadConstantCompareExitLimit - Given an exit condition of
/// 'icmp op load X, cst', try to see if we can compute the backedge
/// execution count.
-ScalarEvolution::BackedgeTakenInfo
-ScalarEvolution::ComputeLoadConstantCompareBackedgeTakenCount(
- LoadInst *LI,
- Constant *RHS,
- const Loop *L,
- ICmpInst::Predicate predicate) {
+ScalarEvolution::ExitLimit
+ScalarEvolution::ComputeLoadConstantCompareExitLimit(
+ LoadInst *LI,
+ Constant *RHS,
+ const Loop *L,
+ ICmpInst::Predicate predicate) {
+
if (LI->isVolatile()) return getCouldNotCompute();
// Check to see if the loaded pointer is a getelementptr of a global.
@@ -4431,69 +4667,117 @@ static bool CanConstantFold(const Instruction *I) {
return false;
}
-/// getConstantEvolvingPHI - Given an LLVM value and a loop, return a PHI node
-/// in the loop that V is derived from. We allow arbitrary operations along the
-/// way, but the operands of an operation must either be constants or a value
-/// derived from a constant PHI. If this expression does not fit with these
-/// constraints, return null.
-static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
- // If this is not an instruction, or if this is an instruction outside of the
- // loop, it can't be derived from a loop PHI.
- Instruction *I = dyn_cast<Instruction>(V);
- if (I == 0 || !L->contains(I)) return 0;
+/// Determine whether this instruction can constant evolve within this loop
+/// assuming its operands can all constant evolve.
+static bool canConstantEvolve(Instruction *I, const Loop *L) {
+ // An instruction outside of the loop can't be derived from a loop PHI.
+ if (!L->contains(I)) return false;
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ if (isa<PHINode>(I)) {
if (L->getHeader() == I->getParent())
- return PN;
+ return true;
else
// We don't currently keep track of the control flow needed to evaluate
// PHIs, so we cannot handle PHIs inside of loops.
- return 0;
+ return false;
}
// If we won't be able to constant fold this expression even if the operands
- // are constants, return early.
- if (!CanConstantFold(I)) return 0;
+ // are constants, bail early.
+ return CanConstantFold(I);
+}
+
+/// getConstantEvolvingPHIOperands - Implement getConstantEvolvingPHI by
+/// recursing through each instruction operand until reaching a loop header phi.
+static PHINode *
+getConstantEvolvingPHIOperands(Instruction *UseInst, const Loop *L,
+ DenseMap<Instruction *, PHINode *> &PHIMap) {
// Otherwise, we can evaluate this instruction if all of its operands are
// constant or derived from a PHI node themselves.
PHINode *PHI = 0;
- for (unsigned Op = 0, e = I->getNumOperands(); Op != e; ++Op)
- if (!isa<Constant>(I->getOperand(Op))) {
- PHINode *P = getConstantEvolvingPHI(I->getOperand(Op), L);
- if (P == 0) return 0; // Not evolving from PHI
- if (PHI == 0)
- PHI = P;
- else if (PHI != P)
- return 0; // Evolving from multiple different PHIs.
+ for (Instruction::op_iterator OpI = UseInst->op_begin(),
+ OpE = UseInst->op_end(); OpI != OpE; ++OpI) {
+
+ if (isa<Constant>(*OpI)) continue;
+
+ Instruction *OpInst = dyn_cast<Instruction>(*OpI);
+ if (!OpInst || !canConstantEvolve(OpInst, L)) return 0;
+
+ PHINode *P = dyn_cast<PHINode>(OpInst);
+ if (!P)
+ // If this operand is already visited, reuse the prior result.
+ // We may have P != PHI if this is the deepest point at which the
+ // inconsistent paths meet.
+ P = PHIMap.lookup(OpInst);
+ if (!P) {
+ // Recurse and memoize the results, whether a phi is found or not.
+ // This recursive call invalidates pointers into PHIMap.
+ P = getConstantEvolvingPHIOperands(OpInst, L, PHIMap);
+ PHIMap[OpInst] = P;
}
-
+ if (P == 0) return 0; // Not evolving from PHI
+ if (PHI && PHI != P) return 0; // Evolving from multiple different PHIs.
+ PHI = P;
+ }
  // This is an expression evolving from a constant PHI!
return PHI;
}
+/// getConstantEvolvingPHI - Given an LLVM value and a loop, return a PHI node
+/// in the loop that V is derived from. We allow arbitrary operations along the
+/// way, but the operands of an operation must be either constants or values
+/// derived from a constant PHI. If this expression does not fit with these
+/// constraints, return null.
+static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I == 0 || !canConstantEvolve(I, L)) return 0;
+
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ return PN;
+ }
+
+ // Record non-constant instructions contained by the loop.
+ DenseMap<Instruction *, PHINode *> PHIMap;
+ return getConstantEvolvingPHIOperands(I, L, PHIMap);
+}
+
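The memoized walk above, reduced to a toy expression DAG (plain C++11, separate from the patch; Node and findEvolvingPhi are illustrative names): each visited node is mapped to the single "phi" leaf it evolves from, or null, so shared subexpressions are visited only once.

#include <cstdio>
#include <map>
#include <vector>

struct Node {
  bool IsPhi = false, IsConst = false;
  std::vector<Node *> Ops;
};

static Node *findEvolvingPhi(Node *N, std::map<Node *, Node *> &Memo) {
  if (N->IsPhi) return N;
  Node *Phi = nullptr;
  for (Node *Op : N->Ops) {
    if (Op->IsConst) continue;
    Node *P;
    std::map<Node *, Node *>::iterator It = Memo.find(Op);
    if (It != Memo.end()) {
      P = It->second;             // reuse the prior result
    } else {
      P = findEvolvingPhi(Op, Memo);
      Memo[Op] = P;               // memoize even a null result
    }
    if (!P) return nullptr;       // operand not evolving from a phi
    if (Phi && Phi != P) return nullptr; // two different phis: give up
    Phi = P;
  }
  return Phi;
}

int main() {
  Node Phi; Phi.IsPhi = true;
  Node C;   C.IsConst = true;
  Node Mul; Mul.Ops = {&Phi, &C};   // phi * c
  Node Add; Add.Ops = {&Mul, &Mul}; // shares Mul: second visit hits the memo
  std::map<Node *, Node *> Memo;
  printf("%s\n", findEvolvingPhi(&Add, Memo) == &Phi
                     ? "evolves from a single phi" : "no single phi");
  return 0;
}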
/// EvaluateExpression - Given an expression that passes the
/// getConstantEvolvingPHI predicate, evaluate its value assuming the PHI
/// nodes in the loop have the constant values given in Vals. If we can't
/// fold this expression for some
/// reason, return null.
-static Constant *EvaluateExpression(Value *V, Constant *PHIVal,
+static Constant *EvaluateExpression(Value *V, const Loop *L,
+ DenseMap<Instruction *, Constant *> &Vals,
const TargetData *TD) {
- if (isa<PHINode>(V)) return PHIVal;
+ // Convenient constant check, but redundant for recursive calls.
if (Constant *C = dyn_cast<Constant>(V)) return C;
+
Instruction *I = cast<Instruction>(V);
+ if (Constant *C = Vals.lookup(I)) return C;
+
+ assert(!isa<PHINode>(I) && "loop header phis should be mapped to constant");
+ assert(canConstantEvolve(I, L) && "cannot evaluate expression in this loop");
+ (void)L;
std::vector<Constant*> Operands(I->getNumOperands());
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Operands[i] = EvaluateExpression(I->getOperand(i), PHIVal, TD);
- if (Operands[i] == 0) return 0;
+ Instruction *Operand = dyn_cast<Instruction>(I->getOperand(i));
+ if (!Operand) {
+ Operands[i] = dyn_cast<Constant>(I->getOperand(i));
+ if (!Operands[i]) return 0;
+ continue;
+ }
+ Constant *C = EvaluateExpression(Operand, L, Vals, TD);
+ Vals[Operand] = C;
+ if (!C) return 0;
+ Operands[i] = C;
}
if (const CmpInst *CI = dyn_cast<CmpInst>(I))
return ConstantFoldCompareInstOperands(CI->getPredicate(), Operands[0],
Operands[1], TD);
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
- &Operands[0], Operands.size(), TD);
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Operands, TD);
}
/// getConstantEvolutionLoopExitValue - If we know that the specified Phi is
@@ -4514,6 +4798,9 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
Constant *&RetVal = ConstantEvolutionLoopExitValue[PN];
+ // FIXME: Nick's fix for PR11034 will seed constants for multiple header phis.
+ DenseMap<Instruction *, Constant *> CurrentIterVals;
+
// Since the loop is canonicalized, the PHI node must have two entries. One
// entry must be a constant (coming in from outside of the loop), and the
// second must be derived from the same PHI.
@@ -4522,6 +4809,7 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
dyn_cast<Constant>(PN->getIncomingValue(!SecondIsBackedge));
if (StartCST == 0)
return RetVal = 0; // Must be a constant.
+ CurrentIterVals[PN] = StartCST;
Value *BEValue = PN->getIncomingValue(SecondIsBackedge);
if (getConstantEvolvingPHI(BEValue, L) != PN &&
@@ -4534,29 +4822,31 @@ ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN,
unsigned NumIterations = BEs.getZExtValue(); // must be in range
unsigned IterationNum = 0;
- for (Constant *PHIVal = StartCST; ; ++IterationNum) {
+ for (; ; ++IterationNum) {
if (IterationNum == NumIterations)
- return RetVal = PHIVal; // Got exit value!
+ return RetVal = CurrentIterVals[PN]; // Got exit value!
// Compute the value of the PHI node for the next iteration.
- Constant *NextPHI = EvaluateExpression(BEValue, PHIVal, TD);
- if (NextPHI == PHIVal)
+ // EvaluateExpression adds non-phi values to the CurrentIterVals map.
+ Constant *NextPHI = EvaluateExpression(BEValue, L, CurrentIterVals, TD);
+ if (NextPHI == CurrentIterVals[PN])
return RetVal = NextPHI; // Stopped evolving!
if (NextPHI == 0)
return 0; // Couldn't evaluate!
- PHIVal = NextPHI;
+ DenseMap<Instruction *, Constant *> NextIterVals;
+ NextIterVals[PN] = NextPHI;
+ CurrentIterVals.swap(NextIterVals);
}
}
-/// ComputeBackedgeTakenCountExhaustively - If the loop is known to execute a
+/// ComputeExitCountExhaustively - If the loop is known to execute a
/// constant number of times (the condition evolves only from constants),
/// try to evaluate a few iterations of the loop until the exit condition
/// gets a value of ExitWhen (true or false). If we cannot
/// evaluate the trip count of the loop, return getCouldNotCompute().
-const SCEV *
-ScalarEvolution::ComputeBackedgeTakenCountExhaustively(const Loop *L,
- Value *Cond,
- bool ExitWhen) {
+const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L,
+                                                          Value *Cond,
+                                                          bool ExitWhen) {
PHINode *PN = getConstantEvolvingPHI(Cond, L);
if (PN == 0) return getCouldNotCompute();
@@ -4583,8 +4873,10 @@ ScalarEvolution::ComputeBackedgeTakenCountExhaustively(const Loop *L,
unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis.
for (Constant *PHIVal = StartCST;
IterationNum != MaxIterations; ++IterationNum) {
+ DenseMap<Instruction *, Constant *> PHIValMap;
+ PHIValMap[PN] = PHIVal;
ConstantInt *CondVal =
- dyn_cast_or_null<ConstantInt>(EvaluateExpression(Cond, PHIVal, TD));
+ dyn_cast_or_null<ConstantInt>(EvaluateExpression(Cond, L, PHIValMap, TD));
// Couldn't symbolically evaluate.
if (!CondVal) return getCouldNotCompute();
@@ -4595,7 +4887,7 @@ ScalarEvolution::ComputeBackedgeTakenCountExhaustively(const Loop *L,
}
// Compute the value of the PHI node for the next iteration.
- Constant *NextPHI = EvaluateExpression(BEValue, PHIVal, TD);
+ Constant *NextPHI = EvaluateExpression(BEValue, L, PHIValMap, TD);
if (NextPHI == 0 || NextPHI == PHIVal)
      return getCouldNotCompute(); // Couldn't evaluate or not making progress...
PHIVal = NextPHI;
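The brute-force loop above, reduced to a concrete case (plain C++, separate from the patch): for the recurrence phi -> phi + 3 with exit test phi == 12 and ExitWhen = true, the condition first evaluates true at iteration number 4, which is what the routine reports.

#include <cstdio>

int main() {
  const unsigned MaxIterations = 100; // budget, like MaxBruteForceIterations
  unsigned PHIVal = 0;                // constant start value of the header phi
  for (unsigned IterationNum = 0; IterationNum != MaxIterations;
       ++IterationNum) {
    bool CondVal = (PHIVal == 12);    // evaluate the exit condition
    if (CondVal) {                    // CondVal == ExitWhen
      printf("count = %u\n", IterationNum); // 4
      return 0;
    }
    PHIVal += 3;                      // evaluate the backedge value
  }
  printf("could not compute\n");
  return 0;
}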
@@ -4703,7 +4995,7 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
Operands[0], Operands[1], TD);
else
C = ConstantFoldInstOperands(I->getOpcode(), I->getType(),
- &Operands[0], Operands.size(), TD);
+ Operands, TD);
if (!C) return V;
return getSCEV(C);
}
@@ -4925,7 +5217,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
// Compute the two solutions for the quadratic formula.
// The divisions must be performed as signed divisions.
APInt NegB(-B);
- APInt TwoA( A << 1 );
+ APInt TwoA(A << 1);
if (TwoA.isMinValue()) {
const SCEV *CNC = SE.getCouldNotCompute();
return std::make_pair(CNC, CNC);
@@ -4940,7 +5232,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
return std::make_pair(SE.getConstant(Solution1),
SE.getConstant(Solution2));
- } // end APIntOps namespace
+ } // end APIntOps namespace
}
/// HowFarToZero - Return the number of times a backedge comparing the specified
@@ -4950,7 +5242,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
/// now expressed as a single expression, V = x-y. So the exit test is
/// effectively V != 0. We know and take advantage of the fact that this
/// expression is only used in a compare-with-zero context.
-ScalarEvolution::BackedgeTakenInfo
+ScalarEvolution::ExitLimit
ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
// If the value is a constant
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
@@ -5034,8 +5326,19 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
// Handle unitary steps, which cannot wraparound.
// 1*N = -Start; -1*N = Start (mod 2^BW), so:
// N = Distance (as unsigned)
- if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue())
- return Distance;
+ if (StepC->getValue()->equalsInt(1) || StepC->getValue()->isAllOnesValue()) {
+ ConstantRange CR = getUnsignedRange(Start);
+ const SCEV *MaxBECount;
+ if (!CountDown && CR.getUnsignedMin().isMinValue())
+ // When counting up, the worst starting value is 1, not 0.
+ MaxBECount = CR.getUnsignedMax().isMinValue()
+ ? getConstant(APInt::getMinValue(CR.getBitWidth()))
+ : getConstant(APInt::getMaxValue(CR.getBitWidth()));
+ else
+ MaxBECount = getConstant(CountDown ? CR.getUnsignedMax()
+ : -CR.getUnsignedMin());
+ return ExitLimit(Distance, MaxBECount);
+ }
// If the recurrence is known not to wraparound, unsigned divide computes the
// back edge count. We know that the value will either become zero (and thus
@@ -5062,7 +5365,7 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
/// HowFarToNonZero - Return the number of times a backedge checking the
/// specified value for nonzero will execute. If not computable, return
/// CouldNotCompute
-ScalarEvolution::BackedgeTakenInfo
+ScalarEvolution::ExitLimit
ScalarEvolution::HowFarToNonZero(const SCEV *V, const Loop *L) {
// Loops that look like: while (X == 0) are very strange indeed. We don't
// handle them yet except for the trivial case. This could be expanded in the
@@ -5741,7 +6044,7 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
assert(!isKnownNegative(Step) &&
"This code doesn't handle negative strides yet!");
- const Type *Ty = Start->getType();
+ Type *Ty = Start->getType();
// When Start == End, we have an exact BECount == 0. Short-circuit this case
// here because SCEV may not be able to determine that the unsigned division
@@ -5760,7 +6063,7 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
if (!NoWrap) {
// Check Add for unsigned overflow.
// TODO: More sophisticated things could be done here.
- const Type *WideTy = IntegerType::get(getContext(),
+ Type *WideTy = IntegerType::get(getContext(),
getTypeSizeInBits(Ty) + 1);
const SCEV *EDiff = getZeroExtendExpr(Diff, WideTy);
const SCEV *ERoundUp = getZeroExtendExpr(RoundUp, WideTy);
@@ -5775,7 +6078,7 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
/// HowManyLessThans - Return the number of times a backedge containing the
/// specified less-than comparison will execute. If not computable, return
/// CouldNotCompute.
-ScalarEvolution::BackedgeTakenInfo
+ScalarEvolution::ExitLimit
ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
const Loop *L, bool isSigned) {
// Only handle: "ADDREC < LoopInvariant".
@@ -5882,7 +6185,7 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
if (isa<SCEVCouldNotCompute>(MaxBECount))
MaxBECount = BECount;
- return BackedgeTakenInfo(BECount, MaxBECount);
+ return ExitLimit(BECount, MaxBECount);
}
return getCouldNotCompute();
@@ -6090,6 +6393,15 @@ void ScalarEvolution::releaseMemory() {
FirstUnknown = 0;
ValueExprMap.clear();
+
+ // Free any extra memory created for ExitNotTakenInfo in the unlikely event
+ // that a loop had multiple computable exits.
+ for (DenseMap<const Loop*, BackedgeTakenInfo>::iterator I =
+ BackedgeTakenCounts.begin(), E = BackedgeTakenCounts.end();
+ I != E; ++I) {
+ I->second.clear();
+ }
+
BackedgeTakenCounts.clear();
ConstantEvolutionLoopExitValue.clear();
ValuesAtScopes.clear();
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index befe6d2..47f0f32 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -17,6 +17,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetData.h"
#include "llvm/ADT/STLExtras.h"
@@ -26,7 +27,7 @@ using namespace llvm;
/// reusing an existing cast if a suitable one exists, moving an existing
/// cast if a suitable one exists but isn't in the right place, or
/// creating a new one.
-Value *SCEVExpander::ReuseOrCreateCast(Value *V, const Type *Ty,
+Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
Instruction::CastOps Op,
BasicBlock::iterator IP) {
// Check to see if there is already a cast!
@@ -62,7 +63,7 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, const Type *Ty,
/// InsertNoopCastOfTo - Insert a cast of V to the specified type,
/// which must be possible with a noop cast, doing what we can to share
/// the casts.
-Value *SCEVExpander::InsertNoopCastOfTo(Value *V, const Type *Ty) {
+Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false);
assert((Op == Instruction::BitCast ||
Op == Instruction::PtrToInt ||
@@ -103,7 +104,8 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, const Type *Ty) {
while ((isa<BitCastInst>(IP) &&
isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
cast<BitCastInst>(IP)->getOperand(0) != A) ||
- isa<DbgInfoIntrinsic>(IP))
+ isa<DbgInfoIntrinsic>(IP) ||
+ isa<LandingPadInst>(IP))
++IP;
return ReuseOrCreateCast(A, Ty, Op, IP);
}
@@ -113,7 +115,9 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, const Type *Ty) {
BasicBlock::iterator IP = I; ++IP;
if (InvokeInst *II = dyn_cast<InvokeInst>(I))
IP = II->getNormalDest()->begin();
- while (isa<PHINode>(IP) || isa<DbgInfoIntrinsic>(IP)) ++IP;
+ while (isa<PHINode>(IP) || isa<DbgInfoIntrinsic>(IP) ||
+ isa<LandingPadInst>(IP))
+ ++IP;
return ReuseOrCreateCast(I, Ty, Op, IP);
}
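Both cast-placement loops above now also skip landingpad instructions: a landingpad is required to be the first non-PHI instruction of its block, so an expanded cast must be inserted after it, never in front of it.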
@@ -160,7 +164,7 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
}
// If we haven't found this binop, insert it.
- Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS, "tmp"));
+ Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
BO->setDebugLoc(SaveInsertPt->getDebugLoc());
rememberInstruction(BO);
@@ -277,7 +281,7 @@ static bool FactorOutConstant(const SCEV *&S,
/// the list.
///
static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops,
- const Type *Ty,
+ Type *Ty,
ScalarEvolution &SE) {
unsigned NumAddRecs = 0;
for (unsigned i = Ops.size(); i > 0 && isa<SCEVAddRecExpr>(Ops[i-1]); --i)
@@ -306,7 +310,7 @@ static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops,
/// into GEP indices.
///
static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
- const Type *Ty,
+ Type *Ty,
ScalarEvolution &SE) {
// Find the addrecs.
SmallVector<const SCEV *, 8> AddRecs;
@@ -365,10 +369,10 @@ static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
///
Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
const SCEV *const *op_end,
- const PointerType *PTy,
- const Type *Ty,
+ PointerType *PTy,
+ Type *Ty,
Value *V) {
- const Type *ElTy = PTy->getElementType();
+ Type *ElTy = PTy->getElementType();
SmallVector<Value *, 4> GepIndices;
SmallVector<const SCEV *, 8> Ops(op_begin, op_end);
bool AnyNonZeroIndices = false;
@@ -423,7 +427,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
GepIndices.push_back(Scaled);
// Collect struct field index operands.
- while (const StructType *STy = dyn_cast<StructType>(ElTy)) {
+ while (StructType *STy = dyn_cast<StructType>(ElTy)) {
bool FoundFieldNo = false;
// An empty struct has no fields.
if (STy->getNumElements() == 0) break;
@@ -451,7 +455,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
// appropriate struct type.
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Ops[i])) {
- const Type *CTy;
+ Type *CTy;
Constant *FieldNo;
if (U->isOffsetOf(CTy, FieldNo) && CTy == STy) {
GepIndices.push_back(FieldNo);
@@ -474,7 +478,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
}
}
- if (const ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
+ if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
ElTy = ATy->getElementType();
else
break;
@@ -494,7 +498,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
// Fold a GEP with constant operands.
if (Constant *CLHS = dyn_cast<Constant>(V))
if (Constant *CRHS = dyn_cast<Constant>(Idx))
- return ConstantExpr::getGetElementPtr(CLHS, &CRHS, 1);
+ return ConstantExpr::getGetElementPtr(CLHS, CRHS);
// Do a quick scan to see if we have this GEP nearby. If so, reuse it.
unsigned ScanLimit = 6;
@@ -572,8 +576,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
if (V->getType() != PTy)
Casted = InsertNoopCastOfTo(Casted, PTy);
Value *GEP = Builder.CreateGEP(Casted,
- GepIndices.begin(),
- GepIndices.end(),
+ GepIndices,
"scevgep");
Ops.push_back(SE.getUnknown(GEP));
rememberInstruction(GEP);
@@ -691,7 +694,7 @@ public:
}
Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
- const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
// Collect all the add operands in a loop, along with their associated loops.
// Iterate in reverse so that constants are emitted last, all else equal, and
@@ -717,7 +720,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
// This is the first operand. Just expand it.
Sum = expand(Op);
++I;
- } else if (const PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
+ } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
// The running sum expression is a pointer. Try to form a getelementptr
// at this level with that as the base.
SmallVector<const SCEV *, 4> NewOps;
@@ -731,7 +734,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
NewOps.push_back(X);
}
Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
- } else if (const PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
+ } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
// The running sum is an integer, and there's a pointer at this level.
// Try to form a getelementptr. If the running sum is instructions,
// use a SCEVUnknown to avoid re-analyzing them.
@@ -762,7 +765,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
}
Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
- const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
// Collect all the mul operands in a loop, along with their associated loops.
// Iterate in reverse so that constants are emitted last, all else equal.
@@ -804,7 +807,7 @@ Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
}
Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
- const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *LHS = expandCodeFor(S->getLHS(), Ty);
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) {
@@ -841,81 +844,141 @@ static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
}
}
+/// Determine if this is a well-behaved chain of instructions leading back to
+/// the PHI. If so, it may be reused by expanded expressions.
+bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
+ const Loop *L) {
+ if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) ||
+ (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV)))
+ return false;
+ // If any of the operands don't dominate the insert position, bail.
+ // Addrec operands are always loop-invariant, so this can only happen
+ // if there are instructions which haven't been hoisted.
+ if (L == IVIncInsertLoop) {
+ for (User::op_iterator OI = IncV->op_begin()+1,
+ OE = IncV->op_end(); OI != OE; ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(OI))
+ if (!SE.DT->dominates(OInst, IVIncInsertPos))
+ return false;
+ }
+ // Advance to the next instruction.
+ IncV = dyn_cast<Instruction>(IncV->getOperand(0));
+ if (!IncV)
+ return false;
+
+ if (IncV->mayHaveSideEffects())
+ return false;
+
+ if (IncV != PN)
+ return true;
+
+ return isNormalAddRecExprPHI(PN, IncV, L);
+}
+
+/// Determine if this cyclic phi is in a form that would have been generated by
+/// LSR. We don't care if the phi was actually expanded in this pass, as long
+/// as it is in a low-cost form, for example, no implied multiplication. This
+/// should match any patterns generated by getAddRecExprPHILiterally and
+/// expandAddToGEP.
+bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV,
+ const Loop *L) {
+ switch (IncV->getOpcode()) {
+ // Check for a simple Add/Sub or GEP of a loop invariant step.
+ case Instruction::Add:
+ case Instruction::Sub:
+ return IncV->getOperand(0) == PN
+ && L->isLoopInvariant(IncV->getOperand(1));
+ case Instruction::BitCast:
+ IncV = dyn_cast<GetElementPtrInst>(IncV->getOperand(0));
+ if (!IncV)
+ return false;
+ // fall-thru to GEP handling
+ case Instruction::GetElementPtr: {
+ // This must be a pointer addition of constants (pretty) or some number of
+ // address-size elements (ugly).
+ for (Instruction::op_iterator I = IncV->op_begin()+1, E = IncV->op_end();
+ I != E; ++I) {
+ if (isa<Constant>(*I))
+ continue;
+ // ugly geps have 2 operands.
+ // i1* is used by the expander to represent an address-size element.
+ if (IncV->getNumOperands() != 2)
+ return false;
+ unsigned AS = cast<PointerType>(IncV->getType())->getAddressSpace();
+ if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS)
+ && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS))
+ return false;
+ // Ensure the operands dominate the insertion point. I don't know of a
+ // case when this would not be true, so this is somewhat untested.
+ if (L == IVIncInsertLoop) {
+ for (User::op_iterator OI = IncV->op_begin()+1,
+ OE = IncV->op_end(); OI != OE; ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(OI))
+ if (!SE.DT->dominates(OInst, IVIncInsertPos))
+ return false;
+ }
+ break;
+ }
+ IncV = dyn_cast<Instruction>(IncV->getOperand(0));
+ if (IncV && IncV->getOpcode() == Instruction::BitCast)
+ IncV = dyn_cast<Instruction>(IncV->getOperand(0));
+ return IncV == PN;
+ }
+ default:
+ return false;
+ }
+}
+
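For illustration only (assumed LLVM 3.0 textual IR, not taken from this patch): a "pretty" increment steps by constants, e.g. %iv.next = getelementptr %struct.S* %iv, i32 1, while an "ugly" one has exactly two operands and steps an i8* (or i1*) pointer, which the expander uses to model an address-size element, by a loop-invariant amount, e.g. %iv.next = getelementptr i8* %iv, i64 %step.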
/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand
/// the base addrec, which is the addrec without any non-loop-dominating
/// values, and return the PHI.
PHINode *
SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
const Loop *L,
- const Type *ExpandTy,
- const Type *IntTy) {
+ Type *ExpandTy,
+ Type *IntTy) {
assert((!IVIncInsertLoop||IVIncInsertPos) && "Uninitialized insert position");
// Reuse a previously-inserted PHI, if present.
- for (BasicBlock::iterator I = L->getHeader()->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I)
- if (SE.isSCEVable(PN->getType()) &&
- (SE.getEffectiveSCEVType(PN->getType()) ==
- SE.getEffectiveSCEVType(Normalized->getType())) &&
- SE.getSCEV(PN) == Normalized)
- if (BasicBlock *LatchBlock = L->getLoopLatch()) {
- Instruction *IncV =
- cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock));
-
- // Determine if this is a well-behaved chain of instructions leading
- // back to the PHI. It probably will be, if we're scanning an inner
- // loop already visited by LSR for example, but it wouldn't have
- // to be.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ if (LatchBlock) {
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ if (!SE.isSCEVable(PN->getType()) ||
+ (SE.getEffectiveSCEVType(PN->getType()) !=
+ SE.getEffectiveSCEVType(Normalized->getType())) ||
+ SE.getSCEV(PN) != Normalized)
+ continue;
+
+ Instruction *IncV =
+ cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock));
+
+ if (LSRMode) {
+ if (!isExpandedAddRecExprPHI(PN, IncV, L))
+ continue;
+ }
+ else {
+ if (!isNormalAddRecExprPHI(PN, IncV, L))
+ continue;
+ }
+ // Ok, the add recurrence looks usable.
+ // Remember this PHI, even in post-inc mode.
+ InsertedValues.insert(PN);
+ // Remember the increment.
+ rememberInstruction(IncV);
+ if (L == IVIncInsertLoop)
do {
- if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) ||
- (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV))) {
- IncV = 0;
+ if (SE.DT->dominates(IncV, IVIncInsertPos))
break;
- }
- // If any of the operands don't dominate the insert position, bail.
- // Addrec operands are always loop-invariant, so this can only happen
- // if there are instructions which haven't been hoisted.
- if (L == IVIncInsertLoop) {
- for (User::op_iterator OI = IncV->op_begin()+1,
- OE = IncV->op_end(); OI != OE; ++OI)
- if (Instruction *OInst = dyn_cast<Instruction>(OI))
- if (!SE.DT->dominates(OInst, IVIncInsertPos)) {
- IncV = 0;
- break;
- }
- }
- if (!IncV)
- break;
- // Advance to the next instruction.
- IncV = dyn_cast<Instruction>(IncV->getOperand(0));
- if (!IncV)
- break;
- if (IncV->mayHaveSideEffects()) {
- IncV = 0;
- break;
- }
+ // Make sure the increment is where we want it. But don't move it
+ // down past a potential existing post-inc user.
+ IncV->moveBefore(IVIncInsertPos);
+ IVIncInsertPos = IncV;
+ IncV = cast<Instruction>(IncV->getOperand(0));
} while (IncV != PN);
-
- if (IncV) {
- // Ok, the add recurrence looks usable.
- // Remember this PHI, even in post-inc mode.
- InsertedValues.insert(PN);
- // Remember the increment.
- IncV = cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock));
- rememberInstruction(IncV);
- if (L == IVIncInsertLoop)
- do {
- if (SE.DT->dominates(IncV, IVIncInsertPos))
- break;
- // Make sure the increment is where we want it. But don't move it
- // down past a potential existing post-inc user.
- IncV->moveBefore(IVIncInsertPos);
- IVIncInsertPos = IncV;
- IncV = cast<Instruction>(IncV->getOperand(0));
- } while (IncV != PN);
- return PN;
- }
- }
+ return PN;
+ }
+ }
// Save the original insertion point so we can restore it when we're done.
BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
@@ -969,7 +1032,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
Value *IncV;
// If the PHI is a pointer, use a GEP, otherwise use an add or sub.
if (isPointer) {
- const PointerType *GEPPtrTy = cast<PointerType>(ExpandTy);
+ PointerType *GEPPtrTy = cast<PointerType>(ExpandTy);
// If the step isn't constant, don't use an implicitly scaled GEP, because
// that would require a multiply inside the loop.
if (!isa<ConstantInt>(StepV))
@@ -978,7 +1041,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
const SCEV *const StepArray[1] = { SE.getSCEV(StepV) };
IncV = expandAddToGEP(StepArray, StepArray+1, GEPPtrTy, IntTy, PN);
if (IncV->getType() != PN->getType()) {
- IncV = Builder.CreateBitCast(IncV, PN->getType(), "tmp");
+ IncV = Builder.CreateBitCast(IncV, PN->getType());
rememberInstruction(IncV);
}
} else {
@@ -1001,8 +1064,8 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
}
Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
- const Type *STy = S->getType();
- const Type *IntTy = SE.getEffectiveSCEVType(STy);
+ Type *STy = S->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(STy);
const Loop *L = S->getLoop();
// Determine a normalized form of this expression, which is the expression
@@ -1045,7 +1108,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
// Expand the core addrec. If we need post-loop scaling, force it to
// expand to an integer type to avoid the need for additional casting.
- const Type *ExpandTy = PostLoopScale ? IntTy : STy;
+ Type *ExpandTy = PostLoopScale ? IntTy : STy;
PHINode *PN = getAddRecExprPHILiterally(Normalized, L, ExpandTy, IntTy);
// Accommodate post-inc mode, if necessary.
@@ -1057,6 +1120,14 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
BasicBlock *LatchBlock = L->getLoopLatch();
assert(LatchBlock && "PostInc mode requires a unique loop latch!");
Result = PN->getIncomingValueForBlock(LatchBlock);
+
+ // For an expansion to use the postinc form, the client must call
+ // expandCodeFor with an InsertPoint that is either outside the PostIncLoop
+ // or dominated by IVIncInsertPos.
+ assert((!isa<Instruction>(Result) ||
+ SE.DT->dominates(cast<Instruction>(Result),
+ Builder.GetInsertPoint())) &&
+ "postinc expansion does not dominate use");
}
// Re-apply any non-loop-dominating scale.
@@ -1069,7 +1140,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
// Re-apply any non-loop-dominating offset.
if (PostLoopOffset) {
- if (const PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
+ if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
const SCEV *const OffsetArray[1] = { PostLoopOffset };
Result = expandAddToGEP(OffsetArray, OffsetArray+1, PTy, IntTy, Result);
} else {
@@ -1086,7 +1157,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
if (!CanonicalMode) return expandAddRecExprLiterally(S);
- const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
const Loop *L = S->getLoop();
// First check for an existing canonical IV in a suitable type.
@@ -1110,7 +1181,8 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
BasicBlock::iterator NewInsertPt =
llvm::next(BasicBlock::iterator(cast<Instruction>(V)));
- while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt))
+ while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt) ||
+ isa<LandingPadInst>(NewInsertPt))
++NewInsertPt;
V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0,
NewInsertPt);
@@ -1132,7 +1204,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
// Dig into the expression to find the pointer base for a GEP.
ExposePointerBase(Base, RestArray[0], SE);
// If we found a pointer, expand the AddRec with a GEP.
- if (const PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
+ if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
// Make sure the Base isn't something exotic, such as a multiplied
// or divided pointer value. In those cases, the result type isn't
// actually a pointer type.
@@ -1216,35 +1288,35 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
}
Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
- const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeFor(S->getOperand(),
SE.getEffectiveSCEVType(S->getOperand()->getType()));
- Value *I = Builder.CreateTrunc(V, Ty, "tmp");
+ Value *I = Builder.CreateTrunc(V, Ty);
rememberInstruction(I);
return I;
}
Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) {
- const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeFor(S->getOperand(),
SE.getEffectiveSCEVType(S->getOperand()->getType()));
- Value *I = Builder.CreateZExt(V, Ty, "tmp");
+ Value *I = Builder.CreateZExt(V, Ty);
rememberInstruction(I);
return I;
}
Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
- const Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeFor(S->getOperand(),
SE.getEffectiveSCEVType(S->getOperand()->getType()));
- Value *I = Builder.CreateSExt(V, Ty, "tmp");
+ Value *I = Builder.CreateSExt(V, Ty);
rememberInstruction(I);
return I;
}
Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
- const Type *Ty = LHS->getType();
+ Type *Ty = LHS->getType();
for (int i = S->getNumOperands()-2; i >= 0; --i) {
// In the case of mixed integer and pointer types, do the
// rest of the comparisons as integer.
@@ -1253,7 +1325,7 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
LHS = InsertNoopCastOfTo(LHS, Ty);
}
Value *RHS = expandCodeFor(S->getOperand(i), Ty);
- Value *ICmp = Builder.CreateICmpSGT(LHS, RHS, "tmp");
+ Value *ICmp = Builder.CreateICmpSGT(LHS, RHS);
rememberInstruction(ICmp);
Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax");
rememberInstruction(Sel);
@@ -1268,7 +1340,7 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
- const Type *Ty = LHS->getType();
+ Type *Ty = LHS->getType();
for (int i = S->getNumOperands()-2; i >= 0; --i) {
// In the case of mixed integer and pointer types, do the
// rest of the comparisons as integer.
@@ -1277,7 +1349,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
LHS = InsertNoopCastOfTo(LHS, Ty);
}
Value *RHS = expandCodeFor(S->getOperand(i), Ty);
- Value *ICmp = Builder.CreateICmpUGT(LHS, RHS, "tmp");
+ Value *ICmp = Builder.CreateICmpUGT(LHS, RHS);
rememberInstruction(ICmp);
Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax");
rememberInstruction(Sel);
@@ -1290,7 +1362,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
return LHS;
}
-Value *SCEVExpander::expandCodeFor(const SCEV *SH, const Type *Ty,
+Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty,
Instruction *I) {
BasicBlock::iterator IP = I;
while (isInsertedInstruction(IP) || isa<DbgInfoIntrinsic>(IP))
@@ -1299,7 +1371,7 @@ Value *SCEVExpander::expandCodeFor(const SCEV *SH, const Type *Ty,
return expandCodeFor(SH, Ty);
}
-Value *SCEVExpander::expandCodeFor(const SCEV *SH, const Type *Ty) {
+Value *SCEVExpander::expandCodeFor(const SCEV *SH, Type *Ty) {
// Expand the code for this SCEV.
Value *V = expand(SH);
if (Ty) {
@@ -1325,7 +1397,7 @@ Value *SCEVExpander::expand(const SCEV *S) {
// after the PHIs (and after any other instructions that we've inserted
// there) so that it is guaranteed to dominate any user inside the loop.
if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L))
- InsertPt = L->getHeader()->getFirstNonPHI();
+ InsertPt = L->getHeader()->getFirstInsertionPt();
while (isInsertedInstruction(InsertPt) || isa<DbgInfoIntrinsic>(InsertPt))
InsertPt = llvm::next(BasicBlock::iterator(InsertPt));
break;
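getFirstInsertionPt is the EH-aware replacement for getFirstNonPHI here: it skips past any landingpad as well as the PHIs, returning the first position in the block where new instructions may legally be inserted.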
@@ -1346,8 +1418,12 @@ Value *SCEVExpander::expand(const SCEV *S) {
Value *V = visit(S);
// Remember the expanded value for this SCEV at this location.
- if (PostIncLoops.empty())
- InsertedExpressions[std::make_pair(S, InsertPt)] = V;
+ //
+ // This is independent of PostIncLoops. The mapped value simply materializes
+ // the expression at this insertion point. If the mapped value happened to be
+ // a postinc expansion, it could be reused by a non postinc user, but only if
+ // its insertion point was already at the head of the loop.
+ InsertedExpressions[std::make_pair(S, InsertPt)] = V;
restoreInsertPoint(SaveInsertBB, SaveInsertPt);
return V;
@@ -1384,7 +1460,7 @@ void SCEVExpander::restoreInsertPoint(BasicBlock *BB, BasicBlock::iterator I) {
/// starts at zero and steps by one on each iteration.
PHINode *
SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
- const Type *Ty) {
+ Type *Ty) {
assert(Ty->isIntegerTy() && "Can only insert integer induction variables!");
// Build a SCEV for {0,+,1}<L>.
@@ -1401,3 +1477,102 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
return V;
}
+
+/// hoistStep - Attempt to hoist an IV increment above a potential use.
+///
+/// To successfully hoist, two criteria must be met:
+/// - IncV operands dominate InsertPos and
+/// - InsertPos dominates IncV
+///
+/// Meeting the second condition means that we don't need to check all of IncV's
+/// existing uses (it's moving up in the domtree).
+///
+/// This does not yet recursively hoist the operands, although that would
+/// not be difficult.
+///
+/// This does not require a SCEVExpander instance and could be replaced by a
+/// general code-insertion helper.
+bool SCEVExpander::hoistStep(Instruction *IncV, Instruction *InsertPos,
+ const DominatorTree *DT) {
+ if (DT->dominates(IncV, InsertPos))
+ return true;
+
+ if (!DT->dominates(InsertPos->getParent(), IncV->getParent()))
+ return false;
+
+ if (IncV->mayHaveSideEffects())
+ return false;
+
+ // Attempt to hoist IncV
+ for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end();
+ OI != OE; ++OI) {
+ Instruction *OInst = dyn_cast<Instruction>(OI);
+ if (OInst && !DT->dominates(OInst, InsertPos))
+ return false;
+ }
+ IncV->moveBefore(InsertPos);
+ return true;
+}
+
+/// replaceCongruentIVs - Check for congruent phis in this loop header and
+/// replace them with their most canonical representative. Return the number of
+/// phis eliminated.
+///
+/// This does not depend on any SCEVExpander state but should be used in
+/// the same context that SCEVExpander is used.
+unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
+ SmallVectorImpl<WeakVH> &DeadInsts) {
+ unsigned NumElim = 0;
+ DenseMap<const SCEV *, PHINode *> ExprToIVMap;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *Phi = cast<PHINode>(I);
+ if (!SE.isSCEVable(Phi->getType()))
+ continue;
+
+ PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
+ if (!OrigPhiRef) {
+ OrigPhiRef = Phi;
+ continue;
+ }
+
+ // If one phi derives from the other via GEPs, types may differ.
+ // We could consider adding a bitcast here to handle it.
+ if (OrigPhiRef->getType() != Phi->getType())
+ continue;
+
+ if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+ Instruction *OrigInc =
+ cast<Instruction>(OrigPhiRef->getIncomingValueForBlock(LatchBlock));
+ Instruction *IsomorphicInc =
+ cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
+
+ // If this phi is more canonical, swap it with the original.
+ if (!isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L)
+ && isExpandedAddRecExprPHI(Phi, IsomorphicInc, L)) {
+ std::swap(OrigPhiRef, Phi);
+ std::swap(OrigInc, IsomorphicInc);
+ }
+ // Replacing the congruent phi is sufficient because acyclic redundancy
+ // elimination, CSE/GVN, should handle the rest. However, once SCEV proves
+ // that a phi is congruent, it's often the head of an IV user cycle that
+ // is isomorphic with the original phi. So it's worth eagerly cleaning up
+ // the common case of a single IV increment.
+ if (OrigInc != IsomorphicInc &&
+ OrigInc->getType() == IsomorphicInc->getType() &&
+ SE.getSCEV(OrigInc) == SE.getSCEV(IsomorphicInc) &&
+ hoistStep(OrigInc, IsomorphicInc, DT)) {
+ DEBUG_WITH_TYPE(DebugType, dbgs()
+ << "INDVARS: Eliminated congruent iv.inc: "
+ << *IsomorphicInc << '\n');
+ IsomorphicInc->replaceAllUsesWith(OrigInc);
+ DeadInsts.push_back(IsomorphicInc);
+ }
+ }
+ DEBUG_WITH_TYPE(DebugType, dbgs()
+ << "INDVARS: Eliminated congruent iv: " << *Phi << '\n');
+ ++NumElim;
+ Phi->replaceAllUsesWith(OrigPhiRef);
+ DeadInsts.push_back(Phi);
+ }
+ return NumElim;
+}
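At the source level, congruent phis typically come from loops that maintain several counters with identical evolution. A minimal sketch of an input that yields two phis with the same SCEV (hypothetical example, not from this patch):

    unsigned sumPairs(const int *a, const int *b, unsigned n) {
      unsigned s = 0;
      // i and j lower to distinct IR phis with the identical SCEV {0,+,1};
      // replaceCongruentIVs can replace one with the other and mark the
      // redundant phi (and its increment) dead.
      for (unsigned i = 0, j = 0; i != n; ++i, ++j)
        s += (unsigned)(a[i] + b[j]);
      return s;
    }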
diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index 60e630a..c66ecd6 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -60,20 +60,40 @@ static bool IVUseShouldUsePostIncValue(Instruction *User, Value *Operand,
return true;
}
-const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
- const SCEV *S,
- Instruction *User,
- Value *OperandValToReplace,
- PostIncLoopSet &Loops,
- ScalarEvolution &SE,
- DominatorTree &DT) {
- if (isa<SCEVConstant>(S) || isa<SCEVUnknown>(S))
- return S;
+namespace {
+
+/// Hold the state used during post-inc expression transformation, including a
+/// map of transformed expressions.
+class PostIncTransform {
+ TransformKind Kind;
+ PostIncLoopSet &Loops;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+
+ DenseMap<const SCEV*, const SCEV*> Transformed;
+
+public:
+ PostIncTransform(TransformKind kind, PostIncLoopSet &loops,
+ ScalarEvolution &se, DominatorTree &dt):
+ Kind(kind), Loops(loops), SE(se), DT(dt) {}
+
+ const SCEV *TransformSubExpr(const SCEV *S, Instruction *User,
+ Value *OperandValToReplace);
+
+protected:
+ const SCEV *TransformImpl(const SCEV *S, Instruction *User,
+ Value *OperandValToReplace);
+};
+
+} // namespace
+
+/// Implement post-inc transformation for all valid expression types.
+const SCEV *PostIncTransform::
+TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
if (const SCEVCastExpr *X = dyn_cast<SCEVCastExpr>(S)) {
const SCEV *O = X->getOperand();
- const SCEV *N = TransformForPostIncUse(Kind, O, User, OperandValToReplace,
- Loops, SE, DT);
+ const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
if (O != N)
switch (S->getSCEVType()) {
case scZeroExtend: return SE.getZeroExtendExpr(N, S->getType());
@@ -93,9 +113,7 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
// Transform each operand.
for (SCEVNAryExpr::op_iterator I = AR->op_begin(), E = AR->op_end();
I != E; ++I) {
- const SCEV *O = *I;
- const SCEV *N = TransformForPostIncUse(Kind, O, LUser, 0, Loops, SE, DT);
- Operands.push_back(N);
+ Operands.push_back(TransformSubExpr(*I, LUser, 0));
}
// Conservatively use AnyWrap until/unless we need FlagNW.
const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap);
@@ -104,8 +122,8 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
case NormalizeAutodetect:
if (IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
const SCEV *TransformedStep =
- TransformForPostIncUse(Kind, AR->getStepRecurrence(SE),
- User, OperandValToReplace, Loops, SE, DT);
+ TransformSubExpr(AR->getStepRecurrence(SE),
+ User, OperandValToReplace);
Result = SE.getMinusSCEV(Result, TransformedStep);
Loops.insert(L);
}
@@ -114,24 +132,20 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
// sometimes fails to canonicalize two equal SCEVs to exactly the same
// form. It's possibly a pessimization when this happens, but it isn't a
// correctness problem, so disable this assert for now.
- assert(S == TransformForPostIncUse(Denormalize, Result,
- User, OperandValToReplace,
- Loops, SE, DT) &&
+ assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
"SCEV normalization is not invertible!");
#endif
break;
case Normalize:
if (Loops.count(L)) {
const SCEV *TransformedStep =
- TransformForPostIncUse(Kind, AR->getStepRecurrence(SE),
- User, OperandValToReplace, Loops, SE, DT);
+ TransformSubExpr(AR->getStepRecurrence(SE),
+ User, OperandValToReplace);
Result = SE.getMinusSCEV(Result, TransformedStep);
}
#if 0
// See the comment on the assert above.
- assert(S == TransformForPostIncUse(Denormalize, Result,
- User, OperandValToReplace,
- Loops, SE, DT) &&
+ assert(S == TransformSubExpr(Result, User, OperandValToReplace) &&
"SCEV normalization is not invertible!");
#endif
break;
@@ -150,8 +164,7 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
for (SCEVNAryExpr::op_iterator I = X->op_begin(), E = X->op_end();
I != E; ++I) {
const SCEV *O = *I;
- const SCEV *N = TransformForPostIncUse(Kind, O, User, OperandValToReplace,
- Loops, SE, DT);
+ const SCEV *N = TransformSubExpr(O, User, OperandValToReplace);
Changed |= N != O;
Operands.push_back(N);
}
@@ -170,10 +183,8 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
if (const SCEVUDivExpr *X = dyn_cast<SCEVUDivExpr>(S)) {
const SCEV *LO = X->getLHS();
const SCEV *RO = X->getRHS();
- const SCEV *LN = TransformForPostIncUse(Kind, LO, User, OperandValToReplace,
- Loops, SE, DT);
- const SCEV *RN = TransformForPostIncUse(Kind, RO, User, OperandValToReplace,
- Loops, SE, DT);
+ const SCEV *LN = TransformSubExpr(LO, User, OperandValToReplace);
+ const SCEV *RN = TransformSubExpr(RO, User, OperandValToReplace);
if (LO != LN || RO != RN)
return SE.getUDivExpr(LN, RN);
return S;
@@ -182,3 +193,33 @@ const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
llvm_unreachable("Unexpected SCEV kind!");
return 0;
}
+
+/// Manage recursive transformation across an expression DAG. Revisiting
+/// expressions would lead to exponential recursion.
+const SCEV *PostIncTransform::
+TransformSubExpr(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
+
+ if (isa<SCEVConstant>(S) || isa<SCEVUnknown>(S))
+ return S;
+
+ const SCEV *Result = Transformed.lookup(S);
+ if (Result)
+ return Result;
+
+ Result = TransformImpl(S, User, OperandValToReplace);
+ Transformed[S] = Result;
+ return Result;
+}
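The Transformed map is what keeps this rewrite linear: SCEV expressions form a DAG with sharing, so an unmemoized recursion re-derives shared subtrees exponentially often. A self-contained C++ sketch of the same caching pattern on a toy DAG (all names illustrative, not LLVM API):

    #include <cstdio>
    #include <unordered_map>

    // A toy immutable expression DAG: leaves carry a value, interior
    // nodes have two operands.
    struct Expr {
      const Expr *L, *R;  // null for leaves
      int Val;            // leaf value
    };

    static int Visits = 0;

    // Rewrite every leaf to Val+1, visiting each distinct node exactly
    // once. Memo plays the role of the Transformed DenseMap above.
    const Expr *transform(const Expr *E,
                          std::unordered_map<const Expr *, const Expr *> &Memo) {
      auto It = Memo.find(E);
      if (It != Memo.end())
        return It->second;          // already transformed: reuse the result
      ++Visits;
      const Expr *Res;
      if (!E->L)                    // leaf
        Res = new Expr{nullptr, nullptr, E->Val + 1};
      else                          // interior node: recurse into operands
        Res = new Expr{transform(E->L, Memo), transform(E->R, Memo), 0};
      Memo[E] = Res;
      return Res;
    }

    int main() {
      Expr Leaf{nullptr, nullptr, 1};
      Expr A{&Leaf, &Leaf, 0};      // shares Leaf twice
      Expr Root{&A, &A, 0};         // shares A twice: 3 distinct nodes
      std::unordered_map<const Expr *, const Expr *> Memo;
      transform(&Root, Memo);
      printf("distinct nodes visited: %d\n", Visits);  // 3, not 7
      return 0;
    }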
+
+/// Top-level driver for transforming an expression DAG into its requested
+/// post-inc form (either "Normalized" or "Denormalized").
+const SCEV *llvm::TransformForPostIncUse(TransformKind Kind,
+ const SCEV *S,
+ Instruction *User,
+ Value *OperandValToReplace,
+ PostIncLoopSet &Loops,
+ ScalarEvolution &SE,
+ DominatorTree &DT) {
+ PostIncTransform Transform(Kind, Loops, SE, DT);
+ return Transform.TransformSubExpr(S, User, OperandValToReplace);
+}
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 455c910..4d94f61 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -34,7 +34,7 @@ const unsigned MaxDepth = 6;
/// getBitWidth - Returns the bitwidth of the given scalar or pointer type (if
/// unknown returns 0). For vector types, returns the element type's bitwidth.
-static unsigned getBitWidth(const Type *Ty, const TargetData *TD) {
+static unsigned getBitWidth(Type *Ty, const TargetData *TD) {
if (unsigned BitWidth = Ty->getScalarSizeInBits())
return BitWidth;
assert(isa<PointerType>(Ty) && "Expected a pointer type!");
@@ -103,7 +103,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
unsigned Align = GV->getAlignment();
if (Align == 0 && TD && GV->getType()->getElementType()->isSized()) {
- const Type *ObjectType = GV->getType()->getElementType();
+ Type *ObjectType = GV->getType()->getElementType();
// If the object is defined in the current Module, we'll be giving
// it the preferred alignment. Otherwise, we have to assume that it
// may only have the minimum ABI alignment.
@@ -268,7 +268,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
// FALL THROUGH and handle them the same as zext/trunc.
case Instruction::ZExt:
case Instruction::Trunc: {
- const Type *SrcTy = I->getOperand(0)->getType();
+ Type *SrcTy = I->getOperand(0)->getType();
unsigned SrcBitWidth;
// Note that we handle pointer operands here because of inttoptr/ptrtoint
@@ -291,7 +291,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
return;
}
case Instruction::BitCast: {
- const Type *SrcTy = I->getOperand(0)->getType();
+ Type *SrcTy = I->getOperand(0)->getType();
if ((SrcTy->isIntegerTy() || SrcTy->isPointerTy()) &&
// TODO: For now, not handling conversions like:
// (bitcast i64 %x to <2 x i32>)
@@ -559,7 +559,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
Value *Index = I->getOperand(i);
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
// Handle struct member offset arithmetic.
if (!TD) return;
const StructLayout *SL = TD->getStructLayout(STy);
@@ -569,7 +569,7 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask,
CountTrailingZeros_64(Offset));
} else {
// Handle array index arithmetic.
- const Type *IndexedTy = GTI.getIndexedType();
+ Type *IndexedTy = GTI.getIndexedType();
if (!IndexedTy->isSized()) return;
unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
uint64_t TypeSize = TD ? TD->getTypeAllocSize(IndexedTy) : 1;
@@ -898,7 +898,7 @@ unsigned llvm::ComputeNumSignBits(Value *V, const TargetData *TD,
assert((TD || V->getType()->isIntOrIntVectorTy()) &&
"ComputeNumSignBits requires a TargetData object to operate "
"on non-integer values!");
- const Type *Ty = V->getType();
+ Type *Ty = V->getType();
unsigned TyBits = TD ? TD->getTypeSizeInBits(V->getType()->getScalarType()) :
Ty->getScalarSizeInBits();
unsigned Tmp, Tmp2;
@@ -1078,7 +1078,7 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
assert(Depth <= MaxDepth && "Limit Search Depth");
assert(V->getType()->isIntegerTy() && "Not integer or pointer type!");
- const Type *T = V->getType();
+ Type *T = V->getType();
ConstantInt *CI = dyn_cast<ConstantInt>(V);
@@ -1315,11 +1315,11 @@ Value *llvm::isBytewiseValue(Value *V) {
// indices from Idxs that should be left out when inserting into the resulting
// struct. To is the result struct built so far; new insertvalue instructions
// build on that.
-static Value *BuildSubAggregate(Value *From, Value* To, const Type *IndexedType,
+static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType,
SmallVector<unsigned, 10> &Idxs,
unsigned IdxSkip,
Instruction *InsertBefore) {
- const llvm::StructType *STy = llvm::dyn_cast<llvm::StructType>(IndexedType);
+ llvm::StructType *STy = llvm::dyn_cast<llvm::StructType>(IndexedType);
if (STy) {
// Save the original To argument so we can modify it
Value *OrigTo = To;
@@ -1358,8 +1358,7 @@ static Value *BuildSubAggregate(Value *From, Value* To, const Type *IndexedType,
return NULL;
// Insert the value in the new (sub) aggregate
- return llvm::InsertValueInst::Create(To, V,
- ArrayRef<unsigned>(Idxs).slice(IdxSkip),
+ return llvm::InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip),
"tmp", InsertBefore);
}
@@ -1378,7 +1377,7 @@ static Value *BuildSubAggregate(Value *From, Value* To, const Type *IndexedType,
static Value *BuildSubAggregate(Value *From, ArrayRef<unsigned> idx_range,
Instruction *InsertBefore) {
assert(InsertBefore && "Must have someplace to insert!");
- const Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(),
+ Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(),
idx_range);
Value *To = UndefValue::get(IndexedType);
SmallVector<unsigned, 10> Idxs(idx_range.begin(), idx_range.end());
@@ -1404,7 +1403,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
&& "Not looking at a struct or array?");
assert(ExtractValueInst::getIndexedType(V->getType(), idx_range)
&& "Invalid indices for type?");
- const CompositeType *PTy = cast<CompositeType>(V->getType());
+ CompositeType *PTy = cast<CompositeType>(V->getType());
if (isa<UndefValue>(V))
return UndefValue::get(ExtractValueInst::getIndexedType(PTy,
@@ -1435,9 +1434,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
// %C = insertvalue {i32, i32 } %A, i32 11, 1
// which allows the unused 0,0 element from the nested struct to be
// removed.
- return BuildSubAggregate(V,
- ArrayRef<unsigned>(idx_range.begin(),
- req_idx),
+ return BuildSubAggregate(V, makeArrayRef(idx_range.begin(), req_idx),
InsertBefore);
else
// We can't handle this without inserting insertvalues
@@ -1455,7 +1452,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
// requested (though possibly only partially). Now we recursively look at
// the inserted value, passing any remaining indices.
return FindInsertedValue(I->getInsertedValueOperand(),
- ArrayRef<unsigned>(req_idx, idx_range.end()),
+ makeArrayRef(req_idx, idx_range.end()),
InsertBefore);
} else if (ExtractValueInst *I = dyn_cast<ExtractValueInst>(V)) {
// If we're extracting a value from an aggregate that was extracted from
@@ -1506,7 +1503,7 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
if (OpC->isZero()) continue;
// Handle a struct and array indices which add their offset to the pointer.
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
} else {
uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
@@ -1557,8 +1554,8 @@ bool llvm::GetConstantStringInfo(const Value *V, std::string &Str,
return false;
// Make sure the index-ee is a pointer to array of i8.
- const PointerType *PT = cast<PointerType>(GEP->getOperand(0)->getType());
- const ArrayType *AT = dyn_cast<ArrayType>(PT->getElementType());
+ PointerType *PT = cast<PointerType>(GEP->getOperand(0)->getType());
+ ArrayType *AT = dyn_cast<ArrayType>(PT->getElementType());
if (AT == 0 || !AT->getElementType()->isIntegerTy(8))
return false;
diff --git a/lib/Archive/CMakeLists.txt b/lib/Archive/CMakeLists.txt
index 7ff478a..b52974e 100644
--- a/lib/Archive/CMakeLists.txt
+++ b/lib/Archive/CMakeLists.txt
@@ -3,3 +3,9 @@ add_llvm_library(LLVMArchive
ArchiveReader.cpp
ArchiveWriter.cpp
)
+
+add_llvm_library_dependencies(LLVMArchive
+ LLVMBitReader
+ LLVMCore
+ LLVMSupport
+ )
diff --git a/lib/AsmParser/CMakeLists.txt b/lib/AsmParser/CMakeLists.txt
index 985ebe2..7496015 100644
--- a/lib/AsmParser/CMakeLists.txt
+++ b/lib/AsmParser/CMakeLists.txt
@@ -4,3 +4,8 @@ add_llvm_library(LLVMAsmParser
LLParser.cpp
Parser.cpp
)
+
+add_llvm_library_dependencies(LLVMAsmParser
+ LLVMCore
+ LLVMSupport
+ )
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 3c63106..d0dd986 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -506,6 +506,15 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(deplibs);
KEYWORD(datalayout);
KEYWORD(volatile);
+ KEYWORD(atomic);
+ KEYWORD(unordered);
+ KEYWORD(monotonic);
+ KEYWORD(acquire);
+ KEYWORD(release);
+ KEYWORD(acq_rel);
+ KEYWORD(seq_cst);
+ KEYWORD(singlethread);
+
KEYWORD(nuw);
KEYWORD(nsw);
KEYWORD(exact);
@@ -549,6 +558,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(readnone);
KEYWORD(readonly);
KEYWORD(uwtable);
+ KEYWORD(returns_twice);
KEYWORD(inlinehint);
KEYWORD(noinline);
@@ -559,7 +569,6 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(noredzone);
KEYWORD(noimplicitfloat);
KEYWORD(naked);
- KEYWORD(hotpatch);
KEYWORD(nonlazybind);
KEYWORD(type);
@@ -570,8 +579,16 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(oeq); KEYWORD(one); KEYWORD(olt); KEYWORD(ogt); KEYWORD(ole);
KEYWORD(oge); KEYWORD(ord); KEYWORD(uno); KEYWORD(ueq); KEYWORD(une);
+ KEYWORD(xchg); KEYWORD(nand); KEYWORD(max); KEYWORD(min); KEYWORD(umax);
+ KEYWORD(umin);
+
KEYWORD(x);
KEYWORD(blockaddress);
+
+ KEYWORD(personality);
+ KEYWORD(cleanup);
+ KEYWORD(catch);
+ KEYWORD(filter);
#undef KEYWORD
// Keywords for types.
@@ -624,12 +641,16 @@ lltok::Kind LLLexer::LexIdentifier() {
INSTKEYWORD(switch, Switch);
INSTKEYWORD(indirectbr, IndirectBr);
INSTKEYWORD(invoke, Invoke);
+ INSTKEYWORD(resume, Resume);
INSTKEYWORD(unwind, Unwind);
INSTKEYWORD(unreachable, Unreachable);
INSTKEYWORD(alloca, Alloca);
INSTKEYWORD(load, Load);
INSTKEYWORD(store, Store);
+ INSTKEYWORD(cmpxchg, AtomicCmpXchg);
+ INSTKEYWORD(atomicrmw, AtomicRMW);
+ INSTKEYWORD(fence, Fence);
INSTKEYWORD(getelementptr, GetElementPtr);
INSTKEYWORD(extractelement, ExtractElement);
@@ -637,6 +658,7 @@ lltok::Kind LLLexer::LexIdentifier() {
INSTKEYWORD(shufflevector, ShuffleVector);
INSTKEYWORD(extractvalue, ExtractValue);
INSTKEYWORD(insertvalue, InsertValue);
+ INSTKEYWORD(landingpad, LandingPad);
#undef INSTKEYWORD
// Check for [us]0x[0-9A-Fa-f]+ which are Hexadecimal constant generated by
@@ -704,17 +726,17 @@ lltok::Kind LLLexer::Lex0x() {
case 'K':
// F80HexFPConstant - x87 long double in hexadecimal format (10 bytes)
FP80HexToIntPair(TokStart+3, CurPtr, Pair);
- APFloatVal = APFloat(APInt(80, 2, Pair));
+ APFloatVal = APFloat(APInt(80, Pair));
return lltok::APFloat;
case 'L':
// F128HexFPConstant - IEEE 128-bit in hexadecimal format (16 bytes)
HexToIntPair(TokStart+3, CurPtr, Pair);
- APFloatVal = APFloat(APInt(128, 2, Pair), true);
+ APFloatVal = APFloat(APInt(128, Pair), true);
return lltok::APFloat;
case 'M':
// PPC128HexFPConstant - PowerPC 128-bit in hexadecimal format (16 bytes)
HexToIntPair(TokStart+3, CurPtr, Pair);
- APFloatVal = APFloat(APInt(128, 2, Pair));
+ APFloatVal = APFloat(APInt(128, Pair));
return lltok::APFloat;
}
}
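All three cases switch from the old word-count-plus-pointer APInt constructor to the ArrayRef-based one: Pair is a two-element uint64_t buffer filled by HexToIntPair/FP80HexToIntPair, and a C array converts to ArrayRef<uint64_t> implicitly, making the explicit count of 2 redundant.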
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index cfc31f3..cafaab0 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -26,7 +26,7 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-static std::string getTypeString(const Type *T) {
+static std::string getTypeString(Type *T) {
std::string Result;
raw_string_ostream Tmp(Result);
Tmp << *T;
@@ -120,6 +120,9 @@ bool LLParser::ValidateEndOfModule() {
for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
UpgradeCallsToIntrinsic(FI++); // must be post-increment, as we remove
+ // Upgrade to new EH scheme. N.B. This will go away in 3.1.
+ UpgradeExceptionHandling(M);
+
// Check debug info intrinsics.
CheckDebugInfoIntrinsics(M);
return false;
@@ -744,9 +747,9 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
/// GetGlobalVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
-GlobalValue *LLParser::GetGlobalVal(const std::string &Name, const Type *Ty,
+GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty,
LocTy Loc) {
- const PointerType *PTy = dyn_cast<PointerType>(Ty);
+ PointerType *PTy = dyn_cast<PointerType>(Ty);
if (PTy == 0) {
Error(Loc, "global variable reference must have pointer type");
return 0;
@@ -775,7 +778,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, const Type *Ty,
// Otherwise, create a new forward reference for this value and remember it.
GlobalValue *FwdVal;
- if (const FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType()))
+ if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType()))
FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M);
else
FwdVal = new GlobalVariable(*M, PTy->getElementType(), false,
@@ -785,8 +788,8 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, const Type *Ty,
return FwdVal;
}
-GlobalValue *LLParser::GetGlobalVal(unsigned ID, const Type *Ty, LocTy Loc) {
- const PointerType *PTy = dyn_cast<PointerType>(Ty);
+GlobalValue *LLParser::GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc) {
+ PointerType *PTy = dyn_cast<PointerType>(Ty);
if (PTy == 0) {
Error(Loc, "global variable reference must have pointer type");
return 0;
@@ -813,7 +816,7 @@ GlobalValue *LLParser::GetGlobalVal(unsigned ID, const Type *Ty, LocTy Loc) {
// Otherwise, create a new forward reference for this value and remember it.
GlobalValue *FwdVal;
- if (const FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType()))
+ if (FunctionType *FT = dyn_cast<FunctionType>(PTy->getElementType()))
FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, "", M);
else
FwdVal = new GlobalVariable(*M, PTy->getElementType(), false,
@@ -908,6 +911,7 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) {
case lltok::kw_noreturn: Attrs |= Attribute::NoReturn; break;
case lltok::kw_nounwind: Attrs |= Attribute::NoUnwind; break;
case lltok::kw_uwtable: Attrs |= Attribute::UWTable; break;
+ case lltok::kw_returns_twice: Attrs |= Attribute::ReturnsTwice; break;
case lltok::kw_noinline: Attrs |= Attribute::NoInline; break;
case lltok::kw_readnone: Attrs |= Attribute::ReadNone; break;
case lltok::kw_readonly: Attrs |= Attribute::ReadOnly; break;
@@ -919,7 +923,6 @@ bool LLParser::ParseOptionalAttrs(unsigned &Attrs, unsigned AttrKind) {
case lltok::kw_noredzone: Attrs |= Attribute::NoRedZone; break;
case lltok::kw_noimplicitfloat: Attrs |= Attribute::NoImplicitFloat; break;
case lltok::kw_naked: Attrs |= Attribute::Naked; break;
- case lltok::kw_hotpatch: Attrs |= Attribute::Hotpatch; break;
case lltok::kw_nonlazybind: Attrs |= Attribute::NonLazyBind; break;
case lltok::kw_alignstack: {
@@ -1145,6 +1148,32 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
return false;
}
+/// ParseScopeAndOrdering
+/// if isAtomic: ::= 'singlethread'? AtomicOrdering
+/// else: ::=
+///
+/// This sets Scope and Ordering to the parsed values.
+bool LLParser::ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
+ AtomicOrdering &Ordering) {
+ if (!isAtomic)
+ return false;
+
+ Scope = CrossThread;
+ if (EatIfPresent(lltok::kw_singlethread))
+ Scope = SingleThread;
+ switch (Lex.getKind()) {
+ default: return TokError("Expected ordering on atomic instruction");
+ case lltok::kw_unordered: Ordering = Unordered; break;
+ case lltok::kw_monotonic: Ordering = Monotonic; break;
+ case lltok::kw_acquire: Ordering = Acquire; break;
+ case lltok::kw_release: Ordering = Release; break;
+ case lltok::kw_acq_rel: Ordering = AcquireRelease; break;
+ case lltok::kw_seq_cst: Ordering = SequentiallyConsistent; break;
+ }
+ Lex.Lex();
+ return false;
+}
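As an instance of this grammar (assumed LLVM 3.0 textual IR): fence seq_cst leaves Scope at the default CrossThread, while fence singlethread seq_cst consumes the optional scope token first and yields SingleThread.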
+
/// ParseOptionalStackAlignment
/// ::= /* empty */
/// ::= 'alignstack' '(' 4 ')'
@@ -1237,7 +1266,7 @@ bool LLParser::ParseType(Type *&Result, bool AllowVoid) {
// If the type hasn't been defined yet, create a forward definition and
// remember where that forward def'n was seen (in case it never is defined).
if (Entry.first == 0) {
- Entry.first = StructType::createNamed(Context, Lex.getStrVal());
+ Entry.first = StructType::create(Context, Lex.getStrVal());
Entry.second = Lex.getLoc();
}
Result = Entry.first;
@@ -1254,7 +1283,7 @@ bool LLParser::ParseType(Type *&Result, bool AllowVoid) {
// If the type hasn't been defined yet, create a forward definition and
// remember where that forward def'n was seen (in case it never is defined).
if (Entry.first == 0) {
- Entry.first = StructType::createNamed(Context, "");
+ Entry.first = StructType::create(Context);
Entry.second = Lex.getLoc();
}
Result = Entry.first;
@@ -1476,7 +1505,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
// If this type number has never been uttered, create it.
if (Entry.first == 0)
- Entry.first = StructType::createNamed(Context, Name);
+ Entry.first = StructType::create(Context, Name);
ResultTy = Entry.first;
return false;
}
@@ -1502,7 +1531,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
// If this type number has never been uttered, create it.
if (Entry.first == 0)
- Entry.first = StructType::createNamed(Context, Name);
+ Entry.first = StructType::create(Context, Name);
StructType *STy = cast<StructType>(Entry.first);
@@ -1668,7 +1697,7 @@ bool LLParser::PerFunctionState::FinishFunction() {
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
Value *LLParser::PerFunctionState::GetVal(const std::string &Name,
- const Type *Ty, LocTy Loc) {
+ Type *Ty, LocTy Loc) {
// Look this name up in the normal function symbol table.
Value *Val = F.getValueSymbolTable().lookup(Name);
@@ -1709,7 +1738,7 @@ Value *LLParser::PerFunctionState::GetVal(const std::string &Name,
return FwdVal;
}
-Value *LLParser::PerFunctionState::GetVal(unsigned ID, const Type *Ty,
+Value *LLParser::PerFunctionState::GetVal(unsigned ID, Type *Ty,
LocTy Loc) {
// Look this name up in the normal function symbol table.
Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : 0;
@@ -2273,16 +2302,11 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
if (Elts.size() == 0 || !Elts[0]->getType()->isPointerTy())
return Error(ID.Loc, "getelementptr requires pointer operand");
- if (!GetElementPtrInst::getIndexedType(Elts[0]->getType(),
- (Value**)(Elts.data() + 1),
- Elts.size() - 1))
+ ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
+ if (!GetElementPtrInst::getIndexedType(Elts[0]->getType(), Indices))
return Error(ID.Loc, "invalid indices for getelementptr");
- ID.ConstantVal = InBounds ?
- ConstantExpr::getInBoundsGetElementPtr(Elts[0],
- Elts.data() + 1,
- Elts.size() - 1) :
- ConstantExpr::getGetElementPtr(Elts[0],
- Elts.data() + 1, Elts.size() - 1);
+ ID.ConstantVal = ConstantExpr::getGetElementPtr(Elts[0], Indices,
+ InBounds);
} else if (Opc == Instruction::Select) {
if (Elts.size() != 3)
return Error(ID.Loc, "expected three operands to select");
@@ -2323,7 +2347,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
}
/// ParseGlobalValue - Parse a global value with the specified type.
-bool LLParser::ParseGlobalValue(const Type *Ty, Constant *&C) {
+bool LLParser::ParseGlobalValue(Type *Ty, Constant *&C) {
C = 0;
ValID ID;
Value *V = NULL;
@@ -2410,7 +2434,7 @@ bool LLParser::ParseMetadataValue(ValID &ID, PerFunctionState *PFS) {
// Function Parsing.
//===----------------------------------------------------------------------===//
-bool LLParser::ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V,
+bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
PerFunctionState *PFS) {
if (Ty->isFunctionTy())
return Error(ID.Loc, "functions are not values, refer to them as pointers");
@@ -2426,8 +2450,8 @@ bool LLParser::ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V,
V = PFS->GetVal(ID.StrVal, Ty, ID.Loc);
return (V == 0);
case ValID::t_InlineAsm: {
- const PointerType *PTy = dyn_cast<PointerType>(Ty);
- const FunctionType *FTy =
+ PointerType *PTy = dyn_cast<PointerType>(Ty);
+ FunctionType *FTy =
PTy ? dyn_cast<FunctionType>(PTy->getElementType()) : 0;
if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2))
return Error(ID.Loc, "invalid type for inline asm constraint string");
@@ -2506,7 +2530,7 @@ bool LLParser::ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V,
return false;
case ValID::t_ConstantStruct:
case ValID::t_PackedConstantStruct:
- if (const StructType *ST = dyn_cast<StructType>(Ty)) {
+ if (StructType *ST = dyn_cast<StructType>(Ty)) {
if (ST->getNumElements() != ID.UIntVal)
return Error(ID.Loc,
"initializer with struct type has wrong # elements");
@@ -2519,15 +2543,15 @@ bool LLParser::ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V,
return Error(ID.Loc, "element " + Twine(i) +
" of struct initializer doesn't match struct element type");
- V = ConstantStruct::get(ST, ArrayRef<Constant*>(ID.ConstantStructElts,
- ID.UIntVal));
+ V = ConstantStruct::get(ST, makeArrayRef(ID.ConstantStructElts,
+ ID.UIntVal));
} else
return Error(ID.Loc, "constant expression type mismatch");
return false;
}
}
-bool LLParser::ParseValue(const Type *Ty, Value *&V, PerFunctionState *PFS) {
+bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) {
V = 0;
ValID ID;
return ParseValID(ID, PFS) ||
@@ -2671,9 +2695,9 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
if (PAL.paramHasAttr(1, Attribute::StructRet) && !RetType->isVoidTy())
return Error(RetTypeLoc, "functions with 'sret' argument must return void");
- const FunctionType *FT =
+ FunctionType *FT =
FunctionType::get(RetType, ParamTypeList, isVarArg);
- const PointerType *PFT = PointerType::getUnqual(FT);
+ PointerType *PFT = PointerType::getUnqual(FT);
Fn = 0;
if (!FunctionName.empty()) {
@@ -2864,6 +2888,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
case lltok::kw_switch: return ParseSwitch(Inst, PFS);
case lltok::kw_indirectbr: return ParseIndirectBr(Inst, PFS);
case lltok::kw_invoke: return ParseInvoke(Inst, PFS);
+ case lltok::kw_resume: return ParseResume(Inst, PFS);
// Binary Operators.
case lltok::kw_add:
case lltok::kw_sub:
@@ -2923,13 +2948,18 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
case lltok::kw_insertelement: return ParseInsertElement(Inst, PFS);
case lltok::kw_shufflevector: return ParseShuffleVector(Inst, PFS);
case lltok::kw_phi: return ParsePHI(Inst, PFS);
+ case lltok::kw_landingpad: return ParseLandingPad(Inst, PFS);
case lltok::kw_call: return ParseCall(Inst, PFS, false);
case lltok::kw_tail: return ParseCall(Inst, PFS, true);
// Memory.
case lltok::kw_alloca: return ParseAlloc(Inst, PFS);
case lltok::kw_load: return ParseLoad(Inst, PFS, false);
case lltok::kw_store: return ParseStore(Inst, PFS, false);
+ case lltok::kw_cmpxchg: return ParseCmpXchg(Inst, PFS);
+ case lltok::kw_atomicrmw: return ParseAtomicRMW(Inst, PFS);
+ case lltok::kw_fence: return ParseFence(Inst, PFS);
case lltok::kw_volatile:
+    // For backward compatibility; the canonical position of 'volatile' is
+    // after the 'load'/'store' keyword.
if (EatIfPresent(lltok::kw_load))
return ParseLoad(Inst, PFS, true);
else if (EatIfPresent(lltok::kw_store))
@@ -3162,8 +3192,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
// If RetType is a non-function pointer type, then this is the short syntax
// for the call, which means that RetType is just the return type. Infer the
// rest of the function argument types from the arguments that are present.
- const PointerType *PFTy = 0;
- const FunctionType *Ty = 0;
+ PointerType *PFTy = 0;
+ FunctionType *Ty = 0;
if (!(PFTy = dyn_cast<PointerType>(RetType)) ||
!(Ty = dyn_cast<FunctionType>(PFTy->getElementType()))) {
// Pull out the types of all of the arguments...
@@ -3194,7 +3224,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
FunctionType::param_iterator I = Ty->param_begin();
FunctionType::param_iterator E = Ty->param_end();
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
- const Type *ExpectedTy = 0;
+ Type *ExpectedTy = 0;
if (I != E) {
ExpectedTy = *I++;
} else if (!Ty->isVarArg()) {
@@ -3225,7 +3255,17 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
return false;
}
+/// ParseResume
+/// ::= 'resume' TypeAndValue
+bool LLParser::ParseResume(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Exn; LocTy ExnLoc;
+ if (ParseTypeAndValue(Exn, ExnLoc, PFS))
+ return true;
+ ResumeInst *RI = ResumeInst::Create(Exn);
+ Inst = RI;
+ return false;
+}
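+
+// For reference, a form the production above accepts (the operand name %exn
+// is illustrative):
+//   resume { i8*, i32 } %exn
+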
//===----------------------------------------------------------------------===//
// Binary Operators.
@@ -3473,6 +3513,56 @@ int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
return AteExtraComma ? InstExtraComma : InstNormal;
}
+/// ParseLandingPad
+/// ::= 'landingpad' Type 'personality' TypeAndValue 'cleanup'? Clause+
+/// Clause
+/// ::= 'catch' TypeAndValue
+/// ::= 'filter'
+/// ::= 'filter' TypeAndValue ( ',' TypeAndValue )*
+bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
+ Type *Ty = 0; LocTy TyLoc;
+ Value *PersFn; LocTy PersFnLoc;
+
+ if (ParseType(Ty, TyLoc) ||
+ ParseToken(lltok::kw_personality, "expected 'personality'") ||
+ ParseTypeAndValue(PersFn, PersFnLoc, PFS))
+ return true;
+
+ LandingPadInst *LP = LandingPadInst::Create(Ty, PersFn, 0);
+ LP->setCleanup(EatIfPresent(lltok::kw_cleanup));
+
+ while (Lex.getKind() == lltok::kw_catch || Lex.getKind() == lltok::kw_filter){
+ LandingPadInst::ClauseType CT;
+ if (EatIfPresent(lltok::kw_catch))
+ CT = LandingPadInst::Catch;
+ else if (EatIfPresent(lltok::kw_filter))
+ CT = LandingPadInst::Filter;
+ else
+ return TokError("expected 'catch' or 'filter' clause type");
+
+ Value *V; LocTy VLoc;
+ if (ParseTypeAndValue(V, VLoc, PFS)) {
+ delete LP;
+ return true;
+ }
+
+    // A 'catch' clause expects a non-array constant; a 'filter' clause
+    // expects an array constant.
+ if (CT == LandingPadInst::Catch) {
+ if (isa<ArrayType>(V->getType()))
+ Error(VLoc, "'catch' clause has an invalid type");
+ } else {
+ if (!isa<ArrayType>(V->getType()))
+ Error(VLoc, "'filter' clause has an invalid type");
+ }
+
+ LP->addClause(V);
+ }
+
+ Inst = LP;
+ return false;
+}
+
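+// For reference, a landing pad the parser above accepts (personality and
+// type-info names are illustrative):
+//   %lp = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0
+//           cleanup
+//           catch i8* @_ZTIi
+//           filter [1 x i8*] [i8* @_ZTId]
+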
/// ParseCall
/// ::= 'tail'? 'call' OptionalCallingConv OptionalAttrs Type Value
/// ParameterList OptionalAttrs
@@ -3498,8 +3588,8 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
// If RetType is a non-function pointer type, then this is the short syntax
// for the call, which means that RetType is just the return type. Infer the
// rest of the function argument types from the arguments that are present.
- const PointerType *PFTy = 0;
- const FunctionType *Ty = 0;
+ PointerType *PFTy = 0;
+ FunctionType *Ty = 0;
if (!(PFTy = dyn_cast<PointerType>(RetType)) ||
!(Ty = dyn_cast<FunctionType>(PFTy->getElementType()))) {
// Pull out the types of all of the arguments...
@@ -3530,7 +3620,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
FunctionType::param_iterator I = Ty->param_begin();
FunctionType::param_iterator E = Ty->param_end();
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
- const Type *ExpectedTy = 0;
+ Type *ExpectedTy = 0;
if (I != E) {
ExpectedTy = *I++;
} else if (!Ty->isVarArg()) {
@@ -3596,34 +3686,85 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
}
/// ParseLoad
-/// ::= 'volatile'? 'load' TypeAndValue (',' OptionalInfo)?
+/// ::= 'load' 'volatile'? TypeAndValue (',' 'align' i32)?
+/// ::= 'load' 'atomic' 'volatile'? TypeAndValue
+/// 'singlethread'? AtomicOrdering (',' 'align' i32)?
+/// Compatibility:
+/// ::= 'volatile' 'load' TypeAndValue (',' 'align' i32)?
int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS,
bool isVolatile) {
Value *Val; LocTy Loc;
unsigned Alignment = 0;
bool AteExtraComma = false;
+ bool isAtomic = false;
+ AtomicOrdering Ordering = NotAtomic;
+ SynchronizationScope Scope = CrossThread;
+
+ if (Lex.getKind() == lltok::kw_atomic) {
+ if (isVolatile)
+ return TokError("mixing atomic with old volatile placement");
+ isAtomic = true;
+ Lex.Lex();
+ }
+
+ if (Lex.getKind() == lltok::kw_volatile) {
+ if (isVolatile)
+      return TokError("duplicate volatile before and after load");
+ isVolatile = true;
+ Lex.Lex();
+ }
+
if (ParseTypeAndValue(Val, Loc, PFS) ||
+ ParseScopeAndOrdering(isAtomic, Scope, Ordering) ||
ParseOptionalCommaAlign(Alignment, AteExtraComma))
return true;
if (!Val->getType()->isPointerTy() ||
!cast<PointerType>(Val->getType())->getElementType()->isFirstClassType())
return Error(Loc, "load operand must be a pointer to a first class type");
+ if (isAtomic && !Alignment)
+ return Error(Loc, "atomic load must have explicit non-zero alignment");
+ if (Ordering == Release || Ordering == AcquireRelease)
+ return Error(Loc, "atomic load cannot use Release ordering");
- Inst = new LoadInst(Val, "", isVolatile, Alignment);
+ Inst = new LoadInst(Val, "", isVolatile, Alignment, Ordering, Scope);
return AteExtraComma ? InstExtraComma : InstNormal;
}
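+
+// For reference, forms the grammar above accepts (names and alignment are
+// illustrative):
+//   %v = load i32* %p, align 4
+//   %v = load atomic volatile i32* %p singlethread acquire, align 4
+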
/// ParseStore
-/// ::= 'volatile'? 'store' TypeAndValue ',' TypeAndValue (',' 'align' i32)?
+
+/// ::= 'store' 'atomic' 'volatile'? TypeAndValue ',' TypeAndValue
+/// 'singlethread'? AtomicOrdering (',' 'align' i32)?
+/// Compatibility:
+/// ::= 'volatile' 'store' TypeAndValue ',' TypeAndValue (',' 'align' i32)?
int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS,
bool isVolatile) {
Value *Val, *Ptr; LocTy Loc, PtrLoc;
unsigned Alignment = 0;
bool AteExtraComma = false;
+ bool isAtomic = false;
+ AtomicOrdering Ordering = NotAtomic;
+ SynchronizationScope Scope = CrossThread;
+
+ if (Lex.getKind() == lltok::kw_atomic) {
+ if (isVolatile)
+ return TokError("mixing atomic with old volatile placement");
+ isAtomic = true;
+ Lex.Lex();
+ }
+
+ if (Lex.getKind() == lltok::kw_volatile) {
+ if (isVolatile)
+ return TokError("duplicate volatile before and after store");
+ isVolatile = true;
+ Lex.Lex();
+ }
+
if (ParseTypeAndValue(Val, Loc, PFS) ||
ParseToken(lltok::comma, "expected ',' after store operand") ||
ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
+ ParseScopeAndOrdering(isAtomic, Scope, Ordering) ||
ParseOptionalCommaAlign(Alignment, AteExtraComma))
return true;
@@ -3633,11 +3774,131 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS,
return Error(Loc, "store operand must be a first class value");
if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
return Error(Loc, "stored value and pointer type do not match");
+ if (isAtomic && !Alignment)
+ return Error(Loc, "atomic store must have explicit non-zero alignment");
+ if (Ordering == Acquire || Ordering == AcquireRelease)
+ return Error(Loc, "atomic store cannot use Acquire ordering");
+
+ Inst = new StoreInst(Val, Ptr, isVolatile, Alignment, Ordering, Scope);
+ return AteExtraComma ? InstExtraComma : InstNormal;
+}
- Inst = new StoreInst(Val, Ptr, isVolatile, Alignment);
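+
+// For reference, forms the grammar above accepts (names and alignment are
+// illustrative):
+//   store i32 %v, i32* %p, align 4
+//   store atomic i32 %v, i32* %p release, align 4
+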
+/// ParseCmpXchg
+/// ::= 'cmpxchg' 'volatile'? TypeAndValue ',' TypeAndValue ',' TypeAndValue
+/// 'singlethread'? AtomicOrdering
+int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Ptr, *Cmp, *New; LocTy PtrLoc, CmpLoc, NewLoc;
+ bool AteExtraComma = false;
+ AtomicOrdering Ordering = NotAtomic;
+ SynchronizationScope Scope = CrossThread;
+ bool isVolatile = false;
+
+ if (EatIfPresent(lltok::kw_volatile))
+ isVolatile = true;
+
+ if (ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after cmpxchg address") ||
+ ParseTypeAndValue(Cmp, CmpLoc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after cmpxchg cmp operand") ||
+ ParseTypeAndValue(New, NewLoc, PFS) ||
+ ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+ return true;
+
+ if (Ordering == Unordered)
+ return TokError("cmpxchg cannot be unordered");
+ if (!Ptr->getType()->isPointerTy())
+ return Error(PtrLoc, "cmpxchg operand must be a pointer");
+ if (cast<PointerType>(Ptr->getType())->getElementType() != Cmp->getType())
+ return Error(CmpLoc, "compare value and pointer type do not match");
+ if (cast<PointerType>(Ptr->getType())->getElementType() != New->getType())
+ return Error(NewLoc, "new value and pointer type do not match");
+ if (!New->getType()->isIntegerTy())
+ return Error(NewLoc, "cmpxchg operand must be an integer");
+ unsigned Size = New->getType()->getPrimitiveSizeInBits();
+ if (Size < 8 || (Size & (Size - 1)))
+    return Error(NewLoc, "cmpxchg operand must be a power-of-two byte-sized"
+                 " integer");
+
+ AtomicCmpXchgInst *CXI =
+ new AtomicCmpXchgInst(Ptr, Cmp, New, Ordering, Scope);
+ CXI->setVolatile(isVolatile);
+ Inst = CXI;
return AteExtraComma ? InstExtraComma : InstNormal;
}
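+
+// For reference (in 3.0, cmpxchg takes a single ordering and yields the old
+// value; names are illustrative):
+//   %old = cmpxchg i32* %p, i32 %cmp, i32 %new seq_cst
+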
+/// ParseAtomicRMW
+/// ::= 'atomicrmw' 'volatile'? BinOp TypeAndValue ',' TypeAndValue
+/// 'singlethread'? AtomicOrdering
+int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
+ Value *Ptr, *Val; LocTy PtrLoc, ValLoc;
+ bool AteExtraComma = false;
+ AtomicOrdering Ordering = NotAtomic;
+ SynchronizationScope Scope = CrossThread;
+ bool isVolatile = false;
+ AtomicRMWInst::BinOp Operation;
+
+ if (EatIfPresent(lltok::kw_volatile))
+ isVolatile = true;
+
+ switch (Lex.getKind()) {
+ default: return TokError("expected binary operation in atomicrmw");
+ case lltok::kw_xchg: Operation = AtomicRMWInst::Xchg; break;
+ case lltok::kw_add: Operation = AtomicRMWInst::Add; break;
+ case lltok::kw_sub: Operation = AtomicRMWInst::Sub; break;
+ case lltok::kw_and: Operation = AtomicRMWInst::And; break;
+ case lltok::kw_nand: Operation = AtomicRMWInst::Nand; break;
+ case lltok::kw_or: Operation = AtomicRMWInst::Or; break;
+ case lltok::kw_xor: Operation = AtomicRMWInst::Xor; break;
+ case lltok::kw_max: Operation = AtomicRMWInst::Max; break;
+ case lltok::kw_min: Operation = AtomicRMWInst::Min; break;
+ case lltok::kw_umax: Operation = AtomicRMWInst::UMax; break;
+ case lltok::kw_umin: Operation = AtomicRMWInst::UMin; break;
+ }
+ Lex.Lex(); // Eat the operation.
+
+ if (ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
+ ParseToken(lltok::comma, "expected ',' after atomicrmw address") ||
+ ParseTypeAndValue(Val, ValLoc, PFS) ||
+ ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+ return true;
+
+ if (Ordering == Unordered)
+ return TokError("atomicrmw cannot be unordered");
+ if (!Ptr->getType()->isPointerTy())
+ return Error(PtrLoc, "atomicrmw operand must be a pointer");
+ if (cast<PointerType>(Ptr->getType())->getElementType() != Val->getType())
+ return Error(ValLoc, "atomicrmw value and pointer type do not match");
+ if (!Val->getType()->isIntegerTy())
+ return Error(ValLoc, "atomicrmw operand must be an integer");
+ unsigned Size = Val->getType()->getPrimitiveSizeInBits();
+ if (Size < 8 || (Size & (Size - 1)))
+    return Error(ValLoc, "atomicrmw operand must be a power-of-two byte-sized"
+                 " integer");
+
+ AtomicRMWInst *RMWI =
+ new AtomicRMWInst(Operation, Ptr, Val, Ordering, Scope);
+ RMWI->setVolatile(isVolatile);
+ Inst = RMWI;
+ return AteExtraComma ? InstExtraComma : InstNormal;
+}
+
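+// For reference (operation, names, and orderings are illustrative):
+//   %old = atomicrmw add i32* %p, i32 1 seq_cst
+//   %old = atomicrmw volatile xchg i32* %p, i32 %v singlethread monotonic
+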
+/// ParseFence
+/// ::= 'fence' 'singlethread'? AtomicOrdering
+int LLParser::ParseFence(Instruction *&Inst, PerFunctionState &PFS) {
+ AtomicOrdering Ordering = NotAtomic;
+ SynchronizationScope Scope = CrossThread;
+ if (ParseScopeAndOrdering(true /*Always atomic*/, Scope, Ordering))
+ return true;
+
+ if (Ordering == Unordered)
+ return TokError("fence cannot be unordered");
+ if (Ordering == Monotonic)
+ return TokError("fence cannot be monotonic");
+
+ Inst = new FenceInst(Context, Ordering, Scope);
+ return InstNormal;
+}
+
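+// For reference:
+//   fence seq_cst
+//   fence singlethread acquire
+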
/// ParseGetElementPtr
/// ::= 'getelementptr' 'inbounds'? TypeAndValue (',' TypeAndValue)*
int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
@@ -3663,10 +3924,9 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
Indices.push_back(Val);
}
- if (!GetElementPtrInst::getIndexedType(Ptr->getType(),
- Indices.begin(), Indices.end()))
+ if (!GetElementPtrInst::getIndexedType(Ptr->getType(), Indices))
return Error(Loc, "invalid getelementptr indices");
- Inst = GetElementPtrInst::Create(Ptr, Indices.begin(), Indices.end());
+ Inst = GetElementPtrInst::Create(Ptr, Indices);
if (InBounds)
cast<GetElementPtrInst>(Inst)->setIsInBounds(true);
return AteExtraComma ? InstExtraComma : InstNormal;
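+
+// For reference, a form the grammar above accepts (types and names are
+// illustrative):
+//   %elt = getelementptr inbounds [4 x i32]* %arr, i64 0, i64 2
+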
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 9630657..cbc3c23 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -15,6 +15,7 @@
#define LLVM_ASMPARSER_LLPARSER_H
#include "LLLexer.h"
+#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
@@ -142,8 +143,8 @@ namespace llvm {
/// GetGlobalVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
- GlobalValue *GetGlobalVal(const std::string &N, const Type *Ty, LocTy Loc);
- GlobalValue *GetGlobalVal(unsigned ID, const Type *Ty, LocTy Loc);
+ GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc);
+ GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc);
// Helper Routines.
bool ParseToken(lltok::Kind T, const char *ErrMsg);
@@ -178,6 +179,8 @@ namespace llvm {
bool ParseOptionalVisibility(unsigned &Visibility);
bool ParseOptionalCallingConv(CallingConv::ID &CC);
bool ParseOptionalAlignment(unsigned &Alignment);
+ bool ParseScopeAndOrdering(bool isAtomic, SynchronizationScope &Scope,
+ AtomicOrdering &Ordering);
bool ParseOptionalStackAlignment(unsigned &Alignment);
bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
bool ParseIndexList(SmallVectorImpl<unsigned> &Indices,bool &AteExtraComma);
@@ -249,8 +252,8 @@ namespace llvm {
/// GetVal - Get a value with the specified name or ID, creating a
/// forward reference record if needed. This can return null if the value
/// exists but does not have the right type.
- Value *GetVal(const std::string &Name, const Type *Ty, LocTy Loc);
- Value *GetVal(unsigned ID, const Type *Ty, LocTy Loc);
+ Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc);
+ Value *GetVal(unsigned ID, Type *Ty, LocTy Loc);
/// SetInstName - After an instruction is parsed and inserted into its
/// basic block, this installs its name.
@@ -269,14 +272,14 @@ namespace llvm {
BasicBlock *DefineBB(const std::string &Name, LocTy Loc);
};
- bool ConvertValIDToValue(const Type *Ty, ValID &ID, Value *&V,
+ bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
PerFunctionState *PFS);
- bool ParseValue(const Type *Ty, Value *&V, PerFunctionState *PFS);
- bool ParseValue(const Type *Ty, Value *&V, PerFunctionState &PFS) {
+ bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS);
+ bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) {
return ParseValue(Ty, V, &PFS);
}
- bool ParseValue(const Type *Ty, Value *&V, LocTy &Loc,
+ bool ParseValue(Type *Ty, Value *&V, LocTy &Loc,
PerFunctionState &PFS) {
Loc = Lex.getLoc();
return ParseValue(Ty, V, &PFS);
@@ -310,7 +313,7 @@ namespace llvm {
// Constant Parsing.
bool ParseValID(ValID &ID, PerFunctionState *PFS = NULL);
- bool ParseGlobalValue(const Type *Ty, Constant *&V);
+ bool ParseGlobalValue(Type *Ty, Constant *&V);
bool ParseGlobalTypeAndValue(Constant *&V);
bool ParseGlobalValueVector(SmallVectorImpl<Constant*> &Elts);
bool ParseMetadataListValue(ValID &ID, PerFunctionState *PFS);
@@ -344,6 +347,7 @@ namespace llvm {
bool ParseSwitch(Instruction *&Inst, PerFunctionState &PFS);
bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS);
bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS);
+ bool ParseResume(Instruction *&Inst, PerFunctionState &PFS);
bool ParseArithmetic(Instruction *&I, PerFunctionState &PFS, unsigned Opc,
unsigned OperandType);
@@ -356,10 +360,14 @@ namespace llvm {
bool ParseInsertElement(Instruction *&I, PerFunctionState &PFS);
bool ParseShuffleVector(Instruction *&I, PerFunctionState &PFS);
int ParsePHI(Instruction *&I, PerFunctionState &PFS);
+ bool ParseLandingPad(Instruction *&I, PerFunctionState &PFS);
bool ParseCall(Instruction *&I, PerFunctionState &PFS, bool isTail);
int ParseAlloc(Instruction *&I, PerFunctionState &PFS);
int ParseLoad(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
int ParseStore(Instruction *&I, PerFunctionState &PFS, bool isVolatile);
+ int ParseCmpXchg(Instruction *&I, PerFunctionState &PFS);
+ int ParseAtomicRMW(Instruction *&I, PerFunctionState &PFS);
+ int ParseFence(Instruction *&I, PerFunctionState &PFS);
int ParseGetElementPtr(Instruction *&I, PerFunctionState &PFS);
int ParseExtractValue(Instruction *&I, PerFunctionState &PFS);
int ParseInsertValue(Instruction *&I, PerFunctionState &PFS);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index a5f89fc..8f16772 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -53,6 +53,9 @@ namespace lltok {
kw_deplibs,
kw_datalayout,
kw_volatile,
+ kw_atomic,
+ kw_unordered, kw_monotonic, kw_acquire, kw_release, kw_acq_rel, kw_seq_cst,
+ kw_singlethread,
kw_nuw,
kw_nsw,
kw_exact,
@@ -87,6 +90,7 @@ namespace lltok {
kw_readnone,
kw_readonly,
kw_uwtable,
+ kw_returns_twice,
kw_inlinehint,
kw_noinline,
@@ -97,7 +101,6 @@ namespace lltok {
kw_noredzone,
kw_noimplicitfloat,
kw_naked,
- kw_hotpatch,
kw_nonlazybind,
kw_type,
@@ -107,6 +110,9 @@ namespace lltok {
kw_uge, kw_oeq, kw_one, kw_olt, kw_ogt, kw_ole, kw_oge, kw_ord, kw_uno,
kw_ueq, kw_une,
+ // atomicrmw operations that aren't also instruction keywords.
+ kw_xchg, kw_nand, kw_max, kw_min, kw_umax, kw_umin,
+
// Instruction Opcodes (Opcode in UIntVal).
kw_add, kw_fadd, kw_sub, kw_fsub, kw_mul, kw_fmul,
kw_udiv, kw_sdiv, kw_fdiv,
@@ -118,10 +124,13 @@ namespace lltok {
kw_fptoui, kw_fptosi, kw_inttoptr, kw_ptrtoint, kw_bitcast,
kw_select, kw_va_arg,
- kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_unwind,
+ kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter,
+
+ kw_ret, kw_br, kw_switch, kw_indirectbr, kw_invoke, kw_unwind, kw_resume,
kw_unreachable,
- kw_alloca, kw_load, kw_store, kw_getelementptr,
+ kw_alloca, kw_load, kw_store, kw_fence, kw_cmpxchg, kw_atomicrmw,
+ kw_getelementptr,
kw_extractelement, kw_insertelement, kw_shufflevector,
kw_extractvalue, kw_insertvalue, kw_blockaddress,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 24c2994..46565f3 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -107,7 +107,7 @@ static int GetDecodedCastOpcode(unsigned Val) {
case bitc::CAST_BITCAST : return Instruction::BitCast;
}
}
-static int GetDecodedBinaryOpcode(unsigned Val, const Type *Ty) {
+static int GetDecodedBinaryOpcode(unsigned Val, Type *Ty) {
switch (Val) {
default: return -1;
case bitc::BINOP_ADD:
@@ -131,6 +131,44 @@ static int GetDecodedBinaryOpcode(unsigned Val, const Type *Ty) {
}
}
+static AtomicRMWInst::BinOp GetDecodedRMWOperation(unsigned Val) {
+ switch (Val) {
+ default: return AtomicRMWInst::BAD_BINOP;
+ case bitc::RMW_XCHG: return AtomicRMWInst::Xchg;
+ case bitc::RMW_ADD: return AtomicRMWInst::Add;
+ case bitc::RMW_SUB: return AtomicRMWInst::Sub;
+ case bitc::RMW_AND: return AtomicRMWInst::And;
+ case bitc::RMW_NAND: return AtomicRMWInst::Nand;
+ case bitc::RMW_OR: return AtomicRMWInst::Or;
+ case bitc::RMW_XOR: return AtomicRMWInst::Xor;
+ case bitc::RMW_MAX: return AtomicRMWInst::Max;
+ case bitc::RMW_MIN: return AtomicRMWInst::Min;
+ case bitc::RMW_UMAX: return AtomicRMWInst::UMax;
+ case bitc::RMW_UMIN: return AtomicRMWInst::UMin;
+ }
+}
+
+static AtomicOrdering GetDecodedOrdering(unsigned Val) {
+ switch (Val) {
+ case bitc::ORDERING_NOTATOMIC: return NotAtomic;
+ case bitc::ORDERING_UNORDERED: return Unordered;
+ case bitc::ORDERING_MONOTONIC: return Monotonic;
+ case bitc::ORDERING_ACQUIRE: return Acquire;
+ case bitc::ORDERING_RELEASE: return Release;
+ case bitc::ORDERING_ACQREL: return AcquireRelease;
+ default: // Map unknown orderings to sequentially-consistent.
+ case bitc::ORDERING_SEQCST: return SequentiallyConsistent;
+ }
+}
+
+static SynchronizationScope GetDecodedSynchScope(unsigned Val) {
+ switch (Val) {
+ case bitc::SYNCHSCOPE_SINGLETHREAD: return SingleThread;
+ default: // Map unknown scopes to cross-thread.
+ case bitc::SYNCHSCOPE_CROSSTHREAD: return CrossThread;
+ }
+}
+
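+// Note: the three decoders above must remain the inverses of
+// GetEncodedRMWOperation, GetEncodedOrdering, and GetEncodedSynchScope in
+// BitcodeWriter.cpp; e.g. GetDecodedOrdering(bitc::ORDERING_ACQUIRE) returns
+// Acquire, which the writer encodes back to bitc::ORDERING_ACQUIRE.
+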
namespace llvm {
namespace {
/// @brief A class for maintaining the slot number definition
@@ -142,7 +180,7 @@ namespace {
void *operator new(size_t s) {
return User::operator new(s, 1);
}
- explicit ConstantPlaceHolder(const Type *Ty, LLVMContext& Context)
+ explicit ConstantPlaceHolder(Type *Ty, LLVMContext& Context)
: ConstantExpr(Ty, Instruction::UserOp1, &Op<0>(), 1) {
Op<0>() = UndefValue::get(Type::getInt32Ty(Context));
}
@@ -198,7 +236,7 @@ void BitcodeReaderValueList::AssignValue(Value *V, unsigned Idx) {
Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx,
- const Type *Ty) {
+ Type *Ty) {
if (Idx >= size())
resize(Idx + 1);
@@ -213,7 +251,7 @@ Constant *BitcodeReaderValueList::getConstantFwdRef(unsigned Idx,
return C;
}
-Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, const Type *Ty) {
+Value *BitcodeReaderValueList::getValueFwdRef(unsigned Idx, Type *Ty) {
if (Idx >= size())
resize(Idx + 1);
@@ -362,7 +400,7 @@ Type *BitcodeReader::getTypeByID(unsigned ID) {
// If we have a forward reference, the only possible case is when it is to a
// named struct. Just create a placeholder for now.
- return TypeList[ID] = StructType::createNamed(Context, "");
+ return TypeList[ID] = StructType::create(Context);
}
/// FIXME: Remove in LLVM 3.1, only used by ParseOldTypeTable.
@@ -630,7 +668,7 @@ bool BitcodeReader::ParseTypeTableBody() {
Res->setName(TypeName);
TypeList[NumRecords] = 0;
} else // Otherwise, create a new struct.
- Res = StructType::createNamed(Context, TypeName);
+ Res = StructType::create(Context, TypeName);
TypeName.clear();
SmallVector<Type*, 8> EltTys;
@@ -659,7 +697,7 @@ bool BitcodeReader::ParseTypeTableBody() {
Res->setName(TypeName);
TypeList[NumRecords] = 0;
} else // Otherwise, create a new struct with no body.
- Res = StructType::createNamed(Context, TypeName);
+ Res = StructType::create(Context, TypeName);
TypeName.clear();
ResultTy = Res;
break;
@@ -793,7 +831,7 @@ RestartScan:
break;
case bitc::TYPE_CODE_OPAQUE: // OPAQUE
if (NextTypeID < TypeList.size() && TypeList[NextTypeID] == 0)
- ResultTy = StructType::createNamed(Context, "");
+ ResultTy = StructType::create(Context);
break;
case bitc::TYPE_CODE_STRUCT_OLD: {// STRUCT_OLD
if (NextTypeID >= TypeList.size()) break;
@@ -804,7 +842,7 @@ RestartScan:
// Set a type.
if (TypeList[NextTypeID] == 0)
- TypeList[NextTypeID] = StructType::createNamed(Context, "");
+ TypeList[NextTypeID] = StructType::create(Context);
std::vector<Type*> EltTys;
for (unsigned i = 1, e = Record.size(); i != e; ++i) {
@@ -923,7 +961,7 @@ bool BitcodeReader::ParseOldTypeSymbolTable() {
// Only apply the type name to a struct type with no name.
if (StructType *STy = dyn_cast<StructType>(TypeList[TypeID]))
- if (!STy->isAnonymous() && !STy->hasName())
+ if (!STy->isLiteral() && !STy->hasName())
STy->setName(TypeName);
TypeName.clear();
break;
@@ -1063,7 +1101,7 @@ bool BitcodeReader::ParseMetadata() {
unsigned Size = Record.size();
SmallVector<Value*, 8> Elts;
for (unsigned i = 0; i != Size; i += 2) {
- const Type *Ty = getTypeByID(Record[i]);
+ Type *Ty = getTypeByID(Record[i]);
if (!Ty) return Error("Invalid METADATA_NODE record");
if (Ty->isMetadataTy())
Elts.push_back(MDValueList.getValueFwdRef(Record[i+1]));
@@ -1163,7 +1201,7 @@ bool BitcodeReader::ParseConstants() {
SmallVector<uint64_t, 64> Record;
// Read all the records for this value table.
- const Type *CurTy = Type::getInt32Ty(Context);
+ Type *CurTy = Type::getInt32Ty(Context);
unsigned NextCstNo = ValueList.size();
while (1) {
unsigned Code = Stream.ReadCode();
@@ -1218,7 +1256,7 @@ bool BitcodeReader::ParseConstants() {
Words[i] = DecodeSignRotatedValue(Record[i]);
V = ConstantInt::get(Context,
APInt(cast<IntegerType>(CurTy)->getBitWidth(),
- NumWords, &Words[0]));
+ Words));
break;
}
case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval]
@@ -1233,11 +1271,11 @@ bool BitcodeReader::ParseConstants() {
uint64_t Rearrange[2];
Rearrange[0] = (Record[1] & 0xffffLL) | (Record[0] << 16);
Rearrange[1] = Record[0] >> 48;
- V = ConstantFP::get(Context, APFloat(APInt(80, 2, Rearrange)));
+ V = ConstantFP::get(Context, APFloat(APInt(80, Rearrange)));
} else if (CurTy->isFP128Ty())
- V = ConstantFP::get(Context, APFloat(APInt(128, 2, &Record[0]), true));
+ V = ConstantFP::get(Context, APFloat(APInt(128, Record), true));
else if (CurTy->isPPC_FP128Ty())
- V = ConstantFP::get(Context, APFloat(APInt(128, 2, &Record[0])));
+ V = ConstantFP::get(Context, APFloat(APInt(128, Record)));
else
V = UndefValue::get(CurTy);
break;
@@ -1250,18 +1288,18 @@ bool BitcodeReader::ParseConstants() {
unsigned Size = Record.size();
std::vector<Constant*> Elts;
- if (const StructType *STy = dyn_cast<StructType>(CurTy)) {
+ if (StructType *STy = dyn_cast<StructType>(CurTy)) {
for (unsigned i = 0; i != Size; ++i)
Elts.push_back(ValueList.getConstantFwdRef(Record[i],
STy->getElementType(i)));
V = ConstantStruct::get(STy, Elts);
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(CurTy)) {
- const Type *EltTy = ATy->getElementType();
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(CurTy)) {
+ Type *EltTy = ATy->getElementType();
for (unsigned i = 0; i != Size; ++i)
Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy));
V = ConstantArray::get(ATy, Elts);
- } else if (const VectorType *VTy = dyn_cast<VectorType>(CurTy)) {
- const Type *EltTy = VTy->getElementType();
+ } else if (VectorType *VTy = dyn_cast<VectorType>(CurTy)) {
+ Type *EltTy = VTy->getElementType();
for (unsigned i = 0; i != Size; ++i)
Elts.push_back(ValueList.getConstantFwdRef(Record[i], EltTy));
V = ConstantVector::get(Elts);
@@ -1274,8 +1312,8 @@ bool BitcodeReader::ParseConstants() {
if (Record.empty())
return Error("Invalid CST_AGGREGATE record");
- const ArrayType *ATy = cast<ArrayType>(CurTy);
- const Type *EltTy = ATy->getElementType();
+ ArrayType *ATy = cast<ArrayType>(CurTy);
+ Type *EltTy = ATy->getElementType();
unsigned Size = Record.size();
std::vector<Constant*> Elts;
@@ -1288,8 +1326,8 @@ bool BitcodeReader::ParseConstants() {
if (Record.empty())
return Error("Invalid CST_AGGREGATE record");
- const ArrayType *ATy = cast<ArrayType>(CurTy);
- const Type *EltTy = ATy->getElementType();
+ ArrayType *ATy = cast<ArrayType>(CurTy);
+ Type *EltTy = ATy->getElementType();
unsigned Size = Record.size();
std::vector<Constant*> Elts;
@@ -1335,7 +1373,7 @@ bool BitcodeReader::ParseConstants() {
if (Opc < 0) {
V = UndefValue::get(CurTy); // Unknown cast.
} else {
- const Type *OpTy = getTypeByID(Record[1]);
+ Type *OpTy = getTypeByID(Record[1]);
if (!OpTy) return Error("Invalid CE_CAST record");
Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
V = ConstantExpr::getCast(Opc, Op, CurTy);
@@ -1347,16 +1385,14 @@ bool BitcodeReader::ParseConstants() {
if (Record.size() & 1) return Error("Invalid CE_GEP record");
SmallVector<Constant*, 16> Elts;
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
- const Type *ElTy = getTypeByID(Record[i]);
+ Type *ElTy = getTypeByID(Record[i]);
if (!ElTy) return Error("Invalid CE_GEP record");
Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
}
- if (BitCode == bitc::CST_CODE_CE_INBOUNDS_GEP)
- V = ConstantExpr::getInBoundsGetElementPtr(Elts[0], &Elts[1],
- Elts.size()-1);
- else
- V = ConstantExpr::getGetElementPtr(Elts[0], &Elts[1],
- Elts.size()-1);
+ ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
+ V = ConstantExpr::getGetElementPtr(Elts[0], Indices,
+ BitCode ==
+ bitc::CST_CODE_CE_INBOUNDS_GEP);
break;
}
case bitc::CST_CODE_CE_SELECT: // CE_SELECT: [opval#, opval#, opval#]
@@ -1368,7 +1404,7 @@ bool BitcodeReader::ParseConstants() {
break;
case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opval]
if (Record.size() < 3) return Error("Invalid CE_EXTRACTELT record");
- const VectorType *OpTy =
+ VectorType *OpTy =
dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
if (OpTy == 0) return Error("Invalid CE_EXTRACTELT record");
Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
@@ -1377,7 +1413,7 @@ bool BitcodeReader::ParseConstants() {
break;
}
case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opval]
- const VectorType *OpTy = dyn_cast<VectorType>(CurTy);
+ VectorType *OpTy = dyn_cast<VectorType>(CurTy);
if (Record.size() < 3 || OpTy == 0)
return Error("Invalid CE_INSERTELT record");
Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
@@ -1388,26 +1424,26 @@ bool BitcodeReader::ParseConstants() {
break;
}
case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
- const VectorType *OpTy = dyn_cast<VectorType>(CurTy);
+ VectorType *OpTy = dyn_cast<VectorType>(CurTy);
if (Record.size() < 3 || OpTy == 0)
return Error("Invalid CE_SHUFFLEVEC record");
Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
- const Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
+ Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
OpTy->getNumElements());
Constant *Op2 = ValueList.getConstantFwdRef(Record[2], ShufTy);
V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
break;
}
case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval]
- const VectorType *RTy = dyn_cast<VectorType>(CurTy);
- const VectorType *OpTy =
+ VectorType *RTy = dyn_cast<VectorType>(CurTy);
+ VectorType *OpTy =
dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
if (Record.size() < 4 || RTy == 0 || OpTy == 0)
return Error("Invalid CE_SHUFVEC_EX record");
Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
- const Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
+ Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
RTy->getNumElements());
Constant *Op2 = ValueList.getConstantFwdRef(Record[3], ShufTy);
V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
@@ -1415,7 +1451,7 @@ bool BitcodeReader::ParseConstants() {
}
case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred]
if (Record.size() < 4) return Error("Invalid CE_CMP record");
- const Type *OpTy = getTypeByID(Record[0]);
+ Type *OpTy = getTypeByID(Record[0]);
if (OpTy == 0) return Error("Invalid CE_CMP record");
Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
@@ -1442,14 +1478,14 @@ bool BitcodeReader::ParseConstants() {
AsmStr += (char)Record[2+i];
for (unsigned i = 0; i != ConstStrSize; ++i)
ConstrStr += (char)Record[3+AsmStrSize+i];
- const PointerType *PTy = cast<PointerType>(CurTy);
+ PointerType *PTy = cast<PointerType>(CurTy);
V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
break;
}
case bitc::CST_CODE_BLOCKADDRESS:{
if (Record.size() < 3) return Error("Invalid CE_BLOCKADDRESS record");
- const Type *FnTy = getTypeByID(Record[0]);
+ Type *FnTy = getTypeByID(Record[0]);
if (FnTy == 0) return Error("Invalid CE_BLOCKADDRESS record");
Function *Fn =
dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
@@ -1662,7 +1698,7 @@ bool BitcodeReader::ParseModule() {
case bitc::MODULE_CODE_GLOBALVAR: {
if (Record.size() < 6)
return Error("Invalid MODULE_CODE_GLOBALVAR record");
- const Type *Ty = getTypeByID(Record[0]);
+ Type *Ty = getTypeByID(Record[0]);
if (!Ty) return Error("Invalid MODULE_CODE_GLOBALVAR record");
if (!Ty->isPointerTy())
return Error("Global not a pointer type!");
@@ -1711,11 +1747,11 @@ bool BitcodeReader::ParseModule() {
case bitc::MODULE_CODE_FUNCTION: {
if (Record.size() < 8)
return Error("Invalid MODULE_CODE_FUNCTION record");
- const Type *Ty = getTypeByID(Record[0]);
+ Type *Ty = getTypeByID(Record[0]);
if (!Ty) return Error("Invalid MODULE_CODE_FUNCTION record");
if (!Ty->isPointerTy())
return Error("Function not a pointer type!");
- const FunctionType *FTy =
+ FunctionType *FTy =
dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
if (!FTy)
return Error("Function not a pointer to function type!");
@@ -1757,7 +1793,7 @@ bool BitcodeReader::ParseModule() {
case bitc::MODULE_CODE_ALIAS: {
if (Record.size() < 3)
return Error("Invalid MODULE_ALIAS record");
- const Type *Ty = getTypeByID(Record[0]);
+ Type *Ty = getTypeByID(Record[0]);
if (!Ty) return Error("Invalid MODULE_ALIAS record");
if (!Ty->isPointerTy())
return Error("Function not a pointer type!");
@@ -1823,9 +1859,9 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
if (Code != bitc::ENTER_SUBBLOCK) {
- // The ranlib in xcode 4 will align archive members by appending newlines to the
- // end of them. If this file size is a multiple of 4 but not 8, we have to read and
- // ignore these final 4 bytes :-(
+      // The ranlib in Xcode 4 will align archive members by appending newlines
+      // to the end of them. If the file size is a multiple of 4 but not 8, we
+      // have to read and ignore these final 4 bytes :-(
if (Stream.GetAbbrevIDWidth() == 2 && Code == 2 &&
Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
Stream.AtEndOfStream())
@@ -2160,7 +2196,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
OpNum+2 != Record.size())
return Error("Invalid CAST record");
- const Type *ResTy = getTypeByID(Record[OpNum]);
+ Type *ResTy = getTypeByID(Record[OpNum]);
int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
if (Opc == -1 || ResTy == 0)
return Error("Invalid CAST record");
@@ -2183,7 +2219,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
GEPIdx.push_back(Op);
}
- I = GetElementPtrInst::Create(BasePtr, GEPIdx.begin(), GEPIdx.end());
+ I = GetElementPtrInst::Create(BasePtr, GEPIdx);
InstructionList.push_back(I);
if (BitCode == bitc::FUNC_CODE_INST_INBOUNDS_GEP)
cast<GetElementPtrInst>(I)->setIsInBounds(true);
@@ -2261,8 +2297,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
return Error("Invalid SELECT record");
// select condition can be either i1 or [N x i1]
- if (const VectorType* vector_type =
- dyn_cast<const VectorType>(Cond->getType())) {
+ if (VectorType* vector_type =
+ dyn_cast<VectorType>(Cond->getType())) {
// expect <n x i1>
if (vector_type->getElementType() != Type::getInt1Ty(Context))
return Error("Invalid SELECT condition type");
@@ -2381,7 +2417,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...]
if (Record.size() < 3 || (Record.size() & 1) == 0)
return Error("Invalid SWITCH record");
- const Type *OpTy = getTypeByID(Record[0]);
+ Type *OpTy = getTypeByID(Record[0]);
Value *Cond = getFnValueByID(Record[1], OpTy);
BasicBlock *Default = getBasicBlock(Record[2]);
if (OpTy == 0 || Cond == 0 || Default == 0)
@@ -2405,7 +2441,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...]
if (Record.size() < 2)
return Error("Invalid INDIRECTBR record");
- const Type *OpTy = getTypeByID(Record[0]);
+ Type *OpTy = getTypeByID(Record[0]);
Value *Address = getFnValueByID(Record[1], OpTy);
if (OpTy == 0 || Address == 0)
return Error("Invalid INDIRECTBR record");
@@ -2437,8 +2473,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
return Error("Invalid INVOKE record");
- const PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
- const FunctionType *FTy = !CalleeTy ? 0 :
+ PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
+ FunctionType *FTy = !CalleeTy ? 0 :
dyn_cast<FunctionType>(CalleeTy->getElementType());
// Check that the right number of fixed parameters are here.
@@ -2472,6 +2508,15 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
cast<InvokeInst>(I)->setAttributes(PAL);
break;
}
+ case bitc::FUNC_CODE_INST_RESUME: { // RESUME: [opval]
+ unsigned Idx = 0;
+ Value *Val = 0;
+ if (getValueTypePair(Record, Idx, NextValueNo, Val))
+ return Error("Invalid RESUME record");
+ I = ResumeInst::Create(Val);
+ InstructionList.push_back(I);
+ break;
+ }
case bitc::FUNC_CODE_INST_UNWIND: // UNWIND
I = new UnwindInst(Context);
InstructionList.push_back(I);
@@ -2483,7 +2528,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
if (Record.size() < 1 || ((Record.size()-1)&1))
return Error("Invalid PHI record");
- const Type *Ty = getTypeByID(Record[0]);
+ Type *Ty = getTypeByID(Record[0]);
if (!Ty) return Error("Invalid PHI record");
PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
@@ -2499,12 +2544,51 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
break;
}
+ case bitc::FUNC_CODE_INST_LANDINGPAD: {
+ // LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?]
+ unsigned Idx = 0;
+ if (Record.size() < 4)
+ return Error("Invalid LANDINGPAD record");
+ Type *Ty = getTypeByID(Record[Idx++]);
+ if (!Ty) return Error("Invalid LANDINGPAD record");
+ Value *PersFn = 0;
+ if (getValueTypePair(Record, Idx, NextValueNo, PersFn))
+ return Error("Invalid LANDINGPAD record");
+
+ bool IsCleanup = !!Record[Idx++];
+ unsigned NumClauses = Record[Idx++];
+ LandingPadInst *LP = LandingPadInst::Create(Ty, PersFn, NumClauses);
+ LP->setCleanup(IsCleanup);
+ for (unsigned J = 0; J != NumClauses; ++J) {
+ LandingPadInst::ClauseType CT =
+ LandingPadInst::ClauseType(Record[Idx++]); (void)CT;
+ Value *Val;
+
+ if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
+ delete LP;
+ return Error("Invalid LANDINGPAD record");
+ }
+
+      assert((CT != LandingPadInst::Catch ||
+              !isa<ArrayType>(Val->getType())) &&
+             "Catch clause has an invalid type!");
+      assert((CT != LandingPadInst::Filter ||
+              isa<ArrayType>(Val->getType())) &&
+             "Filter clause has an invalid type!");
+ LP->addClause(Val);
+ }
+
+ I = LP;
+ InstructionList.push_back(I);
+ break;
+ }
+
case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align]
if (Record.size() != 4)
return Error("Invalid ALLOCA record");
- const PointerType *Ty =
+ PointerType *Ty =
dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
- const Type *OpTy = getTypeByID(Record[1]);
+ Type *OpTy = getTypeByID(Record[1]);
Value *Size = getFnValueByID(Record[2], OpTy);
unsigned Align = Record[3];
if (!Ty || !Size) return Error("Invalid ALLOCA record");
@@ -2523,6 +2607,28 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
InstructionList.push_back(I);
break;
}
+ case bitc::FUNC_CODE_INST_LOADATOMIC: {
+ // LOADATOMIC: [opty, op, align, vol, ordering, synchscope]
+ unsigned OpNum = 0;
+ Value *Op;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
+ OpNum+4 != Record.size())
+ return Error("Invalid LOADATOMIC record");
+
+ AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
+ if (Ordering == NotAtomic || Ordering == Release ||
+ Ordering == AcquireRelease)
+ return Error("Invalid LOADATOMIC record");
+ if (Ordering != NotAtomic && Record[OpNum] == 0)
+ return Error("Invalid LOADATOMIC record");
+ SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
+
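+        // Alignment is encoded as log2(alignment)+1, with 0 meaning
+        // "unspecified"; (1 << A) >> 1 inverts that encoding.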
+ I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1,
+ Ordering, SynchScope);
+ InstructionList.push_back(I);
+ break;
+ }
case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
unsigned OpNum = 0;
Value *Val, *Ptr;
@@ -2536,6 +2642,83 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
InstructionList.push_back(I);
break;
}
+ case bitc::FUNC_CODE_INST_STOREATOMIC: {
+ // STOREATOMIC: [ptrty, ptr, val, align, vol, ordering, synchscope]
+ unsigned OpNum = 0;
+ Value *Val, *Ptr;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+ getValue(Record, OpNum,
+ cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
+ OpNum+4 != Record.size())
+ return Error("Invalid STOREATOMIC record");
+
+ AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
+ if (Ordering == NotAtomic || Ordering == Acquire ||
+ Ordering == AcquireRelease)
+ return Error("Invalid STOREATOMIC record");
+ SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
+ if (Ordering != NotAtomic && Record[OpNum] == 0)
+ return Error("Invalid STOREATOMIC record");
+
+ I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1,
+ Ordering, SynchScope);
+ InstructionList.push_back(I);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_CMPXCHG: {
+ // CMPXCHG:[ptrty, ptr, cmp, new, vol, ordering, synchscope]
+ unsigned OpNum = 0;
+ Value *Ptr, *Cmp, *New;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+ getValue(Record, OpNum,
+ cast<PointerType>(Ptr->getType())->getElementType(), Cmp) ||
+ getValue(Record, OpNum,
+ cast<PointerType>(Ptr->getType())->getElementType(), New) ||
+ OpNum+3 != Record.size())
+ return Error("Invalid CMPXCHG record");
+ AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+1]);
+ if (Ordering == NotAtomic || Ordering == Unordered)
+ return Error("Invalid CMPXCHG record");
+ SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]);
+ I = new AtomicCmpXchgInst(Ptr, Cmp, New, Ordering, SynchScope);
+ cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
+ InstructionList.push_back(I);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_ATOMICRMW: {
+ // ATOMICRMW:[ptrty, ptr, val, op, vol, ordering, synchscope]
+ unsigned OpNum = 0;
+ Value *Ptr, *Val;
+ if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
+ getValue(Record, OpNum,
+ cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
+ OpNum+4 != Record.size())
+ return Error("Invalid ATOMICRMW record");
+ AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]);
+ if (Operation < AtomicRMWInst::FIRST_BINOP ||
+ Operation > AtomicRMWInst::LAST_BINOP)
+ return Error("Invalid ATOMICRMW record");
+ AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
+ if (Ordering == NotAtomic || Ordering == Unordered)
+ return Error("Invalid ATOMICRMW record");
+ SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
+ I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
+ cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
+ InstructionList.push_back(I);
+ break;
+ }
+ case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
+ if (2 != Record.size())
+ return Error("Invalid FENCE record");
+ AtomicOrdering Ordering = GetDecodedOrdering(Record[0]);
+ if (Ordering == NotAtomic || Ordering == Unordered ||
+ Ordering == Monotonic)
+ return Error("Invalid FENCE record");
+ SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]);
+ I = new FenceInst(Context, Ordering, SynchScope);
+ InstructionList.push_back(I);
+ break;
+ }
case bitc::FUNC_CODE_INST_CALL: {
// CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
if (Record.size() < 3)
@@ -2549,8 +2732,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
return Error("Invalid CALL record");
- const PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
- const FunctionType *FTy = 0;
+ PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
+ FunctionType *FTy = 0;
if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
return Error("Invalid CALL record");
@@ -2589,9 +2772,9 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
if (Record.size() < 3)
return Error("Invalid VAARG record");
- const Type *OpTy = getTypeByID(Record[0]);
+ Type *OpTy = getTypeByID(Record[0]);
Value *Op = getFnValueByID(Record[1], OpTy);
- const Type *ResTy = getTypeByID(Record[2]);
+ Type *ResTy = getTypeByID(Record[2]);
if (!OpTy || !Op || !ResTy)
return Error("Invalid VAARG record");
I = new VAArgInst(Op, ResTy);
@@ -2756,6 +2939,9 @@ bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
}
std::vector<std::pair<Function*, Function*> >().swap(UpgradedIntrinsics);
+ // Upgrade to new EH scheme. N.B. This will go away in 3.1.
+ UpgradeExceptionHandling(M);
+
// Check debug info intrinsics.
CheckDebugInfoIntrinsics(TheModule);
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index 1b3bf1a..6e6118c 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -76,8 +76,8 @@ public:
ValuePtrs.resize(N);
}
- Constant *getConstantFwdRef(unsigned Idx, const Type *Ty);
- Value *getValueFwdRef(unsigned Idx, const Type *Ty);
+ Constant *getConstantFwdRef(unsigned Idx, Type *Ty);
+ Value *getValueFwdRef(unsigned Idx, Type *Ty);
void AssignValue(Value *V, unsigned Idx);
@@ -212,7 +212,7 @@ public:
private:
Type *getTypeByID(unsigned ID);
Type *getTypeByIDOrNull(unsigned ID);
- Value *getFnValueByID(unsigned ID, const Type *Ty) {
+ Value *getFnValueByID(unsigned ID, Type *Ty) {
if (Ty && Ty->isMetadataTy())
return MDValueList.getValueFwdRef(ID);
return ValueList.getValueFwdRef(ID, Ty);
@@ -248,7 +248,7 @@ private:
return ResVal == 0;
}
bool getValue(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
- const Type *Ty, Value *&ResVal) {
+ Type *Ty, Value *&ResVal) {
if (Slot == Record.size()) return true;
unsigned ValNo = (unsigned)Record[Slot++];
ResVal = getFnValueByID(ValNo, Ty);
diff --git a/lib/Bitcode/Reader/CMakeLists.txt b/lib/Bitcode/Reader/CMakeLists.txt
index 693d431..37bebc4 100644
--- a/lib/Bitcode/Reader/CMakeLists.txt
+++ b/lib/Bitcode/Reader/CMakeLists.txt
@@ -2,3 +2,8 @@ add_llvm_library(LLVMBitReader
BitReader.cpp
BitcodeReader.cpp
)
+
+add_llvm_library_dependencies(LLVMBitReader
+ LLVMCore
+ LLVMSupport
+ )
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 85d67ce..5b3d969 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -58,7 +58,6 @@ enum {
FUNCTION_INST_UNREACHABLE_ABBREV
};
-
static unsigned GetEncodedCastOpcode(unsigned Opcode) {
switch (Opcode) {
default: llvm_unreachable("Unknown cast instruction!");
@@ -101,6 +100,44 @@ static unsigned GetEncodedBinaryOpcode(unsigned Opcode) {
}
}
+static unsigned GetEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
+ switch (Op) {
+ default: llvm_unreachable("Unknown RMW operation!");
+ case AtomicRMWInst::Xchg: return bitc::RMW_XCHG;
+ case AtomicRMWInst::Add: return bitc::RMW_ADD;
+ case AtomicRMWInst::Sub: return bitc::RMW_SUB;
+ case AtomicRMWInst::And: return bitc::RMW_AND;
+ case AtomicRMWInst::Nand: return bitc::RMW_NAND;
+ case AtomicRMWInst::Or: return bitc::RMW_OR;
+ case AtomicRMWInst::Xor: return bitc::RMW_XOR;
+ case AtomicRMWInst::Max: return bitc::RMW_MAX;
+ case AtomicRMWInst::Min: return bitc::RMW_MIN;
+ case AtomicRMWInst::UMax: return bitc::RMW_UMAX;
+ case AtomicRMWInst::UMin: return bitc::RMW_UMIN;
+ }
+}
+
+static unsigned GetEncodedOrdering(AtomicOrdering Ordering) {
+ switch (Ordering) {
+ default: llvm_unreachable("Unknown atomic ordering");
+ case NotAtomic: return bitc::ORDERING_NOTATOMIC;
+ case Unordered: return bitc::ORDERING_UNORDERED;
+ case Monotonic: return bitc::ORDERING_MONOTONIC;
+ case Acquire: return bitc::ORDERING_ACQUIRE;
+ case Release: return bitc::ORDERING_RELEASE;
+ case AcquireRelease: return bitc::ORDERING_ACQREL;
+ case SequentiallyConsistent: return bitc::ORDERING_SEQCST;
+ }
+}
+
+static unsigned GetEncodedSynchScope(SynchronizationScope SynchScope) {
+ switch (SynchScope) {
+ default: llvm_unreachable("Unknown synchronization scope");
+ case SingleThread: return bitc::SYNCHSCOPE_SINGLETHREAD;
+ case CrossThread: return bitc::SYNCHSCOPE_CROSSTHREAD;
+ }
+}
+
static void WriteStringRecord(unsigned Code, StringRef Str,
unsigned AbbrevToUse, BitstreamWriter &Stream) {
SmallVector<unsigned, 64> Vals;
@@ -199,7 +236,6 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
Log2_32_Ceil(VE.getTypes().size()+1)));
unsigned StructNamedAbbrev = Stream.EmitAbbrev(Abbv);
-
// Abbrev for TYPE_CODE_ARRAY.
Abbv = new BitCodeAbbrev();
@@ -216,7 +252,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
// Loop over all of the types, emitting each in turn.
for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
- const Type *T = TypeList[i];
+ Type *T = TypeList[i];
int AbbrevToUse = 0;
unsigned Code = 0;
@@ -237,7 +273,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
TypeVals.push_back(cast<IntegerType>(T)->getBitWidth());
break;
case Type::PointerTyID: {
- const PointerType *PTy = cast<PointerType>(T);
+ PointerType *PTy = cast<PointerType>(T);
// POINTER: [pointee type, address space]
Code = bitc::TYPE_CODE_POINTER;
TypeVals.push_back(VE.getTypeID(PTy->getElementType()));
@@ -247,7 +283,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
break;
}
case Type::FunctionTyID: {
- const FunctionType *FT = cast<FunctionType>(T);
+ FunctionType *FT = cast<FunctionType>(T);
// FUNCTION: [isvararg, attrid, retty, paramty x N]
Code = bitc::TYPE_CODE_FUNCTION;
TypeVals.push_back(FT->isVarArg());
@@ -259,7 +295,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
break;
}
case Type::StructTyID: {
- const StructType *ST = cast<StructType>(T);
+ StructType *ST = cast<StructType>(T);
// STRUCT: [ispacked, eltty x N]
TypeVals.push_back(ST->isPacked());
// Output all of the element types.
@@ -267,7 +303,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
E = ST->element_end(); I != E; ++I)
TypeVals.push_back(VE.getTypeID(*I));
- if (ST->isAnonymous()) {
+ if (ST->isLiteral()) {
Code = bitc::TYPE_CODE_STRUCT_ANON;
AbbrevToUse = StructAnonAbbrev;
} else {
@@ -286,7 +322,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
break;
}
case Type::ArrayTyID: {
- const ArrayType *AT = cast<ArrayType>(T);
+ ArrayType *AT = cast<ArrayType>(T);
// ARRAY: [numelts, eltty]
Code = bitc::TYPE_CODE_ARRAY;
TypeVals.push_back(AT->getNumElements());
@@ -295,7 +331,7 @@ static void WriteTypeTable(const ValueEnumerator &VE, BitstreamWriter &Stream) {
break;
}
case Type::VectorTyID: {
- const VectorType *VT = cast<VectorType>(T);
+ VectorType *VT = cast<VectorType>(T);
// VECTOR [numelts, eltty]
Code = bitc::TYPE_CODE_VECTOR;
TypeVals.push_back(VT->getNumElements());
@@ -372,14 +408,15 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
GV != E; ++GV) {
MaxAlignment = std::max(MaxAlignment, GV->getAlignment());
MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV->getType()));
-
- if (!GV->hasSection()) continue;
- // Give section names unique ID's.
- unsigned &Entry = SectionMap[GV->getSection()];
- if (Entry != 0) continue;
- WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV->getSection(),
- 0/*TODO*/, Stream);
- Entry = SectionMap.size();
+ if (GV->hasSection()) {
+ // Give section names unique ID's.
+ unsigned &Entry = SectionMap[GV->getSection()];
+ if (!Entry) {
+ WriteStringRecord(bitc::MODULE_CODE_SECTIONNAME, GV->getSection(),
+ 0/*TODO*/, Stream);
+ Entry = SectionMap.size();
+ }
+ }
}
for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
MaxAlignment = std::max(MaxAlignment, F->getAlignment());
@@ -716,7 +753,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
SmallVector<uint64_t, 64> Record;
const ValueEnumerator::ValueList &Vals = VE.getValues();
- const Type *LastTy = 0;
+ Type *LastTy = 0;
for (unsigned i = FirstVal; i != LastVal; ++i) {
const Value *V = Vals[i].first;
// If we need to switch types, do so now.
@@ -781,7 +818,7 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
}
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
Code = bitc::CST_CODE_FLOAT;
- const Type *Ty = CFP->getType();
+ Type *Ty = CFP->getType();
if (Ty->isFloatTy() || Ty->isDoubleTy()) {
Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
} else if (Ty->isX86_FP80Ty()) {
@@ -1083,8 +1120,8 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::Invoke: {
const InvokeInst *II = cast<InvokeInst>(&I);
const Value *Callee(II->getCalledValue());
- const PointerType *PTy = cast<PointerType>(Callee->getType());
- const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
Code = bitc::FUNC_CODE_INST_INVOKE;
Vals.push_back(VE.getAttributeID(II->getAttributes()));
@@ -1105,6 +1142,10 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
}
break;
}
+ case Instruction::Resume:
+ Code = bitc::FUNC_CODE_INST_RESUME;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ break;
case Instruction::Unwind:
Code = bitc::FUNC_CODE_INST_UNWIND;
break;
@@ -1124,6 +1165,23 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
break;
}
+ case Instruction::LandingPad: {
+ const LandingPadInst &LP = cast<LandingPadInst>(I);
+ Code = bitc::FUNC_CODE_INST_LANDINGPAD;
+ Vals.push_back(VE.getTypeID(LP.getType()));
+ PushValueAndType(LP.getPersonalityFn(), InstID, Vals, VE);
+ Vals.push_back(LP.isCleanup());
+ Vals.push_back(LP.getNumClauses());
+ for (unsigned I = 0, E = LP.getNumClauses(); I != E; ++I) {
+ if (LP.isCatch(I))
+ Vals.push_back(LandingPadInst::Catch);
+ else
+ Vals.push_back(LandingPadInst::Filter);
+ PushValueAndType(LP.getClause(I), InstID, Vals, VE);
+ }
+ break;
+ }
+
case Instruction::Alloca:
Code = bitc::FUNC_CODE_INST_ALLOCA;
Vals.push_back(VE.getTypeID(I.getType()));
@@ -1133,24 +1191,66 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
break;
case Instruction::Load:
- Code = bitc::FUNC_CODE_INST_LOAD;
- if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE)) // ptr
- AbbrevToUse = FUNCTION_INST_LOAD_ABBREV;
-
+ if (cast<LoadInst>(I).isAtomic()) {
+ Code = bitc::FUNC_CODE_INST_LOADATOMIC;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE);
+ } else {
+ Code = bitc::FUNC_CODE_INST_LOAD;
+ if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE)) // ptr
+ AbbrevToUse = FUNCTION_INST_LOAD_ABBREV;
+ }
Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1);
Vals.push_back(cast<LoadInst>(I).isVolatile());
+ if (cast<LoadInst>(I).isAtomic()) {
+ Vals.push_back(GetEncodedOrdering(cast<LoadInst>(I).getOrdering()));
+ Vals.push_back(GetEncodedSynchScope(cast<LoadInst>(I).getSynchScope()));
+ }
break;
case Instruction::Store:
- Code = bitc::FUNC_CODE_INST_STORE;
+ if (cast<StoreInst>(I).isAtomic())
+ Code = bitc::FUNC_CODE_INST_STOREATOMIC;
+ else
+ Code = bitc::FUNC_CODE_INST_STORE;
PushValueAndType(I.getOperand(1), InstID, Vals, VE); // ptrty + ptr
Vals.push_back(VE.getValueID(I.getOperand(0))); // val.
Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
Vals.push_back(cast<StoreInst>(I).isVolatile());
+ if (cast<StoreInst>(I).isAtomic()) {
+ Vals.push_back(GetEncodedOrdering(cast<StoreInst>(I).getOrdering()));
+ Vals.push_back(GetEncodedSynchScope(cast<StoreInst>(I).getSynchScope()));
+ }
+ break;
+ case Instruction::AtomicCmpXchg:
+ Code = bitc::FUNC_CODE_INST_CMPXCHG;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE); // ptrty + ptr
+ Vals.push_back(VE.getValueID(I.getOperand(1))); // cmp.
+ Vals.push_back(VE.getValueID(I.getOperand(2))); // newval.
+ Vals.push_back(cast<AtomicCmpXchgInst>(I).isVolatile());
+ Vals.push_back(GetEncodedOrdering(
+ cast<AtomicCmpXchgInst>(I).getOrdering()));
+ Vals.push_back(GetEncodedSynchScope(
+ cast<AtomicCmpXchgInst>(I).getSynchScope()));
+ break;
+ case Instruction::AtomicRMW:
+ Code = bitc::FUNC_CODE_INST_ATOMICRMW;
+ PushValueAndType(I.getOperand(0), InstID, Vals, VE); // ptrty + ptr
+ Vals.push_back(VE.getValueID(I.getOperand(1))); // val.
+ Vals.push_back(GetEncodedRMWOperation(
+ cast<AtomicRMWInst>(I).getOperation()));
+ Vals.push_back(cast<AtomicRMWInst>(I).isVolatile());
+ Vals.push_back(GetEncodedOrdering(cast<AtomicRMWInst>(I).getOrdering()));
+ Vals.push_back(GetEncodedSynchScope(
+ cast<AtomicRMWInst>(I).getSynchScope()));
+ break;
+ case Instruction::Fence:
+ Code = bitc::FUNC_CODE_INST_FENCE;
+ Vals.push_back(GetEncodedOrdering(cast<FenceInst>(I).getOrdering()));
+ Vals.push_back(GetEncodedSynchScope(cast<FenceInst>(I).getSynchScope()));
break;
case Instruction::Call: {
const CallInst &CI = cast<CallInst>(I);
- const PointerType *PTy = cast<PointerType>(CI.getCalledValue()->getType());
- const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ PointerType *PTy = cast<PointerType>(CI.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
Code = bitc::FUNC_CODE_INST_CALL;
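For every atomic form above the pattern is the same: the record shares its leading fields with the non-atomic variant and appends the encoded memory ordering and synchronization scope as trailing fields, so readers of the plain records are unaffected. A schematic sketch for the atomic-load case, with hypothetical stand-ins for the GetEncodedOrdering/GetEncodedSynchScope helpers:

#include <cstdint>
#include <vector>

// Hypothetical stand-ins for the writer's ordering/scope encodings.
enum Ordering { Unordered = 1, Monotonic, Acquire, Release,
                AcquireRelease, SequentiallyConsistent };
enum SynchScope { SingleThread = 0, CrossThread = 1 };

// Sketch: a LOADATOMIC record is a LOAD record (ptr, alignment,
// volatile) with two extra trailing fields.
static void encodeAtomicLoad(std::vector<uint64_t> &Vals, uint64_t PtrID,
                             unsigned AlignLog2, bool Volatile,
                             Ordering O, SynchScope S) {
  Vals.push_back(PtrID);
  Vals.push_back(AlignLog2 + 1); // Log2_32(align) + 1, as above
  Vals.push_back(Volatile);
  Vals.push_back(O);             // GetEncodedOrdering would go here
  Vals.push_back(S);             // GetEncodedSynchScope would go here
}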
diff --git a/lib/Bitcode/Writer/CMakeLists.txt b/lib/Bitcode/Writer/CMakeLists.txt
index f097b09..3cf9056 100644
--- a/lib/Bitcode/Writer/CMakeLists.txt
+++ b/lib/Bitcode/Writer/CMakeLists.txt
@@ -4,3 +4,8 @@ add_llvm_library(LLVMBitWriter
BitcodeWriterPass.cpp
ValueEnumerator.cpp
)
+
+add_llvm_library_dependencies(LLVMBitWriter
+ LLVMCore
+ LLVMSupport
+ )
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index b68bf92..9ae9905 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -315,7 +315,7 @@ void ValueEnumerator::EnumerateValue(const Value *V) {
}
-void ValueEnumerator::EnumerateType(const Type *Ty) {
+void ValueEnumerator::EnumerateType(Type *Ty) {
unsigned *TypeID = &TypeMap[Ty];
// We've already seen this type.
@@ -325,8 +325,8 @@ void ValueEnumerator::EnumerateType(const Type *Ty) {
// If it is a non-anonymous struct, mark the type as being visited so that we
// don't recursively visit it. This is safe because we allow forward
// references of these in the bitcode reader.
- if (const StructType *STy = dyn_cast<StructType>(Ty))
- if (!STy->isAnonymous())
+ if (StructType *STy = dyn_cast<StructType>(Ty))
+ if (!STy->isLiteral())
*TypeID = ~0U;
// Enumerate all of the subtypes before we enumerate this type. This ensures
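EnumerateType can recurse into a named struct that (indirectly) contains itself, so it pre-marks the type with the sentinel ~0U before walking subtypes; literal structs are uniqued structurally and cannot be self-referential, so they skip the mark. A toy sketch of the pre-mark-then-visit pattern, with a hypothetical Ty node and std::map (whose references stay valid across insertions) in place of the DenseMap:

#include <map>
#include <vector>

struct Ty {                  // toy type node; named structs may self-refer
  std::vector<Ty*> Subtypes;
  bool Named;
};

static std::map<Ty*, unsigned> TypeIDs;
static unsigned NextID = 1;

static void enumerateType(Ty *T) {
  unsigned &ID = TypeIDs[T];
  if (ID) return;            // already numbered, or visit in progress
  if (T->Named)
    ID = ~0u;                // sentinel blocks infinite recursion
  for (size_t i = 0; i != T->Subtypes.size(); ++i)
    enumerateType(T->Subtypes[i]); // subtypes first, so lower IDs
  ID = NextID++;             // replace the sentinel with the real ID
}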
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 6617b60..b6fc920 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -35,12 +35,12 @@ class MDSymbolTable;
class ValueEnumerator {
public:
- typedef std::vector<const Type*> TypeList;
+ typedef std::vector<Type*> TypeList;
// For each value, we remember its Value* and occurrence frequency.
typedef std::vector<std::pair<const Value*, unsigned> > ValueList;
private:
- typedef DenseMap<const Type*, unsigned> TypeMapType;
+ typedef DenseMap<Type*, unsigned> TypeMapType;
TypeMapType TypeMap;
TypeList Types;
@@ -85,7 +85,7 @@ public:
unsigned getValueID(const Value *V) const;
- unsigned getTypeID(const Type *T) const {
+ unsigned getTypeID(Type *T) const {
TypeMapType::const_iterator I = TypeMap.find(T);
assert(I != TypeMap.end() && "Type not in ValueEnumerator!");
return I->second-1;
@@ -140,7 +140,7 @@ private:
void EnumerateFunctionLocalMetadata(const MDNode *N);
void EnumerateNamedMDNode(const NamedMDNode *NMD);
void EnumerateValue(const Value *V);
- void EnumerateType(const Type *T);
+ void EnumerateType(Type *T);
void EnumerateOperandType(const Value *V);
void EnumerateAttributes(const AttrListPtr &PAL);
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 80118f0..fb63c63 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,4 +1,4 @@
-# `Support' library is added on the top-level CMakeLists.txt
+# `Support' and `TableGen' libraries are added on the top-level CMakeLists.txt
add_subdirectory(VMCore)
add_subdirectory(CodeGen)
@@ -7,8 +7,8 @@ add_subdirectory(Transforms)
add_subdirectory(Linker)
add_subdirectory(Analysis)
add_subdirectory(MC)
-add_subdirectory(CompilerDriver)
add_subdirectory(Object)
+add_subdirectory(DebugInfo)
add_subdirectory(ExecutionEngine)
add_subdirectory(Target)
add_subdirectory(AsmParser)
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 125e641..fafc010 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
/// of insertvalue or extractvalue indices that identify a member, return
/// the linearized index of the start of the member.
///
-unsigned llvm::ComputeLinearIndex(const Type *Ty,
+unsigned llvm::ComputeLinearIndex(Type *Ty,
const unsigned *Indices,
const unsigned *IndicesEnd,
unsigned CurIndex) {
@@ -40,7 +40,7 @@ unsigned llvm::ComputeLinearIndex(const Type *Ty,
return CurIndex;
// Given a struct type, recursively traverse the elements.
- if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (StructType::element_iterator EB = STy->element_begin(),
EI = EB,
EE = STy->element_end();
@@ -52,8 +52,8 @@ unsigned llvm::ComputeLinearIndex(const Type *Ty,
return CurIndex;
}
// Given an array type, recursively traverse the elements.
- else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- const Type *EltTy = ATy->getElementType();
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ Type *EltTy = ATy->getElementType();
for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
if (Indices && *Indices == i)
return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
@@ -72,12 +72,12 @@ unsigned llvm::ComputeLinearIndex(const Type *Ty,
/// If Offsets is non-null, it points to a vector to be filled in
/// with the in-memory offsets of each of the individual values.
///
-void llvm::ComputeValueVTs(const TargetLowering &TLI, const Type *Ty,
+void llvm::ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
SmallVectorImpl<EVT> &ValueVTs,
SmallVectorImpl<uint64_t> *Offsets,
uint64_t StartingOffset) {
// Given a struct type, recursively traverse the elements.
- if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
const StructLayout *SL = TLI.getTargetData()->getStructLayout(STy);
for (StructType::element_iterator EB = STy->element_begin(),
EI = EB,
@@ -88,8 +88,8 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, const Type *Ty,
return;
}
// Given an array type, recursively traverse the elements.
- if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- const Type *EltTy = ATy->getElementType();
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ Type *EltTy = ATy->getElementType();
uint64_t EltSize = TLI.getTargetData()->getTypeAllocSize(EltTy);
for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
ComputeValueVTs(TLI, EltTy, ValueVTs, Offsets,
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 5861fa4..3f23873 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -27,7 +26,6 @@
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 7f314ee..1999f36 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -33,7 +33,6 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -45,6 +44,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Timer.h"
using namespace llvm;
@@ -290,10 +290,10 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// Handle common and BSS local symbols (.lcomm).
if (GVKind.isCommon() || GVKind.isBSSLocal()) {
if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+ unsigned Align = 1 << AlignLog;
// Handle common symbols.
if (GVKind.isCommon()) {
- unsigned Align = 1 << AlignLog;
if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
Align = 0;
@@ -307,17 +307,17 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
const MCSection *TheSection =
getObjFileLowering().SectionForGlobal(GV, GVKind, Mang, TM);
// .zerofill __DATA, __bss, _foo, 400, 5
- OutStreamer.EmitZerofill(TheSection, GVSym, Size, 1 << AlignLog);
+ OutStreamer.EmitZerofill(TheSection, GVSym, Size, Align);
return;
}
- if (MAI->hasLCOMMDirective()) {
+ if (MAI->getLCOMMDirectiveType() != LCOMM::None &&
+ (MAI->getLCOMMDirectiveType() != LCOMM::NoAlignment || Align == 1)) {
// .lcomm _foo, 42
- OutStreamer.EmitLocalCommonSymbol(GVSym, Size);
+ OutStreamer.EmitLocalCommonSymbol(GVSym, Size, Align);
return;
}
- unsigned Align = 1 << AlignLog;
if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
Align = 0;
@@ -474,8 +474,10 @@ void AsmPrinter::EmitFunctionHeader() {
void AsmPrinter::EmitFunctionEntryLabel() {
// The function label could have already been emitted if two symbols end up
// conflicting due to asm renaming. Detect this and emit an error.
- if (CurrentFnSym->isUndefined())
+ if (CurrentFnSym->isUndefined()) {
+ OutStreamer.ForceCodeRegion();
return OutStreamer.EmitLabel(CurrentFnSym);
+ }
report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
"' label emitted multiple times to assembly file");
@@ -620,6 +622,9 @@ void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
if (needsCFIMoves() == CFI_M_None)
return;
+ if (MMI->getCompactUnwindEncoding() != 0)
+ OutStreamer.EmitCompactUnwindEncoding(MMI->getCompactUnwindEncoding());
+
MachineModuleInfo &MMI = MF->getMMI();
std::vector<MachineMove> &Moves = MMI.getFrameMoves();
bool FoundOne = false;
@@ -878,7 +883,7 @@ bool AsmPrinter::doFinalization(Module &M) {
I != E; ++I) {
MCSymbol *Name = Mang->getSymbol(I);
- const GlobalValue *GV = cast<GlobalValue>(I->getAliasedGlobal());
+ const GlobalValue *GV = I->getAliasedGlobal();
MCSymbol *Target = Mang->getSymbol(GV);
if (I->hasExternalLinkage() || !MAI->getWeakRefDirective())
@@ -1009,7 +1014,7 @@ void AsmPrinter::EmitConstantPool() {
unsigned NewOffset = (Offset + AlignMask) & ~AlignMask;
OutStreamer.EmitFill(NewOffset - Offset, 0/*fillval*/, 0/*addrspace*/);
- const Type *Ty = CPE.getType();
+ Type *Ty = CPE.getType();
Offset = NewOffset + TM.getTargetData()->getTypeAllocSize(Ty);
OutStreamer.EmitLabel(GetCPISymbol(CPI));
@@ -1055,6 +1060,15 @@ void AsmPrinter::EmitJumpTableInfo() {
EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getTargetData())));
+ // If we know the form of the jump table, go ahead and tag it as such.
+ if (!JTInDiffSection) {
+ if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32) {
+ OutStreamer.EmitJumpTable32Region();
+ } else {
+ OutStreamer.EmitDataRegion();
+ }
+ }
+
for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
@@ -1226,22 +1240,53 @@ void AsmPrinter::EmitLLVMUsedList(const Constant *List) {
}
}
-/// EmitXXStructorList - Emit the ctor or dtor list. This just prints out the
-/// function pointers, ignoring the init priority.
+typedef std::pair<int, Constant*> Structor;
+
+static bool priority_order(const Structor& lhs, const Structor& rhs) {
+ return lhs.first < rhs.first;
+}
+
+/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
+/// priority.
void AsmPrinter::EmitXXStructorList(const Constant *List) {
// Should be an array of '{ int, void ()* }' structs. The first value is the
- // init priority, which we ignore.
+ // init priority.
if (!isa<ConstantArray>(List)) return;
- const ConstantArray *InitList = cast<ConstantArray>(List);
- for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
- if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
- if (CS->getNumOperands() != 2) return; // Not array of 2-element structs.
-
- if (CS->getOperand(1)->isNullValue())
- return; // Found a null terminator, exit printing.
- // Emit the function pointer.
- EmitGlobalConstant(CS->getOperand(1));
- }
+
+ // Sanity check the structors list.
+ const ConstantArray *InitList = dyn_cast<ConstantArray>(List);
+ if (!InitList) return; // Not an array!
+ StructType *ETy = dyn_cast<StructType>(InitList->getType()->getElementType());
+ if (!ETy || ETy->getNumElements() != 2) return; // Not an array of pairs!
+ if (!isa<IntegerType>(ETy->getTypeAtIndex(0U)) ||
+ !isa<PointerType>(ETy->getTypeAtIndex(1U))) return; // Not (int, ptr).
+
+ // Gather the structors in a form that's convenient for sorting by priority.
+ SmallVector<Structor, 8> Structors;
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+ ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i));
+ if (!CS) continue; // Malformed.
+ if (CS->getOperand(1)->isNullValue())
+ break; // Found a null terminator, skip the rest.
+ ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
+ if (!Priority) continue; // Malformed.
+ Structors.push_back(std::make_pair(Priority->getLimitedValue(65535),
+ CS->getOperand(1)));
+ }
+
+ // Emit the function pointers in reverse priority order.
+ switch (MAI->getStructorOutputOrder()) {
+ case Structors::None:
+ break;
+ case Structors::PriorityOrder:
+ std::sort(Structors.begin(), Structors.end(), priority_order);
+ break;
+ case Structors::ReversePriorityOrder:
+ std::sort(Structors.rbegin(), Structors.rend(), priority_order);
+ break;
+ }
+ for (unsigned i = 0, e = Structors.size(); i != e; ++i)
+ EmitGlobalConstant(Structors[i].second);
}
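The rewrite above turns EmitXXStructorList from "print pointers, ignore priority" into gather-validate-sort-emit: malformed entries are skipped, a null function pointer terminates the list, and priorities are clamped to 65535 before sorting. The core shape, reduced to a self-contained sketch with a hypothetical orderStructors helper:

#include <algorithm>
#include <utility>
#include <vector>

typedef std::pair<int, const void*> Structor; // (priority, function)

static bool byPriority(const Structor &L, const Structor &R) {
  return L.first < R.first;
}

// Sketch: gather valid entries, stop at the null terminator, clamp the
// priority as getLimitedValue(65535) does, then sort.
static std::vector<Structor>
orderStructors(const std::vector<Structor> &Raw) {
  std::vector<Structor> Out;
  for (size_t i = 0; i != Raw.size(); ++i) {
    if (!Raw[i].second)
      break;                                  // null terminator
    Out.push_back(std::make_pair(std::min(Raw[i].first, 65535),
                                 Raw[i].second));
  }
  std::sort(Out.begin(), Out.end(), byPriority);
  return Out;
}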
//===--------------------------------------------------------------------===//
@@ -1406,8 +1451,7 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
// Generate a symbolic expression for the byte address
const Constant *PtrVal = CE->getOperand(0);
SmallVector<Value*, 8> IdxVec(CE->op_begin()+1, CE->op_end());
- int64_t Offset = TD.getIndexedOffset(PtrVal->getType(), &IdxVec[0],
- IdxVec.size());
+ int64_t Offset = TD.getIndexedOffset(PtrVal->getType(), IdxVec);
const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
if (Offset == 0)
@@ -1447,7 +1491,7 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
// Support only foldable casts to/from pointers that can be eliminated by
// changing the pointer to the appropriately sized integer type.
Constant *Op = CE->getOperand(0);
- const Type *Ty = CE->getType();
+ Type *Ty = CE->getType();
const MCExpr *OpExpr = LowerConstant(Op, AP);
@@ -1496,12 +1540,67 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
static void EmitGlobalConstantImpl(const Constant *C, unsigned AddrSpace,
AsmPrinter &AP);
+/// isRepeatedByteSequence - Determine whether the given value is
+/// composed of a repeated sequence of identical bytes and return the
+/// byte value. If it is not a repeated sequence, return -1.
+static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) {
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getBitWidth() > 64) return -1;
+
+ uint64_t Size = TM.getTargetData()->getTypeAllocSize(V->getType());
+ uint64_t Value = CI->getZExtValue();
+
+ // Make sure the constant is at least 8 bits long and has a power
+ // of 2 bit width. This guarantees the constant bit width is
+ // always a multiple of 8 bits, avoiding issues with padding out
+ // to Size and other such corner cases.
+ if (CI->getBitWidth() < 8 || !isPowerOf2_64(CI->getBitWidth())) return -1;
+
+ uint8_t Byte = static_cast<uint8_t>(Value);
+
+ for (unsigned i = 1; i < Size; ++i) {
+ Value >>= 8;
+ if (static_cast<uint8_t>(Value) != Byte) return -1;
+ }
+ return Byte;
+ }
+ if (const ConstantArray *CA = dyn_cast<ConstantArray>(V)) {
+ // Make sure all array elements are sequences of the same repeated
+ // byte.
+ if (CA->getNumOperands() == 0) return -1;
+
+ int Byte = isRepeatedByteSequence(CA->getOperand(0), TM);
+ if (Byte == -1) return -1;
+
+ for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) {
+ int ThisByte = isRepeatedByteSequence(CA->getOperand(i), TM);
+ if (ThisByte == -1) return -1;
+ if (Byte != ThisByte) return -1;
+ }
+ return Byte;
+ }
+
+ return -1;
+}
+
static void EmitGlobalConstantArray(const ConstantArray *CA, unsigned AddrSpace,
AsmPrinter &AP) {
if (AddrSpace != 0 || !CA->isString()) {
- // Not a string. Print the values in successive locations
- for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
- EmitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
+ // Not a string. Print the values in successive locations.
+
+ // See if we can aggregate some values. Make sure it can be
+ // represented as a series of bytes of the constant value.
+ int Value = isRepeatedByteSequence(CA, AP.TM);
+
+ if (Value != -1) {
+ uint64_t Bytes = AP.TM.getTargetData()->getTypeAllocSize(CA->getType());
+ AP.OutStreamer.EmitFill(Bytes, Value, AddrSpace);
+ }
+ else {
+ for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
+ EmitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
+ }
return;
}
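The payoff of isRepeatedByteSequence is in EmitGlobalConstantArray above: an aggregate whose every byte is identical becomes a single EmitFill of getTypeAllocSize bytes instead of one directive per element. A standalone sketch of the integer case, assuming (as the original requires) a power-of-two bit width of at least 8:

#include <cstdint>

// Sketch: return the repeated byte of a ByteWidth-byte Value, or -1 if
// its bytes differ; mirrors the ConstantInt case above.
static int repeatedByte(uint64_t Value, unsigned ByteWidth) {
  uint8_t Byte = static_cast<uint8_t>(Value);
  for (unsigned i = 1; i < ByteWidth; ++i) {
    Value >>= 8;
    if (static_cast<uint8_t>(Value) != Byte)
      return -1;
  }
  return Byte;
}

// repeatedByte(0xCCCCCCCCu, 4) == 0xCC, so a 4-byte fill suffices;
// repeatedByte(0x00FFu, 2) == -1, so elements are emitted one by one.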
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index dd5b0e2..4d6c281 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -13,7 +13,7 @@
#define DEBUG_TYPE "asm-printer"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/MC/MachineLocation.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 5ac455e..8eda889 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -23,15 +23,15 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -49,7 +49,7 @@ namespace {
static void SrcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo);
assert(DiagInfo && "Diagnostic context not passed down?");
-
+
// If the inline asm had metadata associated with it, pull out a location
// cookie corresponding to which line the error occurred on.
unsigned LocCookie = 0;
@@ -57,13 +57,13 @@ static void SrcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
unsigned ErrorLine = Diag.getLineNo()-1;
if (ErrorLine >= LocInfo->getNumOperands())
ErrorLine = 0;
-
+
if (LocInfo->getNumOperands() != 0)
if (const ConstantInt *CI =
dyn_cast<ConstantInt>(LocInfo->getOperand(ErrorLine)))
LocCookie = CI->getZExtValue();
}
-
+
DiagInfo->DiagHandler(Diag, DiagInfo->DiagContext, LocCookie);
}
@@ -109,7 +109,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode) const {
// Tell SrcMgr about this buffer, it takes ownership of the buffer.
SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
- OwningPtr<MCAsmParser> Parser(createMCAsmParser(TM.getTarget(), SrcMgr,
+ OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr,
OutContext, OutStreamer,
*MAI));
@@ -121,7 +121,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode) const {
STI(TM.getTarget().createMCSubtargetInfo(TM.getTargetTriple(),
TM.getTargetCPU(),
TM.getTargetFeatureString()));
- OwningPtr<TargetAsmParser> TAP(TM.getTarget().createAsmParser(*STI, *Parser));
+ OwningPtr<MCTargetAsmParser>
+ TAP(TM.getTarget().createMCAsmParser(*STI, *Parser));
if (!TAP)
report_fatal_error("Inline asm not supported by this streamer because"
" we don't have an asm parser for this target\n");
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 4da7876..67d9273 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -12,3 +12,12 @@ add_llvm_library(LLVMAsmPrinter
Win64Exception.cpp
)
+add_llvm_library_dependencies(LLVMAsmPrinter
+ LLVMAnalysis
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMMCParser
+ LLVMSupport
+ LLVMTarget
+ )
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 21396ca..9c1ce76 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -69,7 +69,7 @@ void DIEAbbrev::Emit(AsmPrinter *AP) const {
// Emit attribute type.
// FIXME: Doing work even in non-asm-verbose runs.
AP->EmitULEB128(AttrData.getAttribute(),
- dwarf::AttributeString(AttrData.getAttribute()));
+ dwarf::AttributeString(AttrData.getAttribute()));
// Emit form type.
// FIXME: Doing work even in non-asm-verbose runs.
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 91b7d08..8ed4f4c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -17,7 +17,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/MC/MachineLocation.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -77,7 +77,8 @@ void DwarfCFIException::EndModule() {
// This is a temporary hack to keep sections in the same order they
// were before. This lets us produce bit identical outputs while
// transitioning to CFI.
- Asm->OutStreamer.SwitchSection(TLOF.getEHFrameSection());
+ Asm->OutStreamer.SwitchSection(
+ const_cast<TargetLoweringObjectFile&>(TLOF).getEHFrameSection());
}
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 1fe035e..88b7524 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -16,7 +16,10 @@
#include "DwarfCompileUnit.h"
#include "DwarfDebug.h"
#include "llvm/Constants.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
#include "llvm/Analysis/DIBuilder.h"
+#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -132,8 +135,8 @@ void CompileUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
unsigned Line = G.getLineNumber();
if (Line == 0)
return;
- unsigned FileID = DD->GetOrCreateSourceID(G.getContext().getFilename(),
- G.getContext().getDirectory());
+ unsigned FileID = DD->GetOrCreateSourceID(G.getFilename(),
+ G.getDirectory());
assert(FileID && "Invalid file id");
addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
@@ -439,27 +442,36 @@ void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
addBlock(Die, Attribute, 0, Block);
}
+/// isTypeSigned - Return true if the type is signed.
+static bool isTypeSigned(DIType Ty, int *SizeInBits) {
+ if (Ty.isDerivedType())
+ return isTypeSigned(DIDerivedType(Ty).getTypeDerivedFrom(), SizeInBits);
+ if (Ty.isBasicType())
+ if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed
+ || DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
+ *SizeInBits = Ty.getSizeInBits();
+ return true;
+ }
+ return false;
+}
+
/// addConstantValue - Add constant value entry in variable DIE.
bool CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
DIType Ty) {
assert (MO.isImm() && "Invalid machine operand!");
DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
- unsigned form = dwarf::DW_FORM_udata;
- switch (Ty.getSizeInBits()) {
- case 8: form = dwarf::DW_FORM_data1; break;
- case 16: form = dwarf::DW_FORM_data2; break;
- case 32: form = dwarf::DW_FORM_data4; break;
- case 64: form = dwarf::DW_FORM_data8; break;
+ int SizeInBits = -1;
+ bool SignedConstant = isTypeSigned(Ty, &SizeInBits);
+ unsigned Form = SignedConstant ? dwarf::DW_FORM_sdata : dwarf::DW_FORM_udata;
+ switch (SizeInBits) {
+ case 8: Form = dwarf::DW_FORM_data1; break;
+ case 16: Form = dwarf::DW_FORM_data2; break;
+ case 32: Form = dwarf::DW_FORM_data4; break;
+ case 64: Form = dwarf::DW_FORM_data8; break;
default: break;
}
-
- DIBasicType BTy(Ty);
- if (BTy.Verify() &&
- (BTy.getEncoding() == dwarf::DW_ATE_signed
- || BTy.getEncoding() == dwarf::DW_ATE_signed_char))
- addSInt(Block, 0, form, MO.getImm());
- else
- addUInt(Block, 0, form, MO.getImm());
+ SignedConstant ? addSInt(Block, 0, Form, MO.getImm())
+ : addUInt(Block, 0, Form, MO.getImm());
addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
return true;
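The switch above picks the smallest fixed-size DWARF data form matching the type's bit width and otherwise falls back to a variable-length LEB128 form, signed (sdata) or unsigned (udata) according to the type's encoding. A minimal sketch of the selection; the form code values are taken from the DWARF specification:

#include <cstdint>

// DWARF form codes (values from the DWARF specification).
enum Form {
  DW_FORM_data2 = 0x05, DW_FORM_data4 = 0x06, DW_FORM_data8 = 0x07,
  DW_FORM_data1 = 0x0b, DW_FORM_sdata = 0x0d, DW_FORM_udata = 0x0f
};

// Sketch: smallest fixed-size form for the width, else LEB128.
static Form chooseConstantForm(int SizeInBits, bool Signed) {
  switch (SizeInBits) {
  case 8:  return DW_FORM_data1;
  case 16: return DW_FORM_data2;
  case 32: return DW_FORM_data4;
  case 64: return DW_FORM_data8;
  default: return Signed ? DW_FORM_sdata : DW_FORM_udata;
  }
}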
@@ -555,7 +567,7 @@ void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) {
DIE *ContextDIE = getOrCreateNameSpace(DINameSpace(Context));
ContextDIE->addChild(Die);
} else if (Context.isSubprogram()) {
- DIE *ContextDIE = DD->createSubprogramDIE(DISubprogram(Context));
+ DIE *ContextDIE = getOrCreateSubprogramDIE(DISubprogram(Context));
ContextDIE->addChild(Die);
} else if (DIE *ContextDIE = getDIE(Context))
ContextDIE->addChild(Die);
@@ -565,7 +577,10 @@ void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) {
/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
/// given DIType.
-DIE *CompileUnit::getOrCreateTypeDIE(DIType Ty) {
+DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
+ DIType Ty(TyNode);
+ if (!Ty.Verify())
+ return NULL;
DIE *TyDIE = getDIE(Ty);
if (TyDIE)
return TyDIE;
@@ -617,7 +632,8 @@ void CompileUnit::addType(DIE *Entity, DIType Ty) {
void CompileUnit::addGlobalType(DIType Ty) {
DIDescriptor Context = Ty.getContext();
if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl()
- && (Context.isCompileUnit() || Context.isFile() || Context.isNameSpace()))
+ && (!Context || Context.isCompileUnit() || Context.isFile()
+ || Context.isNameSpace()))
if (DIEEntry *Entry = getDIEEntry(Ty))
GlobalTypes[Ty.getName()] = Entry->getEntry();
}
@@ -642,13 +658,20 @@ void CompileUnit::addPubTypes(DISubprogram SP) {
void CompileUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
// Get core information.
StringRef Name = BTy.getName();
- Buffer.setTag(dwarf::DW_TAG_base_type);
- addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
- BTy.getEncoding());
-
// Add name if not anonymous or intermediate type.
if (!Name.empty())
addString(&Buffer, dwarf::DW_AT_name, dwarf::DW_FORM_string, Name);
+
+ if (BTy.getTag() == dwarf::DW_TAG_unspecified_type) {
+ Buffer.setTag(dwarf::DW_TAG_unspecified_type);
+    // Unspecified types have only a name, nothing else.
+ return;
+ }
+
+ Buffer.setTag(dwarf::DW_TAG_base_type);
+ addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ BTy.getEncoding());
+
uint64_t Size = BTy.getSizeInBits() >> 3;
addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
}
@@ -752,7 +775,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
DIE *ElemDie = NULL;
if (Element.isSubprogram()) {
DISubprogram SP(Element);
- ElemDie = DD->createSubprogramDIE(DISubprogram(Element));
+ ElemDie = getOrCreateSubprogramDIE(DISubprogram(Element));
if (SP.isProtected())
addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
dwarf::DW_ACCESS_protected);
@@ -880,6 +903,218 @@ DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
return NDie;
}
+/// getRealLinkageName - If the linkage name begins with the special LLVM
+/// prefix that tells the asm printer not to emit the usual symbol prefix,
+/// return the linkage name with that prefix stripped.
+static StringRef getRealLinkageName(StringRef LinkageName) {
+ char One = '\1';
+ if (LinkageName.startswith(StringRef(&One, 1)))
+ return LinkageName.substr(1);
+ return LinkageName;
+}
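The '\1' byte is LLVM's marker for a name that must be emitted verbatim, without the target's usual symbol prefix; stripping it yields the name as it will appear in the object file. The same logic on std::string, as a sketch:

#include <string>

// Sketch of the same stripping on std::string.
static std::string realLinkageName(const std::string &Name) {
  if (!Name.empty() && Name[0] == '\1')
    return Name.substr(1);
  return Name;
}
// realLinkageName("\1_foo") == "_foo"; names without the marker pass
// through unchanged.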
+
+/// getOrCreateSubprogramDIE - Create new DIE using SP.
+DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
+ DIE *SPDie = getDIE(SP);
+ if (SPDie)
+ return SPDie;
+
+ SPDie = new DIE(dwarf::DW_TAG_subprogram);
+
+ // DW_TAG_inlined_subroutine may refer to this DIE.
+ insertDIE(SP, SPDie);
+
+ // Add to context owner.
+ addToContextOwner(SPDie, SP.getContext());
+
+ // Add function template parameters.
+ addTemplateParams(*SPDie, SP.getTemplateParams());
+
+ StringRef LinkageName = SP.getLinkageName();
+ if (!LinkageName.empty())
+ addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
+ dwarf::DW_FORM_string,
+ getRealLinkageName(LinkageName));
+
+  // If this DIE is going to refer to declaration info using AT_specification
+  // then there is no need to add other attributes.
+ if (SP.getFunctionDeclaration().isSubprogram())
+ return SPDie;
+
+ // Constructors and operators for anonymous aggregates do not have names.
+ if (!SP.getName().empty())
+ addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+ SP.getName());
+
+ addSourceLine(SPDie, SP);
+
+ if (SP.isPrototyped())
+ addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+
+ // Add Return Type.
+ DICompositeType SPTy = SP.getType();
+ DIArray Args = SPTy.getTypeArray();
+ unsigned SPTag = SPTy.getTag();
+
+ if (Args.getNumElements() == 0 || SPTag != dwarf::DW_TAG_subroutine_type)
+ addType(SPDie, SPTy);
+ else
+ addType(SPDie, DIType(Args.getElement(0)));
+
+ unsigned VK = SP.getVirtuality();
+ if (VK) {
+ addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK);
+ DIEBlock *Block = getDIEBlock();
+ addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
+ addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
+ ContainingTypeMap.insert(std::make_pair(SPDie,
+ SP.getContainingType()));
+ }
+
+ if (!SP.isDefinition()) {
+ addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+
+ // Add arguments. Do not add arguments for subprogram definition. They will
+ // be handled while processing variables.
+ DICompositeType SPTy = SP.getType();
+ DIArray Args = SPTy.getTypeArray();
+ unsigned SPTag = SPTy.getTag();
+
+ if (SPTag == dwarf::DW_TAG_subroutine_type)
+ for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+ DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+ DIType ATy = DIType(DIType(Args.getElement(i)));
+ addType(Arg, ATy);
+ if (ATy.isArtificial())
+ addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+ SPDie->addChild(Arg);
+ }
+ }
+
+ if (SP.isArtificial())
+ addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+
+ if (!SP.isLocalToUnit())
+ addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+
+ if (SP.isOptimized())
+ addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+
+ if (unsigned isa = Asm->getISAEncoding()) {
+ addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
+ }
+
+ return SPDie;
+}
+
+// Return const expression if value is a GEP to access merged global
+// constant. e.g.
+// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0)
+static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
+ const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V);
+ if (!CE || CE->getNumOperands() != 3 ||
+ CE->getOpcode() != Instruction::GetElementPtr)
+ return NULL;
+
+ // First operand points to a global struct.
+ Value *Ptr = CE->getOperand(0);
+ if (!isa<GlobalValue>(Ptr) ||
+ !isa<StructType>(cast<PointerType>(Ptr->getType())->getElementType()))
+ return NULL;
+
+ // Second operand is zero.
+ const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1));
+ if (!CI || !CI->isZero())
+ return NULL;
+
+ // Third operand is offset.
+ if (!isa<ConstantInt>(CE->getOperand(2)))
+ return NULL;
+
+ return CE;
+}
+
+/// createGlobalVariableDIE - create global variable DIE.
+void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
+ // Check for pre-existence.
+ if (getDIE(N))
+ return;
+
+ DIGlobalVariable GV(N);
+ if (!GV.Verify())
+ return;
+
+ DIE *VariableDIE = new DIE(GV.getTag());
+ // Add to map.
+ insertDIE(N, VariableDIE);
+
+ // Add name.
+ addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string,
+ GV.getDisplayName());
+ StringRef LinkageName = GV.getLinkageName();
+ bool isGlobalVariable = GV.getGlobal() != NULL;
+ if (!LinkageName.empty() && isGlobalVariable)
+ addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
+ dwarf::DW_FORM_string,
+ getRealLinkageName(LinkageName));
+ // Add type.
+ DIType GTy = GV.getType();
+ addType(VariableDIE, GTy);
+
+ // Add scoping info.
+ if (!GV.isLocalToUnit()) {
+ addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ // Expose as global.
+ addGlobal(GV.getName(), VariableDIE);
+ }
+ // Add line number info.
+ addSourceLine(VariableDIE, GV);
+ // Add to context owner.
+ DIDescriptor GVContext = GV.getContext();
+ addToContextOwner(VariableDIE, GVContext);
+ // Add location.
+ if (isGlobalVariable) {
+ DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+ addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+ addLabel(Block, 0, dwarf::DW_FORM_udata,
+ Asm->Mang->getSymbol(GV.getGlobal()));
+ // Do not create specification DIE if context is either compile unit
+ // or a subprogram.
+ if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() &&
+ !GVContext.isFile() && !isSubprogramContext(GVContext)) {
+ // Create specification DIE.
+ DIE *VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
+ addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
+ dwarf::DW_FORM_ref4, VariableDIE);
+ addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+ addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag,
+ 1);
+ addDie(VariableSpecDIE);
+ } else {
+ addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+ }
+ } else if (const ConstantInt *CI =
+ dyn_cast_or_null<ConstantInt>(GV.getConstant()))
+ addConstantValue(VariableDIE, CI, GTy.isUnsignedDIType());
+ else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
+ // GV is a merged global.
+ DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+ Value *Ptr = CE->getOperand(0);
+ addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+ addLabel(Block, 0, dwarf::DW_FORM_udata,
+ Asm->Mang->getSymbol(cast<GlobalValue>(Ptr)));
+ addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ SmallVector<Value*, 3> Idx(CE->op_begin()+1, CE->op_end());
+ addUInt(Block, 0, dwarf::DW_FORM_udata,
+ Asm->getTargetData().getIndexedOffset(Ptr->getType(), Idx));
+ addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+ }
+
+ return;
+}
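The merged-global branch above describes the member with a three-step DWARF location expression: push the merged symbol's address, push the member's byte offset (computed via getIndexedOffset), add. Schematically, as a sketch with hypothetical emit plumbing; the opcode values come from the DWARF specification:

#include <cstdint>
#include <vector>

// DWARF expression opcodes (values from the DWARF specification).
enum { DW_OP_addr = 0x03, DW_OP_constu = 0x10, DW_OP_plus = 0x22 };

// Sketch: location of one member of @_MergedGlobals as
//   DW_OP_addr <symbol>  DW_OP_constu <offset>  DW_OP_plus
static void emitMergedGlobalLocation(std::vector<uint64_t> &Expr,
                                     uint64_t MergedSymbol,
                                     uint64_t ByteOffset) {
  Expr.push_back(DW_OP_addr);
  Expr.push_back(MergedSymbol); // address of the merged container
  Expr.push_back(DW_OP_constu);
  Expr.push_back(ByteOffset);   // from getIndexedOffset on the GEP
  Expr.push_back(DW_OP_plus);
}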
+
/// constructSubrangeDIE - Construct subrange DIE from DISubrange.
void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){
DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
@@ -944,6 +1179,128 @@ DIE *CompileUnit::constructEnumTypeDIE(DIEnumerator ETy) {
return Enumerator;
}
+/// constructContainingTypeDIEs - Construct DIEs for types that contain
+/// vtables.
+void CompileUnit::constructContainingTypeDIEs() {
+ for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
+ CE = ContainingTypeMap.end(); CI != CE; ++CI) {
+ DIE *SPDie = CI->first;
+ const MDNode *N = CI->second;
+ if (!N) continue;
+ DIE *NDie = getDIE(N);
+ if (!NDie) continue;
+ addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+ }
+}
+
+/// constructVariableDIE - Construct a DIE for the given DbgVariable.
+DIE *CompileUnit::constructVariableDIE(DbgVariable *DV, bool isScopeAbstract) {
+ StringRef Name = DV->getName();
+ if (Name.empty())
+ return NULL;
+
+ // Translate tag to proper Dwarf tag.
+ unsigned Tag = DV->getTag();
+
+ // Define variable debug information entry.
+ DIE *VariableDie = new DIE(Tag);
+ DbgVariable *AbsVar = DV->getAbstractVariable();
+ DIE *AbsDIE = AbsVar ? AbsVar->getDIE() : NULL;
+ if (AbsDIE)
+ addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
+ dwarf::DW_FORM_ref4, AbsDIE);
+ else {
+ addString(VariableDie, dwarf::DW_AT_name,
+ dwarf::DW_FORM_string, Name);
+ addSourceLine(VariableDie, DV->getVariable());
+ addType(VariableDie, DV->getType());
+ }
+
+ if (DV->isArtificial())
+ addUInt(VariableDie, dwarf::DW_AT_artificial,
+ dwarf::DW_FORM_flag, 1);
+
+ if (isScopeAbstract) {
+ DV->setDIE(VariableDie);
+ return VariableDie;
+ }
+
+ // Add variable address.
+
+ unsigned Offset = DV->getDotDebugLocOffset();
+ if (Offset != ~0U) {
+ addLabel(VariableDie, dwarf::DW_AT_location,
+ dwarf::DW_FORM_data4,
+ Asm->GetTempSymbol("debug_loc", Offset));
+ DV->setDIE(VariableDie);
+ return VariableDie;
+ }
+
+ // Check if variable is described by a DBG_VALUE instruction.
+ if (const MachineInstr *DVInsn = DV->getMInsn()) {
+ bool updated = false;
+ if (DVInsn->getNumOperands() == 3) {
+ if (DVInsn->getOperand(0).isReg()) {
+ const MachineOperand RegOp = DVInsn->getOperand(0);
+ const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
+ if (DVInsn->getOperand(1).isImm() &&
+ TRI->getFrameRegister(*Asm->MF) == RegOp.getReg()) {
+ unsigned FrameReg = 0;
+ const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
+ int Offset =
+ TFI->getFrameIndexReference(*Asm->MF,
+ DVInsn->getOperand(1).getImm(),
+ FrameReg);
+ MachineLocation Location(FrameReg, Offset);
+ addVariableAddress(DV, VariableDie, Location);
+
+ } else if (RegOp.getReg())
+ addVariableAddress(DV, VariableDie,
+ MachineLocation(RegOp.getReg()));
+ updated = true;
+ }
+ else if (DVInsn->getOperand(0).isImm())
+ updated =
+ addConstantValue(VariableDie, DVInsn->getOperand(0),
+ DV->getType());
+ else if (DVInsn->getOperand(0).isFPImm())
+ updated =
+ addConstantFPValue(VariableDie, DVInsn->getOperand(0));
+ else if (DVInsn->getOperand(0).isCImm())
+ updated =
+ addConstantValue(VariableDie,
+ DVInsn->getOperand(0).getCImm(),
+ DV->getType().isUnsignedDIType());
+ } else {
+ addVariableAddress(DV, VariableDie,
+ Asm->getDebugValueLocation(DVInsn));
+ updated = true;
+ }
+ if (!updated) {
+      // If VariableDie is not updated then the DBG_VALUE instruction does
+      // not have valid variable info.
+ delete VariableDie;
+ return NULL;
+ }
+ DV->setDIE(VariableDie);
+ return VariableDie;
+ } else {
+ // .. else use frame index.
+ int FI = DV->getFrameIndex();
+ if (FI != ~0) {
+ unsigned FrameReg = 0;
+ const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
+ int Offset =
+ TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+ MachineLocation Location(FrameReg, Offset);
+ addVariableAddress(DV, VariableDie, Location);
+ }
+ }
+
+ DV->setDIE(VariableDie);
+ return VariableDie;
+}
+
/// createMemberDIE - Create new member DIE.
DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
DIE *MemberDie = new DIE(DT.getTag());
@@ -1013,7 +1370,7 @@ DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
dwarf::DW_ACCESS_private);
// Otherwise C++ member and base classes are considered public.
- else if (DT.getCompileUnit().getLanguage() == dwarf::DW_LANG_C_plus_plus)
+ else
addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_flag,
dwarf::DW_ACCESS_public);
if (DT.isVirtual())
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 213c7fc..7859265 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -67,6 +67,11 @@ class CompileUnit {
/// DIEBlocks - A list of all the DIEBlocks in use.
std::vector<DIEBlock *> DIEBlocks;
+ /// ContainingTypeMap - This map is used to keep track of subprogram DIEs that
+ /// need DW_AT_containing_type attribute. This attribute points to a DIE that
+ /// corresponds to the MDNode mapped with the subprogram DIE.
+ DenseMap<DIE *, const MDNode *> ContainingTypeMap;
+
public:
CompileUnit(unsigned I, DIE *D, AsmPrinter *A, DwarfDebug *DW);
~CompileUnit();
@@ -226,9 +231,12 @@ public:
/// getOrCreateNameSpace - Create a DIE for DINameSpace.
DIE *getOrCreateNameSpace(DINameSpace NS);
+ /// getOrCreateSubprogramDIE - Create new DIE using SP.
+ DIE *getOrCreateSubprogramDIE(DISubprogram SP);
+
/// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
/// given DIType.
- DIE *getOrCreateTypeDIE(DIType Ty);
+ DIE *getOrCreateTypeDIE(const MDNode *N);
/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
/// for the given DITemplateTypeParameter.
@@ -242,6 +250,9 @@ public:
/// information entry.
DIEEntry *createDIEEntry(DIE *Entry);
+ /// createGlobalVariableDIE - create global variable DIE.
+ void createGlobalVariableDIE(const MDNode *N);
+
void addPubTypes(DISubprogram SP);
/// constructTypeDIE - Construct basic type die from DIBasicType.
@@ -266,6 +277,13 @@ public:
/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
DIE *constructEnumTypeDIE(DIEnumerator ETy);
+ /// constructContainingTypeDIEs - Construct DIEs for types that contain
+ /// vtables.
+ void constructContainingTypeDIEs();
+
+ /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+ DIE *constructVariableDIE(DbgVariable *DV, bool isScopeAbstract);
+
/// createMemberDIE - Create new member DIE.
DIE *createMemberDIE(DIDerivedType DT);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 125e1e8..1b7e370 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -24,7 +24,6 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -45,9 +44,6 @@
#include "llvm/Support/Path.h"
using namespace llvm;
-static cl::opt<bool> PrintDbgScope("print-dbgscope", cl::Hidden,
- cl::desc("Print DbgScope information for each machine instruction"));
-
static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print",
cl::Hidden,
cl::desc("Disable debug info printing"));
@@ -69,7 +65,7 @@ static const unsigned InitAbbreviationsSetSize = 9; // log2(512)
namespace llvm {
-DIType DbgVariable::getType() const {
+DIType DbgVariable::getType() const {
DIType Ty = Var.getType();
// FIXME: isBlockByrefVariable should be reformulated in terms of complex
// addresses instead.
@@ -120,141 +116,12 @@ DIType DbgVariable::getType() const {
return Ty;
}
-//===----------------------------------------------------------------------===//
-/// DbgRange - This is used to track range of instructions with identical
-/// debug info scope.
-///
-typedef std::pair<const MachineInstr *, const MachineInstr *> DbgRange;
-
-//===----------------------------------------------------------------------===//
-/// DbgScope - This class is used to track scope information.
-///
-class DbgScope {
- DbgScope *Parent; // Parent to this scope.
- DIDescriptor Desc; // Debug info descriptor for scope.
- // Location at which this scope is inlined.
- AssertingVH<const MDNode> InlinedAtLocation;
- bool AbstractScope; // Abstract Scope
- const MachineInstr *LastInsn; // Last instruction of this scope.
- const MachineInstr *FirstInsn; // First instruction of this scope.
- unsigned DFSIn, DFSOut;
- // Scopes defined in scope. Contents not owned.
- SmallVector<DbgScope *, 4> Scopes;
- // Variables declared in scope. Contents owned.
- SmallVector<DbgVariable *, 8> Variables;
- SmallVector<DbgRange, 4> Ranges;
- // Private state for dump()
- mutable unsigned IndentLevel;
-public:
- DbgScope(DbgScope *P, DIDescriptor D, const MDNode *I = 0)
- : Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(false),
- LastInsn(0), FirstInsn(0),
- DFSIn(0), DFSOut(0), IndentLevel(0) {}
- virtual ~DbgScope();
-
- // Accessors.
- DbgScope *getParent() const { return Parent; }
- void setParent(DbgScope *P) { Parent = P; }
- DIDescriptor getDesc() const { return Desc; }
- const MDNode *getInlinedAt() const { return InlinedAtLocation; }
- const MDNode *getScopeNode() const { return Desc; }
- const SmallVector<DbgScope *, 4> &getScopes() { return Scopes; }
- const SmallVector<DbgVariable *, 8> &getDbgVariables() { return Variables; }
- const SmallVector<DbgRange, 4> &getRanges() { return Ranges; }
-
- /// openInsnRange - This scope covers instruction range starting from MI.
- void openInsnRange(const MachineInstr *MI) {
- if (!FirstInsn)
- FirstInsn = MI;
-
- if (Parent)
- Parent->openInsnRange(MI);
- }
-
- /// extendInsnRange - Extend the current instruction range covered by
- /// this scope.
- void extendInsnRange(const MachineInstr *MI) {
- assert (FirstInsn && "MI Range is not open!");
- LastInsn = MI;
- if (Parent)
- Parent->extendInsnRange(MI);
- }
-
- /// closeInsnRange - Create a range based on FirstInsn and LastInsn collected
- /// until now. This is used when a new scope is encountered while walking
- /// machine instructions.
- void closeInsnRange(DbgScope *NewScope = NULL) {
- assert (LastInsn && "Last insn missing!");
- Ranges.push_back(DbgRange(FirstInsn, LastInsn));
- FirstInsn = NULL;
- LastInsn = NULL;
- // If Parent dominates NewScope then do not close Parent's instruction
- // range.
- if (Parent && (!NewScope || !Parent->dominates(NewScope)))
- Parent->closeInsnRange(NewScope);
- }
-
- void setAbstractScope() { AbstractScope = true; }
- bool isAbstractScope() const { return AbstractScope; }
-
-  // Depth First Search support to walk and manipulate DbgScope hierarchy.
- unsigned getDFSOut() const { return DFSOut; }
- void setDFSOut(unsigned O) { DFSOut = O; }
- unsigned getDFSIn() const { return DFSIn; }
- void setDFSIn(unsigned I) { DFSIn = I; }
- bool dominates(const DbgScope *S) {
- if (S == this)
- return true;
- if (DFSIn < S->getDFSIn() && DFSOut > S->getDFSOut())
- return true;
- return false;
- }
-
- /// addScope - Add a scope to the scope.
- ///
- void addScope(DbgScope *S) { Scopes.push_back(S); }
-
- /// addVariable - Add a variable to the scope.
- ///
- void addVariable(DbgVariable *V) { Variables.push_back(V); }
-
-#ifndef NDEBUG
- void dump() const;
-#endif
-};
-
} // end llvm namespace
-#ifndef NDEBUG
-void DbgScope::dump() const {
- raw_ostream &err = dbgs();
- err.indent(IndentLevel);
- err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n";
- const MDNode *N = Desc;
- N->dump();
- if (AbstractScope)
- err << "Abstract Scope\n";
-
- IndentLevel += 2;
- if (!Scopes.empty())
- err << "Children ...\n";
- for (unsigned i = 0, e = Scopes.size(); i != e; ++i)
- if (Scopes[i] != this)
- Scopes[i]->dump();
-
- IndentLevel -= 2;
-}
-#endif
-
-DbgScope::~DbgScope() {
- for (unsigned j = 0, M = Variables.size(); j < M; ++j)
- delete Variables[j];
-}
-
DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
: Asm(A), MMI(Asm->MMI), FirstCU(0),
AbbreviationsSet(InitAbbreviationsSetSize),
- CurrentFnDbgScope(0), PrevLabel(NULL) {
+ PrevLabel(NULL) {
NextStringPoolNumber = 0;
DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
@@ -311,147 +178,12 @@ static StringRef getRealLinkageName(StringRef LinkageName) {
return LinkageName;
}
-/// createSubprogramDIE - Create new DIE using SP.
-DIE *DwarfDebug::createSubprogramDIE(DISubprogram SP) {
- CompileUnit *SPCU = getCompileUnit(SP);
- DIE *SPDie = SPCU->getDIE(SP);
- if (SPDie)
- return SPDie;
-
- SPDie = new DIE(dwarf::DW_TAG_subprogram);
-
- // DW_TAG_inlined_subroutine may refer to this DIE.
- SPCU->insertDIE(SP, SPDie);
-
- // Add to context owner.
- SPCU->addToContextOwner(SPDie, SP.getContext());
-
- // Add function template parameters.
- SPCU->addTemplateParams(*SPDie, SP.getTemplateParams());
-
- StringRef LinkageName = SP.getLinkageName();
- if (!LinkageName.empty())
- SPCU->addString(SPDie, dwarf::DW_AT_MIPS_linkage_name, dwarf::DW_FORM_string,
- getRealLinkageName(LinkageName));
-
- // If this DIE is going to refer declaration info using AT_specification
- // then there is no need to add other attributes.
- if (SP.getFunctionDeclaration().isSubprogram())
- return SPDie;
-
- // Constructors and operators for anonymous aggregates do not have names.
- if (!SP.getName().empty())
- SPCU->addString(SPDie, dwarf::DW_AT_name, dwarf::DW_FORM_string,
- SP.getName());
-
- SPCU->addSourceLine(SPDie, SP);
-
- if (SP.isPrototyped())
- SPCU->addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
-
- // Add Return Type.
- DICompositeType SPTy = SP.getType();
- DIArray Args = SPTy.getTypeArray();
- unsigned SPTag = SPTy.getTag();
-
- if (Args.getNumElements() == 0 || SPTag != dwarf::DW_TAG_subroutine_type)
- SPCU->addType(SPDie, SPTy);
- else
- SPCU->addType(SPDie, DIType(Args.getElement(0)));
-
- unsigned VK = SP.getVirtuality();
- if (VK) {
- SPCU->addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_flag, VK);
- DIEBlock *Block = SPCU->getDIEBlock();
- SPCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- SPCU->addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
- SPCU->addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
- ContainingTypeMap.insert(std::make_pair(SPDie,
- SP.getContainingType()));
- }
-
- if (!SP.isDefinition()) {
- SPCU->addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
-
- // Add arguments. Do not add arguments for subprogram definition. They will
- // be handled while processing variables.
- DICompositeType SPTy = SP.getType();
- DIArray Args = SPTy.getTypeArray();
- unsigned SPTag = SPTy.getTag();
-
- if (SPTag == dwarf::DW_TAG_subroutine_type)
- for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
- DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
- DIType ATy = DIType(DIType(Args.getElement(i)));
- SPCU->addType(Arg, ATy);
- if (ATy.isArtificial())
- SPCU->addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
- SPDie->addChild(Arg);
- }
- }
-
- if (SP.isArtificial())
- SPCU->addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
-
- if (!SP.isLocalToUnit())
- SPCU->addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
-
- if (SP.isOptimized())
- SPCU->addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
-
- if (unsigned isa = Asm->getISAEncoding()) {
- SPCU->addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
- }
-
- return SPDie;
-}
-
-DbgScope *DwarfDebug::getOrCreateAbstractScope(const MDNode *N) {
- assert(N && "Invalid Scope encoding!");
-
- DbgScope *AScope = AbstractScopes.lookup(N);
- if (AScope)
- return AScope;
-
- DbgScope *Parent = NULL;
-
- DIDescriptor Scope(N);
- if (Scope.isLexicalBlock()) {
- DILexicalBlock DB(N);
- DIDescriptor ParentDesc = DB.getContext();
- Parent = getOrCreateAbstractScope(ParentDesc);
- }
-
- AScope = new DbgScope(Parent, DIDescriptor(N), NULL);
-
- if (Parent)
- Parent->addScope(AScope);
- AScope->setAbstractScope();
- AbstractScopes[N] = AScope;
- if (DIDescriptor(N).isSubprogram())
- AbstractScopesList.push_back(AScope);
- return AScope;
-}
-
-/// isSubprogramContext - Return true if Context is either a subprogram
-/// or another context nested inside a subprogram.
-static bool isSubprogramContext(const MDNode *Context) {
- if (!Context)
- return false;
- DIDescriptor D(Context);
- if (D.isSubprogram())
- return true;
- if (D.isType())
- return isSubprogramContext(DIType(Context).getContext());
- return false;
-}
-
/// updateSubprogramScopeDIE - Find DIE for the given subprogram and
/// attach appropriate DW_AT_low_pc and DW_AT_high_pc attributes.
/// If there are global variables in this scope then create and insert
/// DIEs for these variables.
-DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) {
- CompileUnit *SPCU = getCompileUnit(SPNode);
+DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
+ const MDNode *SPNode) {
DIE *SPDie = SPCU->getDIE(SPNode);
assert(SPDie && "Unable to find subprogram DIE!");
@@ -461,7 +193,7 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) {
if (SPDecl.isSubprogram())
// Refer function declaration directly.
SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
- createSubprogramDIE(SPDecl));
+ SPCU->getOrCreateSubprogramDIE(SPDecl));
else {
// There is not any need to generate specification DIE for a function
// defined at compile unit level. If a function is defined inside another
@@ -514,25 +246,26 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(const MDNode *SPNode) {
/// constructLexicalScope - Construct new DW_TAG_lexical_block
/// for this scope and attach DW_AT_low_pc/DW_AT_high_pc labels.
-DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) {
+DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
+ LexicalScope *Scope) {
DIE *ScopeDIE = new DIE(dwarf::DW_TAG_lexical_block);
if (Scope->isAbstractScope())
return ScopeDIE;
- const SmallVector<DbgRange, 4> &Ranges = Scope->getRanges();
+ const SmallVector<InsnRange, 4> &Ranges = Scope->getRanges();
if (Ranges.empty())
return 0;
- CompileUnit *TheCU = getCompileUnit(Scope->getScopeNode());
- SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin();
+ SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin();
if (Ranges.size() > 1) {
// .debug_range section has not been laid out yet. Emit offset in
// .debug_range as a uint, size 4, for now. emitDIE will handle
// DW_AT_ranges appropriately.
TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
- DebugRangeSymbols.size() * Asm->getTargetData().getPointerSize());
- for (SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin(),
+ DebugRangeSymbols.size()
+ * Asm->getTargetData().getPointerSize());
+ for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
RE = Ranges.end(); RI != RE; ++RI) {
DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
DebugRangeSymbols.push_back(getLabelAfterInsn(RI->second));
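The DW_AT_ranges value written above is a byte offset into the not-yet-emitted .debug_range section: each accumulated label occupies one pointer, so the offset of a fresh range list is simply the current entry count times the target pointer size, and each list is closed with a null pair. As a sketch, where Label is an opaque stand-in for MCSymbol:

#include <cstdint>
#include <utility>
#include <vector>

struct Label;                             // opaque stand-in for MCSymbol
static std::vector<const Label*> DebugRangeSymbols;

// Sketch: append one range list; its DW_AT_ranges value is the byte
// offset at which the list will start in .debug_range.
static uint64_t appendRangeList(
    const std::vector<std::pair<const Label*, const Label*> > &Ranges,
    unsigned PointerSize) {
  uint64_t Offset = DebugRangeSymbols.size() * PointerSize;
  for (size_t i = 0; i != Ranges.size(); ++i) {
    DebugRangeSymbols.push_back(Ranges[i].first);  // label before insn
    DebugRangeSymbols.push_back(Ranges[i].second); // label after insn
  }
  DebugRangeSymbols.push_back(0);                  // end-of-list pair
  DebugRangeSymbols.push_back(0);
  return Offset;
}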
@@ -559,22 +292,29 @@ DIE *DwarfDebug::constructLexicalScopeDIE(DbgScope *Scope) {
/// constructInlinedScopeDIE - This scope represents inlined body of
/// a function. Construct DIE to represent this concrete inlined copy
/// of the function.
-DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
+DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
+ LexicalScope *Scope) {
- const SmallVector<DbgRange, 4> &Ranges = Scope->getRanges();
+ const SmallVector<InsnRange, 4> &Ranges = Scope->getRanges();
assert (Ranges.empty() == false
- && "DbgScope does not have instruction markers!");
+ && "LexicalScope does not have instruction markers!");
- // FIXME : .debug_inlined section specification does not clearly state how
- // to emit inlined scope that is split into multiple instruction ranges.
- // For now, use first instruction range and emit low_pc/high_pc pair and
- // corresponding .debug_inlined section entry for this pair.
- SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin();
+ if (!Scope->getScopeNode())
+ return NULL;
+ DIScope DS(Scope->getScopeNode());
+ DISubprogram InlinedSP = getDISubprogram(DS);
+ DIE *OriginDIE = TheCU->getDIE(InlinedSP);
+ if (!OriginDIE) {
+ DEBUG(dbgs() << "Unable to find original DIE for inlined subprogram.");
+ return NULL;
+ }
+
+ SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin();
const MCSymbol *StartLabel = getLabelBeforeInsn(RI->first);
const MCSymbol *EndLabel = getLabelAfterInsn(RI->second);
if (StartLabel == 0 || EndLabel == 0) {
- assert (0 && "Unexpected Start and End labels for a inlined scope!");
+ assert (0 && "Unexpected Start and End labels for a inlined scope!");
return 0;
}
assert(StartLabel->isDefined() &&
@@ -582,26 +322,38 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
assert(EndLabel->isDefined() &&
"Invalid end label for an inlined scope!");
- if (!Scope->getScopeNode())
- return NULL;
- DIScope DS(Scope->getScopeNode());
- DISubprogram InlinedSP = getDISubprogram(DS);
- CompileUnit *TheCU = getCompileUnit(InlinedSP);
- DIE *OriginDIE = TheCU->getDIE(InlinedSP);
- if (!OriginDIE) {
- DEBUG(dbgs() << "Unable to find original DIE for inlined subprogram.");
- return NULL;
- }
DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
dwarf::DW_FORM_ref4, OriginDIE);
- TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, StartLabel);
- TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr, EndLabel);
+ if (Ranges.size() > 1) {
+ // .debug_range section has not been laid out yet. Emit offset in
+ // .debug_range as a uint, size 4, for now. emitDIE will handle
+ // DW_AT_ranges appropriately.
+ TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
+ DebugRangeSymbols.size()
+ * Asm->getTargetData().getPointerSize());
+ for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
+ RE = Ranges.end(); RI != RE; ++RI) {
+ DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
+ DebugRangeSymbols.push_back(getLabelAfterInsn(RI->second));
+ }
+ DebugRangeSymbols.push_back(NULL);
+ DebugRangeSymbols.push_back(NULL);
+ } else {
+ TheCU->addLabel(ScopeDIE, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
+ StartLabel);
+ TheCU->addLabel(ScopeDIE, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr,
+ EndLabel);
+ }
InlinedSubprogramDIEs.insert(OriginDIE);
// Track the start label for this inlined function.
+  // .debug_inlined section specification does not clearly state how
+ // to emit inlined scope that is split into multiple instruction ranges.
+ // For now, use first instruction range and emit low_pc/high_pc pair and
+ // corresponding .debug_inlined section entry for this pair.
DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> >::iterator
I = InlineInfo.find(InlinedSP);
@@ -619,200 +371,51 @@ DIE *DwarfDebug::constructInlinedScopeDIE(DbgScope *Scope) {
return ScopeDIE;
}
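
The same single-versus-multiple-range split now applies to inlined scopes. A compact sketch of just that branch, using stub emitters in place of the real CompileUnit::addLabel/addUInt helpers:

#include <cstdio>
#include <utility>
#include <vector>

// Stand-ins only; real code attaches attributes to a DIE via CompileUnit.
struct DIE {};
typedef std::pair<const char *, const char *> Range;

static void addLabel(DIE *, const char *Attr, const char *L) {
  std::printf("%s = %s\n", Attr, L);
}
static void addRangesOffset(DIE *, unsigned Offset) {
  std::printf("DW_AT_ranges = +%u\n", Offset);
}

// A single contiguous range is described exactly by a low_pc/high_pc pair;
// a scope split into several ranges must point into .debug_ranges instead.
void attachPCInfo(DIE *Scope, const std::vector<Range> &Ranges,
                  unsigned RangeSectionOffset) {
  if (Ranges.size() == 1) {
    addLabel(Scope, "DW_AT_low_pc", Ranges.front().first);
    addLabel(Scope, "DW_AT_high_pc", Ranges.front().second);
  } else {
    addRangesOffset(Scope, RangeSectionOffset);
  }
}
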
-/// isUnsignedDIType - Return true if type encoding is unsigned.
-static bool isUnsignedDIType(DIType Ty) {
- DIDerivedType DTy(Ty);
- if (DTy.Verify())
- return isUnsignedDIType(DTy.getTypeDerivedFrom());
-
- DIBasicType BTy(Ty);
- if (BTy.Verify()) {
- unsigned Encoding = BTy.getEncoding();
- if (Encoding == dwarf::DW_ATE_unsigned ||
- Encoding == dwarf::DW_ATE_unsigned_char)
- return true;
- }
- return false;
-}
-
-/// constructVariableDIE - Construct a DIE for the given DbgVariable.
-DIE *DwarfDebug::constructVariableDIE(DbgVariable *DV, DbgScope *Scope) {
- StringRef Name = DV->getName();
- if (Name.empty())
- return NULL;
-
- // Translate tag to proper Dwarf tag. The result variable is dropped for
- // now.
- unsigned Tag;
- switch (DV->getTag()) {
- case dwarf::DW_TAG_return_variable:
- return NULL;
- case dwarf::DW_TAG_arg_variable:
- Tag = dwarf::DW_TAG_formal_parameter;
- break;
- case dwarf::DW_TAG_auto_variable: // fall thru
- default:
- Tag = dwarf::DW_TAG_variable;
- break;
- }
-
- // Define variable debug information entry.
- DIE *VariableDie = new DIE(Tag);
- CompileUnit *VariableCU = getCompileUnit(DV->getVariable());
- DIE *AbsDIE = NULL;
- DenseMap<const DbgVariable *, const DbgVariable *>::iterator
- V2AVI = VarToAbstractVarMap.find(DV);
- if (V2AVI != VarToAbstractVarMap.end())
- AbsDIE = V2AVI->second->getDIE();
-
- if (AbsDIE)
- VariableCU->addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
- dwarf::DW_FORM_ref4, AbsDIE);
- else {
- VariableCU->addString(VariableDie, dwarf::DW_AT_name, dwarf::DW_FORM_string,
- Name);
- VariableCU->addSourceLine(VariableDie, DV->getVariable());
-
- // Add variable type.
- VariableCU->addType(VariableDie, DV->getType());
- }
-
- if (Tag == dwarf::DW_TAG_formal_parameter && DV->getType().isArtificial())
- VariableCU->addUInt(VariableDie, dwarf::DW_AT_artificial,
- dwarf::DW_FORM_flag, 1);
- else if (DIVariable(DV->getVariable()).isArtificial())
- VariableCU->addUInt(VariableDie, dwarf::DW_AT_artificial,
- dwarf::DW_FORM_flag, 1);
-
- if (Scope->isAbstractScope()) {
- DV->setDIE(VariableDie);
- return VariableDie;
- }
-
- // Add variable address.
-
- unsigned Offset = DV->getDotDebugLocOffset();
- if (Offset != ~0U) {
- VariableCU->addLabel(VariableDie, dwarf::DW_AT_location, dwarf::DW_FORM_data4,
- Asm->GetTempSymbol("debug_loc", Offset));
- DV->setDIE(VariableDie);
- UseDotDebugLocEntry.insert(VariableDie);
- return VariableDie;
- }
-
- // Check if variable is described by a DBG_VALUE instruction.
- DenseMap<const DbgVariable *, const MachineInstr *>::iterator DVI =
- DbgVariableToDbgInstMap.find(DV);
- if (DVI != DbgVariableToDbgInstMap.end()) {
- const MachineInstr *DVInsn = DVI->second;
- bool updated = false;
- // FIXME : Handle getNumOperands != 3
- if (DVInsn->getNumOperands() == 3) {
- if (DVInsn->getOperand(0).isReg()) {
- const MachineOperand RegOp = DVInsn->getOperand(0);
- const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
- if (DVInsn->getOperand(1).isImm() &&
- TRI->getFrameRegister(*Asm->MF) == RegOp.getReg()) {
- unsigned FrameReg = 0;
- const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
- int Offset =
- TFI->getFrameIndexReference(*Asm->MF,
- DVInsn->getOperand(1).getImm(),
- FrameReg);
- MachineLocation Location(FrameReg, Offset);
- VariableCU->addVariableAddress(DV, VariableDie, Location);
-
- } else if (RegOp.getReg())
- VariableCU->addVariableAddress(DV, VariableDie,
- MachineLocation(RegOp.getReg()));
- updated = true;
- }
- else if (DVInsn->getOperand(0).isImm())
- updated =
- VariableCU->addConstantValue(VariableDie, DVInsn->getOperand(0),
- DV->getType());
- else if (DVInsn->getOperand(0).isFPImm())
- updated =
- VariableCU->addConstantFPValue(VariableDie, DVInsn->getOperand(0));
- else if (DVInsn->getOperand(0).isCImm())
- updated =
- VariableCU->addConstantValue(VariableDie,
- DVInsn->getOperand(0).getCImm(),
- isUnsignedDIType(DV->getType()));
- } else {
- VariableCU->addVariableAddress(DV, VariableDie,
- Asm->getDebugValueLocation(DVInsn));
- updated = true;
- }
- if (!updated) {
- // If variableDie is not updated then DBG_VALUE instruction does not
- // have valid variable info.
- delete VariableDie;
- return NULL;
- }
- DV->setDIE(VariableDie);
- return VariableDie;
- }
-
- // .. else use frame index, if available.
- int FI = 0;
- if (findVariableFrameIndex(DV, &FI)) {
- unsigned FrameReg = 0;
- const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
- int Offset =
- TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
- MachineLocation Location(FrameReg, Offset);
- VariableCU->addVariableAddress(DV, VariableDie, Location);
- }
-
- DV->setDIE(VariableDie);
- return VariableDie;
-
-}
-
/// constructScopeDIE - Construct a DIE for this scope.
-DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
+DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
if (!Scope || !Scope->getScopeNode())
return NULL;
SmallVector <DIE *, 8> Children;
// Collect arguments for current function.
- if (Scope == CurrentFnDbgScope)
+ if (LScopes.isCurrentFunctionScope(Scope))
for (unsigned i = 0, N = CurrentFnArguments.size(); i < N; ++i)
if (DbgVariable *ArgDV = CurrentFnArguments[i])
- if (DIE *Arg = constructVariableDIE(ArgDV, Scope))
+ if (DIE *Arg =
+ TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope()))
Children.push_back(Arg);
- // Collect lexical scope childrens first.
- const SmallVector<DbgVariable *, 8> &Variables = Scope->getDbgVariables();
+ // Collect lexical scope children first.
+ const SmallVector<DbgVariable *, 8> &Variables = ScopeVariables.lookup(Scope);
for (unsigned i = 0, N = Variables.size(); i < N; ++i)
- if (DIE *Variable = constructVariableDIE(Variables[i], Scope))
+ if (DIE *Variable =
+ TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope()))
Children.push_back(Variable);
- const SmallVector<DbgScope *, 4> &Scopes = Scope->getScopes();
+ const SmallVector<LexicalScope *, 4> &Scopes = Scope->getChildren();
for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
- if (DIE *Nested = constructScopeDIE(Scopes[j]))
+ if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
Children.push_back(Nested);
DIScope DS(Scope->getScopeNode());
DIE *ScopeDIE = NULL;
if (Scope->getInlinedAt())
- ScopeDIE = constructInlinedScopeDIE(Scope);
+ ScopeDIE = constructInlinedScopeDIE(TheCU, Scope);
else if (DS.isSubprogram()) {
ProcessedSPNodes.insert(DS);
if (Scope->isAbstractScope()) {
- ScopeDIE = getCompileUnit(DS)->getDIE(DS);
+ ScopeDIE = TheCU->getDIE(DS);
// Note down abstract DIE.
if (ScopeDIE)
AbstractSPDies.insert(std::make_pair(DS, ScopeDIE));
}
else
- ScopeDIE = updateSubprogramScopeDIE(DS);
+ ScopeDIE = updateSubprogramScopeDIE(TheCU, DS);
}
else {
// There is no need to emit empty lexical block DIE.
if (Children.empty())
return NULL;
- ScopeDIE = constructLexicalScopeDIE(Scope);
+ ScopeDIE = constructLexicalScopeDIE(TheCU, Scope);
}
if (!ScopeDIE) return NULL;
@@ -823,7 +426,7 @@ DIE *DwarfDebug::constructScopeDIE(DbgScope *Scope) {
ScopeDIE->addChild(*I);
if (DS.isSubprogram())
- getCompileUnit(DS)->addPubTypes(DISubprogram(DS));
+ TheCU->addPubTypes(DISubprogram(DS));
return ScopeDIE;
}
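
constructScopeDIE is the recursion that ties these pieces together: variable DIEs first, nested scopes second, and an empty lexical block is dropped entirely. A simplified model, with stand-in types rather than the real DIE ownership scheme:

#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

struct ScopeNode {
  bool IsLexicalBlock;
  std::vector<int> Variables;            // stand-in for DbgVariable*
  std::vector<ScopeNode> Children;
};

struct DIE {
  std::vector<std::unique_ptr<DIE> > Kids;
};

std::unique_ptr<DIE> constructScopeDIE(const ScopeNode &S) {
  std::vector<std::unique_ptr<DIE> > Kids;
  for (std::size_t i = 0; i != S.Variables.size(); ++i)
    Kids.push_back(std::unique_ptr<DIE>(new DIE()));   // variable DIE
  for (std::size_t i = 0; i != S.Children.size(); ++i)
    if (std::unique_ptr<DIE> Nested = constructScopeDIE(S.Children[i]))
      Kids.push_back(std::move(Nested));               // depth-first recursion
  // An empty lexical block adds no information, so it is pruned -- the
  // same "Children.empty()" early return as in the hunk above.
  if (S.IsLexicalBlock && Kids.empty())
    return std::unique_ptr<DIE>();
  std::unique_ptr<DIE> ScopeDIE(new DIE());
  ScopeDIE->Kids = std::move(Kids);
  return ScopeDIE;
}
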
@@ -862,7 +465,7 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName,
/// constructCompileUnit - Create new CompileUnit for the given
/// metadata node with tag DW_TAG_compile_unit.
-void DwarfDebug::constructCompileUnit(const MDNode *N) {
+CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
DICompileUnit DIUnit(N);
StringRef FN = DIUnit.getFilename();
StringRef Dir = DIUnit.getDirectory();
@@ -893,7 +496,8 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) {
StringRef Flags = DIUnit.getFlags();
if (!Flags.empty())
- NewCU->addString(Die, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string, Flags);
+ NewCU->addString(Die, dwarf::DW_AT_APPLE_flags, dwarf::DW_FORM_string,
+ Flags);
unsigned RVer = DIUnit.getRunTimeVersion();
if (RVer)
@@ -903,159 +507,19 @@ void DwarfDebug::constructCompileUnit(const MDNode *N) {
if (!FirstCU)
FirstCU = NewCU;
CUMap.insert(std::make_pair(N, NewCU));
-}
-
-/// getCompielUnit - Get CompileUnit DIE.
-CompileUnit *DwarfDebug::getCompileUnit(const MDNode *N) const {
- assert (N && "Invalid DwarfDebug::getCompileUnit argument!");
- DIDescriptor D(N);
- const MDNode *CUNode = NULL;
- if (D.isCompileUnit())
- CUNode = N;
- else if (D.isSubprogram())
- CUNode = DISubprogram(N).getCompileUnit();
- else if (D.isType())
- CUNode = DIType(N).getCompileUnit();
- else if (D.isGlobalVariable())
- CUNode = DIGlobalVariable(N).getCompileUnit();
- else if (D.isVariable())
- CUNode = DIVariable(N).getCompileUnit();
- else if (D.isNameSpace())
- CUNode = DINameSpace(N).getCompileUnit();
- else if (D.isFile())
- CUNode = DIFile(N).getCompileUnit();
- else
- return FirstCU;
-
- DenseMap<const MDNode *, CompileUnit *>::const_iterator I
- = CUMap.find(CUNode);
- if (I == CUMap.end())
- return FirstCU;
- return I->second;
-}
-
-// Return const exprssion if value is a GEP to access merged global
-// constant. e.g.
-// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0)
-static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
- const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V);
- if (!CE || CE->getNumOperands() != 3 ||
- CE->getOpcode() != Instruction::GetElementPtr)
- return NULL;
-
- // First operand points to a global value.
- if (!isa<GlobalValue>(CE->getOperand(0)))
- return NULL;
-
- // Second operand is zero.
- const ConstantInt *CI =
- dyn_cast_or_null<ConstantInt>(CE->getOperand(1));
- if (!CI || !CI->isZero())
- return NULL;
-
- // Third operand is offset.
- if (!isa<ConstantInt>(CE->getOperand(2)))
- return NULL;
-
- return CE;
-}
-
-/// constructGlobalVariableDIE - Construct global variable DIE.
-void DwarfDebug::constructGlobalVariableDIE(const MDNode *N) {
- DIGlobalVariable GV(N);
-
- // If debug information is malformed then ignore it.
- if (GV.Verify() == false)
- return;
-
- // Check for pre-existence.
- CompileUnit *TheCU = getCompileUnit(N);
- if (TheCU->getDIE(GV))
- return;
-
- DIType GTy = GV.getType();
- DIE *VariableDIE = new DIE(GV.getTag());
-
- bool isGlobalVariable = GV.getGlobal() != NULL;
-
- // Add name.
- TheCU->addString(VariableDIE, dwarf::DW_AT_name, dwarf::DW_FORM_string,
- GV.getDisplayName());
- StringRef LinkageName = GV.getLinkageName();
- if (!LinkageName.empty() && isGlobalVariable)
- TheCU->addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
- dwarf::DW_FORM_string,
- getRealLinkageName(LinkageName));
- // Add type.
- TheCU->addType(VariableDIE, GTy);
-
- // Add scoping info.
- if (!GV.isLocalToUnit()) {
- TheCU->addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
- // Expose as global.
- TheCU->addGlobal(GV.getName(), VariableDIE);
- }
- // Add line number info.
- TheCU->addSourceLine(VariableDIE, GV);
- // Add to map.
- TheCU->insertDIE(N, VariableDIE);
- // Add to context owner.
- DIDescriptor GVContext = GV.getContext();
- TheCU->addToContextOwner(VariableDIE, GVContext);
- // Add location.
- if (isGlobalVariable) {
- DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
- TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
- TheCU->addLabel(Block, 0, dwarf::DW_FORM_udata,
- Asm->Mang->getSymbol(GV.getGlobal()));
- // Do not create specification DIE if context is either compile unit
- // or a subprogram.
- if (GV.isDefinition() && !GVContext.isCompileUnit() &&
- !GVContext.isFile() && !isSubprogramContext(GVContext)) {
- // Create specification DIE.
- DIE *VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
- TheCU->addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
- dwarf::DW_FORM_ref4, VariableDIE);
- TheCU->addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
- TheCU->addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
- TheCU->addDie(VariableSpecDIE);
- } else {
- TheCU->addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
- }
- } else if (const ConstantInt *CI =
- dyn_cast_or_null<ConstantInt>(GV.getConstant()))
- TheCU->addConstantValue(VariableDIE, CI, isUnsignedDIType(GTy));
- else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
- // GV is a merged global.
- DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
- TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
- TheCU->addLabel(Block, 0, dwarf::DW_FORM_udata,
- Asm->Mang->getSymbol(cast<GlobalValue>(CE->getOperand(0))));
- ConstantInt *CII = cast<ConstantInt>(CE->getOperand(2));
- TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
- TheCU->addUInt(Block, 0, dwarf::DW_FORM_udata, CII->getZExtValue());
- TheCU->addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
- TheCU->addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
- }
-
- return;
+ return NewCU;
}
/// construct SubprogramDIE - Construct subprogram DIE.
-void DwarfDebug::constructSubprogramDIE(const MDNode *N) {
+void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
+ const MDNode *N) {
DISubprogram SP(N);
-
- // Check for pre-existence.
- CompileUnit *TheCU = getCompileUnit(N);
- if (TheCU->getDIE(N))
- return;
-
if (!SP.isDefinition())
// This is a method declaration which will be handled while constructing
// class type.
return;
- DIE *SubprogramDie = createSubprogramDIE(SP);
+ DIE *SubprogramDie = TheCU->getOrCreateSubprogramDIE(SP);
// Add to map.
TheCU->insertDIE(N, SubprogramDie);
@@ -1066,71 +530,115 @@ void DwarfDebug::constructSubprogramDIE(const MDNode *N) {
// Expose as global.
TheCU->addGlobal(SP.getName(), SubprogramDie);
+ SPMap[N] = TheCU;
return;
}
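
The new SPMap entry recorded above is what later lets endFunction recover the owning unit without the deleted getCompileUnit() walk. As a sketch, assuming a string key in place of the subprogram MDNode*:

#include <cassert>
#include <map>
#include <string>

// Remember each subprogram's owning compile unit when its DIE is created,
// so later passes can look the unit up directly instead of re-deriving it
// from the debug metadata graph.
struct CompileUnit;
static std::map<std::string, CompileUnit *> SPMap;

void noteSubprogram(const std::string &SP, CompileUnit *CU) {
  SPMap[SP] = CU;                        // mirrors "SPMap[N] = TheCU" above
}

CompileUnit *unitForSubprogram(const std::string &SP) {
  std::map<std::string, CompileUnit *>::const_iterator I = SPMap.find(SP);
  assert(I != SPMap.end() && "Unable to find compile unit!");
  return I->second;
}
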
+/// collectInfoFromNamedMDNodes - Collect debug info from named mdnodes such
+/// as llvm.dbg.enum and llvm.dbg.ty
+void DwarfDebug::collectInfoFromNamedMDNodes(Module *M) {
+ if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.sp"))
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *N = NMD->getOperand(i);
+ if (CompileUnit *CU = CUMap.lookup(DISubprogram(N).getCompileUnit()))
+ constructSubprogramDIE(CU, N);
+ }
+
+ if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.gv"))
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *N = NMD->getOperand(i);
+ if (CompileUnit *CU = CUMap.lookup(DIGlobalVariable(N).getCompileUnit()))
+ CU->createGlobalVariableDIE(N);
+ }
+
+ if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.enum"))
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ DIType Ty(NMD->getOperand(i));
+ if (CompileUnit *CU = CUMap.lookup(Ty.getCompileUnit()))
+ CU->getOrCreateTypeDIE(Ty);
+ }
+
+ if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.ty"))
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ DIType Ty(NMD->getOperand(i));
+ if (CompileUnit *CU = CUMap.lookup(Ty.getCompileUnit()))
+ CU->getOrCreateTypeDIE(Ty);
+ }
+}
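
collectInfoFromNamedMDNodes repeats one pattern four times: find a named list, and for each entry act only when its compile unit is already in CUMap. Condensed into a sketch with stand-in types:

#include <cstddef>
#include <map>
#include <string>
#include <vector>

struct Entry { std::string CUName; };
typedef std::map<std::string, std::vector<Entry> > NamedLists;
typedef std::map<std::string, int> UnitMap;  // int stands in for CompileUnit*

void collectFromNamedLists(const NamedLists &Module, const UnitMap &CUMap) {
  static const char *const Lists[] = {"llvm.dbg.sp", "llvm.dbg.gv",
                                      "llvm.dbg.enum", "llvm.dbg.ty"};
  for (std::size_t i = 0; i != sizeof(Lists) / sizeof(Lists[0]); ++i) {
    NamedLists::const_iterator LI = Module.find(Lists[i]);
    if (LI == Module.end())
      continue;                     // this anchor is absent from the module
    for (std::size_t j = 0; j != LI->second.size(); ++j) {
      const Entry &E = LI->second[j];
      if (!CUMap.count(E.CUName))
        continue;                   // unit never constructed: skip the entry
      // ... create the subprogram/global/type DIE in that unit ...
    }
  }
}
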
+
+/// collectLegacyDebugInfo - Collect debug info using DebugInfoFinder.
+/// FIXME - Remove this when dragon-egg and llvm-gcc switch to DIBuilder.
+bool DwarfDebug::collectLegacyDebugInfo(Module *M) {
+ DebugInfoFinder DbgFinder;
+ DbgFinder.processModule(*M);
+
+ bool HasDebugInfo = false;
+ // Scan all the compile-units to see if there are any marked as the main
+ // unit. If not, we do not generate debug info.
+ for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+ E = DbgFinder.compile_unit_end(); I != E; ++I) {
+ if (DICompileUnit(*I).isMain()) {
+ HasDebugInfo = true;
+ break;
+ }
+ }
+ if (!HasDebugInfo) return false;
+
+ // Create all the compile unit DIEs.
+ for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
+ E = DbgFinder.compile_unit_end(); I != E; ++I)
+ constructCompileUnit(*I);
+
+ // Create DIEs for each global variable.
+ for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+ E = DbgFinder.global_variable_end(); I != E; ++I) {
+ const MDNode *N = *I;
+ if (CompileUnit *CU = CUMap.lookup(DIGlobalVariable(N).getCompileUnit()))
+ CU->createGlobalVariableDIE(N);
+ }
+
+ // Create DIEs for each subprogram.
+ for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
+ E = DbgFinder.subprogram_end(); I != E; ++I) {
+ const MDNode *N = *I;
+ if (CompileUnit *CU = CUMap.lookup(DISubprogram(N).getCompileUnit()))
+ constructSubprogramDIE(CU, N);
+ }
+
+ return HasDebugInfo;
+}
+
/// beginModule - Emit all Dwarf sections that should come prior to the
/// content. Create global DIEs and emit initial debug info sections.
-/// This is inovked by the target AsmPrinter.
+/// This is invoked by the target AsmPrinter.
void DwarfDebug::beginModule(Module *M) {
if (DisableDebugInfoPrinting)
return;
- // If module has named metadata anchors then use them, otherwise scan the module
- // using debug info finder to collect debug info.
+ // If module has named metadata anchors then use them, otherwise scan the
+ // module using debug info finder to collect debug info.
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
if (CU_Nodes) {
-
- NamedMDNode *GV_Nodes = M->getNamedMetadata("llvm.dbg.gv");
- NamedMDNode *SP_Nodes = M->getNamedMetadata("llvm.dbg.sp");
- if (!GV_Nodes && !SP_Nodes)
- // If there are not any global variables or any functions then
- // there is not any debug info in this module.
- return;
-
- for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i)
- constructCompileUnit(CU_Nodes->getOperand(i));
-
- if (GV_Nodes)
- for (unsigned i = 0, e = GV_Nodes->getNumOperands(); i != e; ++i)
- constructGlobalVariableDIE(GV_Nodes->getOperand(i));
-
- if (SP_Nodes)
- for (unsigned i = 0, e = SP_Nodes->getNumOperands(); i != e; ++i)
- constructSubprogramDIE(SP_Nodes->getOperand(i));
-
- } else {
-
- DebugInfoFinder DbgFinder;
- DbgFinder.processModule(*M);
-
- bool HasDebugInfo = false;
- // Scan all the compile-units to see if there are any marked as the main unit.
- // if not, we do not generate debug info.
- for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
- E = DbgFinder.compile_unit_end(); I != E; ++I) {
- if (DICompileUnit(*I).isMain()) {
- HasDebugInfo = true;
- break;
- }
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ DICompileUnit CUNode(CU_Nodes->getOperand(i));
+ CompileUnit *CU = constructCompileUnit(CUNode);
+ DIArray GVs = CUNode.getGlobalVariables();
+ for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i)
+ CU->createGlobalVariableDIE(GVs.getElement(i));
+ DIArray SPs = CUNode.getSubprograms();
+ for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
+ constructSubprogramDIE(CU, SPs.getElement(i));
+ DIArray EnumTypes = CUNode.getEnumTypes();
+ for (unsigned i = 0, e = EnumTypes.getNumElements(); i != e; ++i)
+ CU->getOrCreateTypeDIE(EnumTypes.getElement(i));
+ DIArray RetainedTypes = CUNode.getRetainedTypes();
+ for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i)
+ CU->getOrCreateTypeDIE(RetainedTypes.getElement(i));
}
- if (!HasDebugInfo) return;
-
- // Create all the compile unit DIEs.
- for (DebugInfoFinder::iterator I = DbgFinder.compile_unit_begin(),
- E = DbgFinder.compile_unit_end(); I != E; ++I)
- constructCompileUnit(*I);
-
- // Create DIEs for each global variable.
- for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
- E = DbgFinder.global_variable_end(); I != E; ++I)
- constructGlobalVariableDIE(*I);
-
- // Create DIEs for each subprogram.
- for (DebugInfoFinder::iterator I = DbgFinder.subprogram_begin(),
- E = DbgFinder.subprogram_end(); I != E; ++I)
- constructSubprogramDIE(*I);
- }
+ } else if (!collectLegacyDebugInfo(M))
+ return;
+
+ collectInfoFromNamedMDNodes(M);
// Tell MMI that we have debug info.
MMI->setDebugInfoAvailability(true);
@@ -1138,19 +646,6 @@ void DwarfDebug::beginModule(Module *M) {
// Emit initial sections.
EmitSectionLabels();
- //getOrCreateTypeDIE
- if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.enum"))
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
- DIType Ty(NMD->getOperand(i));
- getCompileUnit(Ty)->getOrCreateTypeDIE(Ty);
- }
-
- if (NamedMDNode *NMD = M->getNamedMetadata("llvm.dbg.ty"))
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
- DIType Ty(NMD->getOperand(i));
- getCompileUnit(Ty)->getOrCreateTypeDIE(Ty);
- }
-
// Prime section data.
SectionMap.insert(Asm->getObjFileLowering().getTextSection());
}
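
beginModule now has a three-stage shape: prefer the llvm.dbg.cu anchors, fall back to the legacy DebugInfoFinder scan, and in either successful case finish with the named-list sweep. A stub-driven sketch of that control flow (the real predicates read module-level metadata):

#include <cstdio>

static bool haveCUNodes() { return true; }              // "llvm.dbg.cu" present?
static bool collectLegacyDebugInfo() { return false; }  // DebugInfoFinder path
static void collectFromCUNodes() { std::puts("CU-node path"); }
static void collectInfoFromNamedMDNodes() { std::puts("named-list sweep"); }

void beginModule() {
  if (haveCUNodes())
    collectFromCUNodes();               // preferred: DIBuilder-style anchors
  else if (!collectLegacyDebugInfo())
    return;                             // no debug info found at all
  collectInfoFromNamedMDNodes();        // runs on both successful paths
}
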
@@ -1160,38 +655,38 @@ void DwarfDebug::beginModule(Module *M) {
void DwarfDebug::endModule() {
if (!FirstCU) return;
const Module *M = MMI->getModule();
- DenseMap<const MDNode *, DbgScope *> DeadFnScopeMap;
- if (NamedMDNode *AllSPs = M->getNamedMetadata("llvm.dbg.sp")) {
- for (unsigned SI = 0, SE = AllSPs->getNumOperands(); SI != SE; ++SI) {
- if (ProcessedSPNodes.count(AllSPs->getOperand(SI)) != 0) continue;
- DISubprogram SP(AllSPs->getOperand(SI));
- if (!SP.Verify()) continue;
-
- // Collect info for variables that were optimized out.
- if (!SP.isDefinition()) continue;
- StringRef FName = SP.getLinkageName();
- if (FName.empty())
- FName = SP.getName();
- NamedMDNode *NMD = getFnSpecificMDNode(*(MMI->getModule()), FName);
- if (!NMD) continue;
- unsigned E = NMD->getNumOperands();
- if (!E) continue;
- DbgScope *Scope = new DbgScope(NULL, DIDescriptor(SP), NULL);
- DeadFnScopeMap[SP] = Scope;
- for (unsigned I = 0; I != E; ++I) {
- DIVariable DV(NMD->getOperand(I));
- if (!DV.Verify()) continue;
- Scope->addVariable(new DbgVariable(DV));
- }
+ DenseMap<const MDNode *, LexicalScope *> DeadFnScopeMap;
- // Construct subprogram DIE and add variables DIEs.
- constructSubprogramDIE(SP);
- DIE *ScopeDIE = getCompileUnit(SP)->getDIE(SP);
- const SmallVector<DbgVariable *, 8> &Variables = Scope->getDbgVariables();
- for (unsigned i = 0, N = Variables.size(); i < N; ++i) {
- DIE *VariableDIE = constructVariableDIE(Variables[i], Scope);
- if (VariableDIE)
- ScopeDIE->addChild(VariableDIE);
+ // Collect info for variables that were optimized out.
+ if (NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu")) {
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ DICompileUnit TheCU(CU_Nodes->getOperand(i));
+ DIArray Subprograms = TheCU.getSubprograms();
+ for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) {
+ DISubprogram SP(Subprograms.getElement(i));
+ if (ProcessedSPNodes.count(SP) != 0) continue;
+ if (!SP.Verify()) continue;
+ if (!SP.isDefinition()) continue;
+ DIArray Variables = SP.getVariables();
+ if (Variables.getNumElements() == 0) continue;
+
+ LexicalScope *Scope =
+ new LexicalScope(NULL, DIDescriptor(SP), NULL, false);
+ DeadFnScopeMap[SP] = Scope;
+
+ // Construct subprogram DIE and add variables DIEs.
+ CompileUnit *SPCU = CUMap.lookup(TheCU);
+ assert (SPCU && "Unable to find Compile Unit!");
+ constructSubprogramDIE(SPCU, SP);
+ DIE *ScopeDIE = SPCU->getDIE(SP);
+ for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
+ DIVariable DV(Variables.getElement(vi));
+ if (!DV.Verify()) continue;
+ DbgVariable *NewVar = new DbgVariable(DV, NULL);
+ if (DIE *VariableDIE =
+ SPCU->constructVariableDIE(NewVar, Scope->isAbstractScope()))
+ ScopeDIE->addChild(VariableDIE);
+ }
}
}
}
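
The rewritten endModule block above recovers variables of functions that never reached codegen. Its essential filter chain, modeled with stand-in types:

#include <cstddef>
#include <set>
#include <string>
#include <vector>

// A subprogram never seen during codegen (e.g. optimized away entirely)
// still gets DIEs for its declared variables, so the debugger can report
// them as optimized out.
struct SPInfo {
  std::string Name;
  bool IsDefinition;
  std::vector<std::string> Variables;
};

void emitDeadFunctionVariables(const std::vector<SPInfo> &Subprograms,
                               const std::set<std::string> &Processed) {
  for (std::size_t i = 0; i != Subprograms.size(); ++i) {
    const SPInfo &SP = Subprograms[i];
    if (Processed.count(SP.Name))
      continue;                         // already emitted with its function
    if (!SP.IsDefinition || SP.Variables.empty())
      continue;                         // nothing to describe
    for (std::size_t v = 0; v != SP.Variables.size(); ++v) {
      // ... construct a variable DIE under a fresh abstract scope ...
    }
  }
}
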
@@ -1203,15 +698,12 @@ void DwarfDebug::endModule() {
FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
}
- for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
- CE = ContainingTypeMap.end(); CI != CE; ++CI) {
- DIE *SPDie = CI->first;
- const MDNode *N = dyn_cast_or_null<MDNode>(CI->second);
- if (!N) continue;
- DIE *NDie = getCompileUnit(N)->getDIE(N);
- if (!NDie) continue;
- getCompileUnit(N)->addDIEEntry(SPDie, dwarf::DW_AT_containing_type,
- dwarf::DW_FORM_ref4, NDie);
+ // Emit DW_AT_containing_type attribute to connect types with their
+ // vtable holding type.
+ for (DenseMap<const MDNode *, CompileUnit *>::iterator CUI = CUMap.begin(),
+ CUE = CUMap.end(); CUI != CUE; ++CUI) {
+ CompileUnit *TheCU = CUI->second;
+ TheCU->constructContainingTypeDIEs();
}
// Standard sections final addresses.
@@ -1261,6 +753,7 @@ void DwarfDebug::endModule() {
// clean up.
DeleteContainerSeconds(DeadFnScopeMap);
+ SPMap.clear();
for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
E = CUMap.end(); I != E; ++I)
delete I->second;
@@ -1268,29 +761,30 @@ void DwarfDebug::endModule() {
}
/// findAbstractVariable - Find abstract variable, if any, associated with Var.
-DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &Var,
+DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV,
DebugLoc ScopeLoc) {
-
+ LLVMContext &Ctx = DV->getContext();
+  // More than one inlined variable corresponds to one abstract variable.
+ DIVariable Var = cleanseInlinedVariable(DV, Ctx);
DbgVariable *AbsDbgVariable = AbstractVariables.lookup(Var);
if (AbsDbgVariable)
return AbsDbgVariable;
- LLVMContext &Ctx = Var->getContext();
- DbgScope *Scope = AbstractScopes.lookup(ScopeLoc.getScope(Ctx));
+ LexicalScope *Scope = LScopes.findAbstractScope(ScopeLoc.getScope(Ctx));
if (!Scope)
return NULL;
- AbsDbgVariable = new DbgVariable(Var);
- Scope->addVariable(AbsDbgVariable);
+ AbsDbgVariable = new DbgVariable(Var, NULL);
+ addScopeVariable(Scope, AbsDbgVariable);
AbstractVariables[Var] = AbsDbgVariable;
return AbsDbgVariable;
}
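
findAbstractVariable is a canonicalize-then-cache lookup: reduce the inlined variable to its abstract form, then create the abstract DbgVariable at most once. A sketch in which the '@' suffix convention for inlined keys is purely hypothetical:

#include <map>
#include <string>

struct DbgVar { std::string Key; };
static std::map<std::string, DbgVar *> AbstractVariables;

DbgVar *findAbstractVariable(const std::string &InlinedKey) {
  // Hypothetical canonicalization: drop the inlined-at suffix, so every
  // concrete inlined copy maps to the same abstract key.
  std::string Key = InlinedKey.substr(0, InlinedKey.find('@'));
  std::map<std::string, DbgVar *>::iterator I = AbstractVariables.find(Key);
  if (I != AbstractVariables.end())
    return I->second;                   // cache hit: reuse abstract variable
  DbgVar *AV = new DbgVar();            // create once per abstract variable
  AV->Key = Key;
  AbstractVariables[Key] = AV;
  return AV;
}
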
-/// addCurrentFnArgument - If Var is an current function argument that add
-/// it in CurrentFnArguments list.
+/// addCurrentFnArgument - If Var is a current function argument then add
+/// it to CurrentFnArguments list.
bool DwarfDebug::addCurrentFnArgument(const MachineFunction *MF,
- DbgVariable *Var, DbgScope *Scope) {
- if (Scope != CurrentFnDbgScope)
+ DbgVariable *Var, LexicalScope *Scope) {
+ if (!LScopes.isCurrentFunctionScope(Scope))
return false;
DIVariable DV = Var->getVariable();
if (DV.getTag() != dwarf::DW_TAG_arg_variable)
@@ -1313,7 +807,7 @@ bool DwarfDebug::addCurrentFnArgument(const MachineFunction *MF,
/// collectVariableInfoFromMMITable - Collect variable information from
/// side table maintained by MMI.
void
-DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction * MF,
+DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction *MF,
SmallPtrSet<const MDNode *, 16> &Processed) {
MachineModuleInfo::VariableDbgInfoMapTy &VMap = MMI->getVariableDbgInfo();
for (MachineModuleInfo::VariableDbgInfoMapTy::iterator VI = VMap.begin(),
@@ -1324,21 +818,19 @@ DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction * MF,
DIVariable DV(Var);
const std::pair<unsigned, DebugLoc> &VP = VI->second;
- DbgScope *Scope = findDbgScope(VP.second);
+ LexicalScope *Scope = LScopes.findLexicalScope(VP.second);
// If variable scope is not found then skip this variable.
if (Scope == 0)
continue;
DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VP.second);
- DbgVariable *RegVar = new DbgVariable(DV);
- recordVariableFrameIndex(RegVar, VP.first);
+ DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable);
+ RegVar->setFrameIndex(VP.first);
if (!addCurrentFnArgument(MF, RegVar, Scope))
- Scope->addVariable(RegVar);
- if (AbsDbgVariable) {
- recordVariableFrameIndex(AbsDbgVariable, VP.first);
- VarToAbstractVarMap[RegVar] = AbsDbgVariable;
- }
+ addScopeVariable(Scope, RegVar);
+ if (AbsDbgVariable)
+ AbsDbgVariable->setFrameIndex(VP.first);
}
}
@@ -1351,7 +843,7 @@ static bool isDbgValueInDefinedReg(const MachineInstr *MI) {
MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0;
}
-/// getDebugLocEntry - Get .debug_loc entry for the instraction range starting
+/// getDebugLocEntry - Get .debug_loc entry for the instruction range starting
/// at MI.
static DotDebugLocEntry getDebugLocEntry(AsmPrinter *Asm,
const MCSymbol *FLabel,
@@ -1379,7 +871,7 @@ static DotDebugLocEntry getDebugLocEntry(AsmPrinter *Asm,
return DotDebugLocEntry();
}
-/// collectVariableInfo - Populate DbgScope entries with variables' info.
+/// collectVariableInfo - Find variables for each lexical scope.
void
DwarfDebug::collectVariableInfo(const MachineFunction *MF,
SmallPtrSet<const MDNode *, 16> &Processed) {
@@ -1402,30 +894,37 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
const MachineInstr *MInsn = History.front();
DIVariable DV(Var);
- DbgScope *Scope = NULL;
+ LexicalScope *Scope = NULL;
if (DV.getTag() == dwarf::DW_TAG_arg_variable &&
DISubprogram(DV.getContext()).describes(MF->getFunction()))
- Scope = CurrentFnDbgScope;
- else
- Scope = findDbgScope(MInsn->getDebugLoc());
+ Scope = LScopes.getCurrentFunctionScope();
+ else {
+ if (DV.getVersion() <= LLVMDebugVersion9)
+ Scope = LScopes.findLexicalScope(MInsn->getDebugLoc());
+ else {
+ if (MDNode *IA = DV.getInlinedAt())
+ Scope = LScopes.findInlinedScope(DebugLoc::getFromDILocation(IA));
+ else
+ Scope = LScopes.findLexicalScope(cast<MDNode>(DV->getOperand(1)));
+ }
+ }
// If variable scope is not found then skip this variable.
if (!Scope)
continue;
Processed.insert(DV);
assert(MInsn->isDebugValue() && "History must begin with debug value");
- DbgVariable *RegVar = new DbgVariable(DV);
+ DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc());
+ DbgVariable *RegVar = new DbgVariable(DV, AbsVar);
if (!addCurrentFnArgument(MF, RegVar, Scope))
- Scope->addVariable(RegVar);
- if (DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc())) {
- DbgVariableToDbgInstMap[AbsVar] = MInsn;
- VarToAbstractVarMap[RegVar] = AbsVar;
- }
+ addScopeVariable(Scope, RegVar);
+ if (AbsVar)
+ AbsVar->setMInsn(MInsn);
// Simple ranges that are fully coalesced.
if (History.size() <= 1 || (History.size() == 2 &&
MInsn->isIdenticalTo(History.back()))) {
- DbgVariableToDbgInstMap[RegVar] = MInsn;
+ RegVar->setMInsn(MInsn);
continue;
}
@@ -1471,16 +970,14 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
}
// Collect info for variables that were optimized out.
- const Function *F = MF->getFunction();
- if (NamedMDNode *NMD = getFnSpecificMDNode(*(F->getParent()), F->getName())) {
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
- DIVariable DV(cast<MDNode>(NMD->getOperand(i)));
- if (!DV || !Processed.insert(DV))
- continue;
- DbgScope *Scope = DbgScopeMap.lookup(DV.getContext());
- if (Scope)
- Scope->addVariable(new DbgVariable(DV));
- }
+ LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+ DIArray Variables = DISubprogram(FnScope->getScopeNode()).getVariables();
+ for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
+ DIVariable DV(Variables.getElement(i));
+ if (!DV || !DV.Verify() || !Processed.insert(DV))
+ continue;
+ if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext()))
+ addScopeVariable(Scope, new DbgVariable(DV, NULL));
}
}
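
The nested if/else in collectVariableInfo encodes a three-way scope-resolution ladder. Flattened into a sketch with stubbed lookups:

#include <string>

// Arguments of the current function land in the function scope; inlined
// variables resolve through their inlined-at location; everything else
// through the lexical scope of the defining instruction. Stand-ins only.
struct Scope {};
static Scope FnScope, InlineScope, BlockScope;

static Scope *currentFunctionScope() { return &FnScope; }
static Scope *findInlinedScope(const std::string &) { return &InlineScope; }
static Scope *findLexicalScope(const std::string &) { return &BlockScope; }

Scope *resolveVariableScope(bool IsArgOfCurrentFn,
                            const std::string &InlinedAt,
                            const std::string &InsnLoc) {
  if (IsArgOfCurrentFn)
    return currentFunctionScope();
  if (!InlinedAt.empty())
    return findInlinedScope(InlinedAt);
  return findLexicalScope(InsnLoc);     // may fail; variable is then skipped
}
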
@@ -1561,237 +1058,33 @@ void DwarfDebug::endInstruction(const MachineInstr *MI) {
I->second = PrevLabel;
}
-/// getOrCreateDbgScope - Create DbgScope for the scope.
-DbgScope *DwarfDebug::getOrCreateDbgScope(DebugLoc DL) {
- LLVMContext &Ctx = Asm->MF->getFunction()->getContext();
- MDNode *Scope = NULL;
- MDNode *InlinedAt = NULL;
- DL.getScopeAndInlinedAt(Scope, InlinedAt, Ctx);
-
- if (!InlinedAt) {
- DbgScope *WScope = DbgScopeMap.lookup(Scope);
- if (WScope)
- return WScope;
- WScope = new DbgScope(NULL, DIDescriptor(Scope), NULL);
- DbgScopeMap.insert(std::make_pair(Scope, WScope));
- if (DIDescriptor(Scope).isLexicalBlock()) {
- DbgScope *Parent =
- getOrCreateDbgScope(DebugLoc::getFromDILexicalBlock(Scope));
- WScope->setParent(Parent);
- Parent->addScope(WScope);
- } else if (DIDescriptor(Scope).isSubprogram()
- && DISubprogram(Scope).describes(Asm->MF->getFunction()))
- CurrentFnDbgScope = WScope;
-
- return WScope;
- }
-
- getOrCreateAbstractScope(Scope);
- DbgScope *WScope = DbgScopeMap.lookup(InlinedAt);
- if (WScope)
- return WScope;
-
- WScope = new DbgScope(NULL, DIDescriptor(Scope), InlinedAt);
- DbgScopeMap.insert(std::make_pair(InlinedAt, WScope));
- InlinedDbgScopeMap[DebugLoc::getFromDILocation(InlinedAt)] = WScope;
- DbgScope *Parent =
- getOrCreateDbgScope(DebugLoc::getFromDILocation(InlinedAt));
- WScope->setParent(Parent);
- Parent->addScope(WScope);
- return WScope;
-}
-
-/// calculateDominanceGraph - Calculate dominance graph for DbgScope
-/// hierarchy.
-static void calculateDominanceGraph(DbgScope *Scope) {
- assert (Scope && "Unable to calculate scop edominance graph!");
- SmallVector<DbgScope *, 4> WorkStack;
- WorkStack.push_back(Scope);
- unsigned Counter = 0;
- while (!WorkStack.empty()) {
- DbgScope *WS = WorkStack.back();
- const SmallVector<DbgScope *, 4> &Children = WS->getScopes();
- bool visitedChildren = false;
- for (SmallVector<DbgScope *, 4>::const_iterator SI = Children.begin(),
- SE = Children.end(); SI != SE; ++SI) {
- DbgScope *ChildScope = *SI;
- if (!ChildScope->getDFSOut()) {
- WorkStack.push_back(ChildScope);
- visitedChildren = true;
- ChildScope->setDFSIn(++Counter);
- break;
- }
- }
- if (!visitedChildren) {
- WorkStack.pop_back();
- WS->setDFSOut(++Counter);
- }
- }
-}
-
-/// printDbgScopeInfo - Print DbgScope info for each machine instruction.
-static
-void printDbgScopeInfo(const MachineFunction *MF,
- DenseMap<const MachineInstr *, DbgScope *> &MI2ScopeMap)
-{
-#ifndef NDEBUG
- LLVMContext &Ctx = MF->getFunction()->getContext();
- unsigned PrevDFSIn = 0;
- for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
- I != E; ++I) {
- for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
- II != IE; ++II) {
- const MachineInstr *MInsn = II;
- MDNode *Scope = NULL;
- MDNode *InlinedAt = NULL;
-
- // Check if instruction has valid location information.
- DebugLoc MIDL = MInsn->getDebugLoc();
- if (!MIDL.isUnknown()) {
- MIDL.getScopeAndInlinedAt(Scope, InlinedAt, Ctx);
- dbgs() << " [ ";
- if (InlinedAt)
- dbgs() << "*";
- DenseMap<const MachineInstr *, DbgScope *>::iterator DI =
- MI2ScopeMap.find(MInsn);
- if (DI != MI2ScopeMap.end()) {
- DbgScope *S = DI->second;
- dbgs() << S->getDFSIn();
- PrevDFSIn = S->getDFSIn();
- } else
- dbgs() << PrevDFSIn;
- } else
- dbgs() << " [ x" << PrevDFSIn;
- dbgs() << " ]";
- MInsn->dump();
- }
- dbgs() << "\n";
- }
-#endif
-}
-/// extractScopeInformation - Scan machine instructions in this function
-/// and collect DbgScopes. Return true, if at least one scope was found.
-bool DwarfDebug::extractScopeInformation() {
- // If scope information was extracted using .dbg intrinsics then there is not
- // any need to extract these information by scanning each instruction.
- if (!DbgScopeMap.empty())
- return false;
-
- // Scan each instruction and create scopes. First build working set of scopes.
- SmallVector<DbgRange, 4> MIRanges;
- DenseMap<const MachineInstr *, DbgScope *> MI2ScopeMap;
- DebugLoc PrevDL;
- const MachineInstr *RangeBeginMI = NULL;
- const MachineInstr *PrevMI = NULL;
- for (MachineFunction::const_iterator I = Asm->MF->begin(), E = Asm->MF->end();
- I != E; ++I) {
- for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
- II != IE; ++II) {
- const MachineInstr *MInsn = II;
-
- // Check if instruction has valid location information.
- const DebugLoc MIDL = MInsn->getDebugLoc();
- if (MIDL.isUnknown()) {
- PrevMI = MInsn;
- continue;
- }
-
- // If scope has not changed then skip this instruction.
- if (MIDL == PrevDL) {
- PrevMI = MInsn;
- continue;
- }
-
- // Ignore DBG_VALUE. It does not contribute any instruction in output.
- if (MInsn->isDebugValue())
- continue;
-
- if (RangeBeginMI) {
- // If we have alread seen a beginning of a instruction range and
- // current instruction scope does not match scope of first instruction
- // in this range then create a new instruction range.
- DEBUG(dbgs() << "Creating new instruction range :\n");
- DEBUG(dbgs() << "Begin Range at " << *RangeBeginMI);
- DEBUG(dbgs() << "End Range at " << *PrevMI);
- DEBUG(dbgs() << "Next Range starting at " << *MInsn);
- DEBUG(dbgs() << "------------------------\n");
- DbgRange R(RangeBeginMI, PrevMI);
- MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevDL);
- MIRanges.push_back(R);
- }
-
- // This is a beginning of a new instruction range.
- RangeBeginMI = MInsn;
-
- // Reset previous markers.
- PrevMI = MInsn;
- PrevDL = MIDL;
- }
- }
-
- // Create last instruction range.
- if (RangeBeginMI && PrevMI && !PrevDL.isUnknown()) {
- DbgRange R(RangeBeginMI, PrevMI);
- MIRanges.push_back(R);
- MI2ScopeMap[RangeBeginMI] = getOrCreateDbgScope(PrevDL);
- }
-
- if (!CurrentFnDbgScope)
- return false;
-
- calculateDominanceGraph(CurrentFnDbgScope);
- if (PrintDbgScope)
- printDbgScopeInfo(Asm->MF, MI2ScopeMap);
-
- // Find ranges of instructions covered by each DbgScope;
- DbgScope *PrevDbgScope = NULL;
- for (SmallVector<DbgRange, 4>::const_iterator RI = MIRanges.begin(),
- RE = MIRanges.end(); RI != RE; ++RI) {
- const DbgRange &R = *RI;
- DbgScope *S = MI2ScopeMap.lookup(R.first);
- assert (S && "Lost DbgScope for a machine instruction!");
- if (PrevDbgScope && !PrevDbgScope->dominates(S))
- PrevDbgScope->closeInsnRange(S);
- S->openInsnRange(R.first);
- S->extendInsnRange(R.second);
- PrevDbgScope = S;
- }
-
- if (PrevDbgScope)
- PrevDbgScope->closeInsnRange();
-
- identifyScopeMarkers();
-
- return !DbgScopeMap.empty();
-}
-
/// identifyScopeMarkers() -
-/// Each DbgScope has first instruction and last instruction to mark beginning
-/// and end of a scope respectively. Create an inverse map that list scopes
-/// starts (and ends) with an instruction. One instruction may start (or end)
-/// multiple scopes. Ignore scopes that are not reachable.
+/// Each LexicalScope has a first and a last instruction that mark the
+/// beginning and the end of the scope, respectively. Create an inverse map
+/// that lists the scopes starting (and ending) at each instruction. One
+/// instruction may start (or end) multiple scopes. Ignore scopes that are
+/// not reachable.
void DwarfDebug::identifyScopeMarkers() {
- SmallVector<DbgScope *, 4> WorkList;
- WorkList.push_back(CurrentFnDbgScope);
+ SmallVector<LexicalScope *, 4> WorkList;
+ WorkList.push_back(LScopes.getCurrentFunctionScope());
while (!WorkList.empty()) {
- DbgScope *S = WorkList.pop_back_val();
+ LexicalScope *S = WorkList.pop_back_val();
- const SmallVector<DbgScope *, 4> &Children = S->getScopes();
+ const SmallVector<LexicalScope *, 4> &Children = S->getChildren();
if (!Children.empty())
- for (SmallVector<DbgScope *, 4>::const_iterator SI = Children.begin(),
+ for (SmallVector<LexicalScope *, 4>::const_iterator SI = Children.begin(),
SE = Children.end(); SI != SE; ++SI)
WorkList.push_back(*SI);
if (S->isAbstractScope())
continue;
- const SmallVector<DbgRange, 4> &Ranges = S->getRanges();
+ const SmallVector<InsnRange, 4> &Ranges = S->getRanges();
if (Ranges.empty())
continue;
- for (SmallVector<DbgRange, 4>::const_iterator RI = Ranges.begin(),
+ for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
RE = Ranges.end(); RI != RE; ++RI) {
- assert(RI->first && "DbgRange does not have first instruction!");
- assert(RI->second && "DbgRange does not have second instruction!");
+ assert(RI->first && "InsnRange does not have first instruction!");
+ assert(RI->second && "InsnRange does not have second instruction!");
requestLabelBeforeInsn(RI->first);
requestLabelAfterInsn(RI->second);
}
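
identifyScopeMarkers is an explicit-stack traversal that requests a label at each end of every concrete range. A self-contained model:

#include <cstddef>
#include <vector>

struct InsnRange { int First, Last; };
struct LScope {
  bool Abstract;
  std::vector<LScope *> Children;
  std::vector<InsnRange> Ranges;
};

void identifyScopeMarkers(LScope *Root, std::vector<int> &LabelBeforeInsn,
                          std::vector<int> &LabelAfterInsn) {
  std::vector<LScope *> Work;
  Work.push_back(Root);
  while (!Work.empty()) {
    LScope *S = Work.back();
    Work.pop_back();
    for (std::size_t i = 0; i != S->Children.size(); ++i)
      Work.push_back(S->Children[i]);    // visit nested scopes too
    if (S->Abstract)
      continue;                          // abstract scopes own no machine code
    for (std::size_t i = 0; i != S->Ranges.size(); ++i) {
      LabelBeforeInsn.push_back(S->Ranges[i].First);
      LabelAfterInsn.push_back(S->Ranges[i].Last);
    }
  }
}
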
@@ -1819,7 +1112,9 @@ static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
/// emitted immediately after the function entry point.
void DwarfDebug::beginFunction(const MachineFunction *MF) {
if (!MMI->hasDebugInfo()) return;
- if (!extractScopeInformation()) return;
+ LScopes.initialize(*MF);
+ if (LScopes.empty()) return;
+ identifyScopeMarkers();
FunctionBeginSym = Asm->GetTempSymbol("func_begin",
Asm->getFunctionNumber());
@@ -1953,7 +1248,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
const MachineInstr *Prev = History.back();
if (Prev->isDebugValue() && isDbgValueInDefinedReg(Prev)) {
const MachineBasicBlock *PrevMBB = Prev->getParent();
- MachineBasicBlock::const_iterator LastMI = PrevMBB->getLastNonDebugInstr();
+ MachineBasicBlock::const_iterator LastMI =
+ PrevMBB->getLastNonDebugInstr();
if (LastMI == PrevMBB->end())
// Drop DBG_VALUE for empty range.
History.pop_back();
@@ -1985,110 +1281,73 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
}
}
+void DwarfDebug::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+  ScopeVariables[LS].push_back(Var);
+}
+
/// endFunction - Gather and emit post-function debug information.
///
void DwarfDebug::endFunction(const MachineFunction *MF) {
- if (!MMI->hasDebugInfo() || DbgScopeMap.empty()) return;
-
- if (CurrentFnDbgScope) {
+ if (!MMI->hasDebugInfo() || LScopes.empty()) return;
- // Define end label for subprogram.
- FunctionEndSym = Asm->GetTempSymbol("func_end",
- Asm->getFunctionNumber());
- // Assumes in correct section after the entry point.
- Asm->OutStreamer.EmitLabel(FunctionEndSym);
-
- SmallPtrSet<const MDNode *, 16> ProcessedVars;
- collectVariableInfo(MF, ProcessedVars);
-
- // Construct abstract scopes.
- for (SmallVector<DbgScope *, 4>::iterator AI = AbstractScopesList.begin(),
- AE = AbstractScopesList.end(); AI != AE; ++AI) {
- DISubprogram SP((*AI)->getScopeNode());
- if (SP.Verify()) {
- // Collect info for variables that were optimized out.
- StringRef FName = SP.getLinkageName();
- if (FName.empty())
- FName = SP.getName();
- if (NamedMDNode *NMD =
- getFnSpecificMDNode(*(MF->getFunction()->getParent()), FName)) {
- for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
- DIVariable DV(cast<MDNode>(NMD->getOperand(i)));
- if (!DV || !ProcessedVars.insert(DV))
- continue;
- DbgScope *Scope = AbstractScopes.lookup(DV.getContext());
- if (Scope)
- Scope->addVariable(new DbgVariable(DV));
- }
- }
+ // Define end label for subprogram.
+ FunctionEndSym = Asm->GetTempSymbol("func_end",
+ Asm->getFunctionNumber());
+ // Assumes in correct section after the entry point.
+ Asm->OutStreamer.EmitLabel(FunctionEndSym);
+
+ SmallPtrSet<const MDNode *, 16> ProcessedVars;
+ collectVariableInfo(MF, ProcessedVars);
+
+ LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+ CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
+ assert (TheCU && "Unable to find compile unit!");
+
+ // Construct abstract scopes.
+ ArrayRef<LexicalScope *> AList = LScopes.getAbstractScopesList();
+ for (unsigned i = 0, e = AList.size(); i != e; ++i) {
+ LexicalScope *AScope = AList[i];
+ DISubprogram SP(AScope->getScopeNode());
+ if (SP.Verify()) {
+ // Collect info for variables that were optimized out.
+ DIArray Variables = SP.getVariables();
+ for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
+ DIVariable DV(Variables.getElement(i));
+ if (!DV || !DV.Verify() || !ProcessedVars.insert(DV))
+ continue;
+ if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext()))
+ addScopeVariable(Scope, new DbgVariable(DV, NULL));
}
- if (ProcessedSPNodes.count((*AI)->getScopeNode()) == 0)
- constructScopeDIE(*AI);
}
-
- DIE *CurFnDIE = constructScopeDIE(CurrentFnDbgScope);
-
- if (!DisableFramePointerElim(*MF))
- getCompileUnit(CurrentFnDbgScope->getScopeNode())->addUInt(CurFnDIE,
- dwarf::DW_AT_APPLE_omit_frame_ptr,
- dwarf::DW_FORM_flag, 1);
-
-
- DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
- MMI->getFrameMoves()));
+ if (ProcessedSPNodes.count(AScope->getScopeNode()) == 0)
+ constructScopeDIE(TheCU, AScope);
}
+
+ DIE *CurFnDIE = constructScopeDIE(TheCU, FnScope);
+
+ if (!DisableFramePointerElim(*MF))
+ TheCU->addUInt(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr,
+ dwarf::DW_FORM_flag, 1);
+
+ DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
+ MMI->getFrameMoves()));
// Clear debug info
- CurrentFnDbgScope = NULL;
+ for (DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> >::iterator
+ I = ScopeVariables.begin(), E = ScopeVariables.end(); I != E; ++I)
+ DeleteContainerPointers(I->second);
+ ScopeVariables.clear();
DeleteContainerPointers(CurrentFnArguments);
- DbgVariableToFrameIndexMap.clear();
- VarToAbstractVarMap.clear();
- DbgVariableToDbgInstMap.clear();
- InlinedDbgScopeMap.clear();
- DeleteContainerSeconds(DbgScopeMap);
UserVariables.clear();
DbgValues.clear();
- DeleteContainerSeconds(AbstractScopes);
- AbstractScopesList.clear();
AbstractVariables.clear();
LabelsBeforeInsn.clear();
LabelsAfterInsn.clear();
PrevLabel = NULL;
}
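
The new teardown deletes the DbgVariable pointers owned by ScopeVariables before clearing the map, which is all DeleteContainerPointers does per entry. In miniature, with int keys replacing LexicalScope*:

#include <cstddef>
#include <map>
#include <vector>

struct DbgVar {};
typedef std::map<int, std::vector<DbgVar *> > ScopeVarMap;

void clearScopeVariables(ScopeVarMap &ScopeVariables) {
  for (ScopeVarMap::iterator I = ScopeVariables.begin(),
                             E = ScopeVariables.end(); I != E; ++I)
    for (std::size_t i = 0; i != I->second.size(); ++i)
      delete I->second[i];           // per-scope DeleteContainerPointers
  ScopeVariables.clear();
}
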
-/// recordVariableFrameIndex - Record a variable's index.
-void DwarfDebug::recordVariableFrameIndex(const DbgVariable *V, int Index) {
- assert (V && "Invalid DbgVariable!");
- DbgVariableToFrameIndexMap[V] = Index;
-}
-
-/// findVariableFrameIndex - Return true if frame index for the variable
-/// is found. Update FI to hold value of the index.
-bool DwarfDebug::findVariableFrameIndex(const DbgVariable *V, int *FI) {
- assert (V && "Invalid DbgVariable!");
- DenseMap<const DbgVariable *, int>::iterator I =
- DbgVariableToFrameIndexMap.find(V);
- if (I == DbgVariableToFrameIndexMap.end())
- return false;
- *FI = I->second;
- return true;
-}
-
-/// findDbgScope - Find DbgScope for the debug loc.
-DbgScope *DwarfDebug::findDbgScope(DebugLoc DL) {
- if (DL.isUnknown())
- return NULL;
-
- DbgScope *Scope = NULL;
- LLVMContext &Ctx = Asm->MF->getFunction()->getContext();
- if (MDNode *IA = DL.getInlinedAt(Ctx))
- Scope = InlinedDbgScopeMap.lookup(DebugLoc::getFromDILocation(IA));
- else
- Scope = DbgScopeMap.lookup(DL.getScope(Ctx));
- return Scope;
-}
-
-
/// recordSourceLine - Register a source line with debug info. Returns the
/// unique label that was emitted and which provides correspondence to
/// the source line list.
@@ -2112,6 +1371,10 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
DISubprogram SP(S);
Fn = SP.getFilename();
Dir = SP.getDirectory();
+ } else if (Scope.isLexicalBlockFile()) {
+ DILexicalBlockFile DBF(S);
+ Fn = DBF.getFilename();
+ Dir = DBF.getDirectory();
} else if (Scope.isLexicalBlock()) {
DILexicalBlock DB(S);
Fn = DB.getFilename();
@@ -2121,8 +1384,7 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
Src = GetOrCreateSourceID(Fn, Dir);
}
- Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, Flags,
- 0, 0, Fn);
+ Asm->OutStreamer.EmitDwarfLocDirective(Src, Line, Col, Flags, 0, 0, Fn);
}
//===----------------------------------------------------------------------===//
@@ -2235,7 +1497,7 @@ void DwarfDebug::EmitSectionLabels() {
EmitSectionSym(Asm, TLOF.getDataSection());
}
-/// emitDIE - Recusively Emits a debug information entry.
+/// emitDIE - Recursively emits a debug information entry.
///
void DwarfDebug::emitDIE(DIE *Die) {
// Get the abbreviation for this DIE.
@@ -2290,10 +1552,9 @@ void DwarfDebug::emitDIE(DIE *Die) {
break;
}
case dwarf::DW_AT_location: {
- if (UseDotDebugLocEntry.count(Die) != 0) {
- DIELabel *L = cast<DIELabel>(Values[i]);
+ if (DIELabel *L = dyn_cast<DIELabel>(Values[i]))
Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4);
- } else
+ else
Values[i]->EmitValue(Asm, Form);
break;
}
@@ -2464,7 +1725,7 @@ void DwarfDebug::emitDebugPubNames() {
Asm->OutStreamer.AddComment("End Mark");
Asm->EmitInt32(0);
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_end",
- TheCU->getID()));
+ TheCU->getID()));
}
}
@@ -2499,7 +1760,7 @@ void DwarfDebug::emitDebugPubTypes() {
for (StringMap<DIE*>::const_iterator
GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
const char *Name = GI->getKeyData();
- DIE * Entity = GI->second;
+ DIE *Entity = GI->second;
if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
Asm->EmitInt32(Entity->getOffset());
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index b245006..35653be 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -15,7 +15,8 @@
#define CODEGEN_ASMPRINTER_DWARFDEBUG_H__
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/MC/MachineLocation.h"
#include "llvm/Analysis/DebugInfo.h"
#include "DIE.h"
#include "llvm/ADT/DenseMap.h"
@@ -30,7 +31,6 @@ namespace llvm {
class CompileUnit;
class DbgConcreteScope;
-class DbgScope;
class DbgVariable;
class MachineFrameInfo;
class MachineModuleInfo;
@@ -125,9 +125,14 @@ class DbgVariable {
DIVariable Var; // Variable Descriptor.
DIE *TheDIE; // Variable DIE.
unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries.
+ DbgVariable *AbsVar; // Corresponding Abstract variable, if any.
+ const MachineInstr *MInsn; // DBG_VALUE instruction of the variable.
+ int FrameIndex;
public:
// AbsVar may be NULL.
- DbgVariable(DIVariable V) : Var(V), TheDIE(0), DotDebugLocOffset(~0U) {}
+ DbgVariable(DIVariable V, DbgVariable *AV)
+ : Var(V), TheDIE(0), DotDebugLocOffset(~0U), AbsVar(AV), MInsn(0),
+ FrameIndex(~0) {}
// Accessors.
DIVariable getVariable() const { return Var; }
@@ -136,7 +141,27 @@ public:
void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; }
unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; }
StringRef getName() const { return Var.getName(); }
- unsigned getTag() const { return Var.getTag(); }
+ DbgVariable *getAbstractVariable() const { return AbsVar; }
+ const MachineInstr *getMInsn() const { return MInsn; }
+ void setMInsn(const MachineInstr *M) { MInsn = M; }
+ int getFrameIndex() const { return FrameIndex; }
+ void setFrameIndex(int FI) { FrameIndex = FI; }
+ // Translate tag to proper Dwarf tag.
+ unsigned getTag() const {
+ if (Var.getTag() == dwarf::DW_TAG_arg_variable)
+ return dwarf::DW_TAG_formal_parameter;
+
+ return dwarf::DW_TAG_variable;
+ }
+ /// isArtificial - Return true if DbgVariable is artificial.
+ bool isArtificial() const {
+ if (Var.isArtificial())
+ return true;
+ if (Var.getTag() == dwarf::DW_TAG_arg_variable
+ && getType().isArtificial())
+ return true;
+ return false;
+ }
bool variableHasComplexAddress() const {
assert(Var.Verify() && "Invalid complex DbgVariable!");
return Var.hasComplexAddress();
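
The header now folds the old tag-translation switch and the artificiality test into DbgVariable itself. A toy model of those two accessors (the constants are illustrative, not the real DWARF values):

#include <cstdio>

enum { TAG_arg_variable = 1, TAG_auto_variable = 2,
       TAG_formal_parameter = 3, TAG_variable = 4 };

struct Var {
  int DeclTag;
  bool ArtificialFlag, TypeIsArtificial;

  // Declared arguments become formal parameters; everything else,
  // including auto variables, becomes a plain variable.
  int getTag() const {
    return DeclTag == TAG_arg_variable ? TAG_formal_parameter : TAG_variable;
  }
  // Artificial if flagged directly, or an argument of artificial type
  // (e.g. an implicit this pointer).
  bool isArtificial() const {
    return ArtificialFlag ||
           (DeclTag == TAG_arg_variable && TypeIsArtificial);
  }
};

int main() {
  Var ThisPtr = {TAG_arg_variable, false, true};
  std::printf("tag=%d artificial=%d\n", ThisPtr.getTag(),
              ThisPtr.isArtificial());
}
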
@@ -167,8 +192,13 @@ class DwarfDebug {
//
CompileUnit *FirstCU;
+
+ /// Maps MDNode with its corresponding CompileUnit.
DenseMap <const MDNode *, CompileUnit *> CUMap;
+ /// Maps subprogram MDNode with its corresponding CompileUnit.
+ DenseMap <const MDNode *, CompileUnit *> SPMap;
+
/// AbbreviationsSet - Used to uniquely define abbreviations.
///
FoldingSet<DIEAbbrev> AbbreviationsSet;
@@ -192,63 +222,27 @@ class DwarfDebug {
///
UniqueVector<const MCSection*> SectionMap;
- /// CurrentFnDbgScope - Top level scope for the current function.
- ///
- DbgScope *CurrentFnDbgScope;
-
/// CurrentFnArguments - List of Arguments (DbgValues) for current function.
SmallVector<DbgVariable *, 8> CurrentFnArguments;
- /// DbgScopeMap - Tracks the scopes in the current function. Owns the
- /// contained DbgScope*s.
- DenseMap<const MDNode *, DbgScope *> DbgScopeMap;
-
- /// InlinedDbgScopeMap - Tracks inlined function scopes in current function.
- DenseMap<DebugLoc, DbgScope *> InlinedDbgScopeMap;
-
- /// AbstractScopes - Tracks the abstract scopes a module. These scopes are
- /// not included DbgScopeMap. AbstractScopes owns its DbgScope*s.
- DenseMap<const MDNode *, DbgScope *> AbstractScopes;
+ LexicalScopes LScopes;
/// AbstractSPDies - Collection of abstract subprogram DIEs.
DenseMap<const MDNode *, DIE *> AbstractSPDies;
- /// AbstractScopesList - Tracks abstract scopes constructed while processing
- /// a function. This list is cleared during endFunction().
- SmallVector<DbgScope *, 4>AbstractScopesList;
+ /// ScopeVariables - Collection of dbg variables of a scope.
+ DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> > ScopeVariables;
- /// AbstractVariables - Collection on abstract variables. Owned by the
- /// DbgScopes in AbstractScopes.
+ /// AbstractVariables - Collection on abstract variables.
DenseMap<const MDNode *, DbgVariable *> AbstractVariables;
- /// DbgVariableToFrameIndexMap - Tracks frame index used to find
- /// variable's value.
- DenseMap<const DbgVariable *, int> DbgVariableToFrameIndexMap;
-
- /// DbgVariableToDbgInstMap - Maps DbgVariable to corresponding DBG_VALUE
- /// machine instruction.
- DenseMap<const DbgVariable *, const MachineInstr *> DbgVariableToDbgInstMap;
-
/// DotDebugLocEntries - Collection of DotDebugLocEntry.
SmallVector<DotDebugLocEntry, 4> DotDebugLocEntries;
- /// UseDotDebugLocEntry - DW_AT_location attributes for the DIEs in this set
- /// idetifies corresponding .debug_loc entry offset.
- SmallPtrSet<const DIE *, 4> UseDotDebugLocEntry;
-
- /// VarToAbstractVarMap - Maps DbgVariable with corresponding Abstract
- /// DbgVariable, if any.
- DenseMap<const DbgVariable *, const DbgVariable *> VarToAbstractVarMap;
-
  /// InlinedSubprogramDIEs - Collection of subprogram DIEs that are marked
/// (at the end of the module) as DW_AT_inline.
SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
- /// ContainingTypeMap - This map is used to keep track of subprogram DIEs that
- /// need DW_AT_containing_type attribute. This attribute points to a DIE that
- /// corresponds to the MDNode mapped with the subprogram DIE.
- DenseMap<DIE *, const MDNode *> ContainingTypeMap;
-
/// InlineInfo - Keep track of inlined functions and their location. This
/// information is used to populate debug_inlined section.
typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
@@ -316,10 +310,7 @@ private:
///
void assignAbbrevNumber(DIEAbbrev &Abbrev);
- /// getOrCreateDbgScope - Create DbgScope for the scope.
- DbgScope *getOrCreateDbgScope(DebugLoc DL);
-
- DbgScope *getOrCreateAbstractScope(const MDNode *N);
+ void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
/// findAbstractVariable - Find abstract variable associated with Var.
DbgVariable *findAbstractVariable(DIVariable &Var, DebugLoc Loc);
@@ -328,22 +319,22 @@ private:
/// attach appropriate DW_AT_low_pc and DW_AT_high_pc attributes.
/// If there are global variables in this scope then create and insert
/// DIEs for these variables.
- DIE *updateSubprogramScopeDIE(const MDNode *SPNode);
+ DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, const MDNode *SPNode);
/// constructLexicalScope - Construct new DW_TAG_lexical_block
/// for this scope and attach DW_AT_low_pc/DW_AT_high_pc labels.
- DIE *constructLexicalScopeDIE(DbgScope *Scope);
+ DIE *constructLexicalScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
/// constructInlinedScopeDIE - This scope represents inlined body of
/// a function. Construct DIE to represent this concrete inlined copy
/// of the function.
- DIE *constructInlinedScopeDIE(DbgScope *Scope);
+ DIE *constructInlinedScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
/// constructVariableDIE - Construct a DIE for the given DbgVariable.
- DIE *constructVariableDIE(DbgVariable *DV, DbgScope *S);
+ DIE *constructVariableDIE(DbgVariable *DV, LexicalScope *S);
/// constructScopeDIE - Construct a DIE for this scope.
- DIE *constructScopeDIE(DbgScope *Scope);
+ DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
/// EmitSectionLabels - Emit initial Dwarf sections with a label at
/// the start of each one.
@@ -424,16 +415,10 @@ private:
/// constructCompileUnit - Create new CompileUnit for the given
/// metadata node with tag DW_TAG_compile_unit.
- void constructCompileUnit(const MDNode *N);
-
- /// getCompielUnit - Get CompileUnit DIE.
- CompileUnit *getCompileUnit(const MDNode *N) const;
-
- /// constructGlobalVariableDIE - Construct global variable DIE.
- void constructGlobalVariableDIE(const MDNode *N);
+ CompileUnit *constructCompileUnit(const MDNode *N);
/// construct SubprogramDIE - Construct subprogram DIE.
- void constructSubprogramDIE(const MDNode *N);
+ void constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N);
/// recordSourceLine - Register a source line with debug info. Returns the
/// unique label that was emitted and which provides correspondence to
@@ -441,30 +426,16 @@ private:
void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
unsigned Flags);
- /// recordVariableFrameIndex - Record a variable's index.
- void recordVariableFrameIndex(const DbgVariable *V, int Index);
-
- /// findVariableFrameIndex - Return true if frame index for the variable
- /// is found. Update FI to hold value of the index.
- bool findVariableFrameIndex(const DbgVariable *V, int *FI);
-
- /// findDbgScope - Find DbgScope for the debug loc.
- DbgScope *findDbgScope(DebugLoc DL);
-
/// identifyScopeMarkers() - Identify instructions that mark the
/// beginning or end of a scope.
void identifyScopeMarkers();
- /// extractScopeInformation - Scan machine instructions in this function
- /// and collect DbgScopes. Return true, if atleast one scope was found.
- bool extractScopeInformation();
-
/// addCurrentFnArgument - If Var is a current function argument then add
/// it to the CurrentFnArguments list.
bool addCurrentFnArgument(const MachineFunction *MF,
- DbgVariable *Var, DbgScope *Scope);
+ DbgVariable *Var, LexicalScope *Scope);
- /// collectVariableInfo - Populate DbgScope entries with variables' info.
+ /// collectVariableInfo - Populate LexicalScope entries with variables' info.
void collectVariableInfo(const MachineFunction *,
SmallPtrSet<const MDNode *, 16> &ProcessedVars);
@@ -496,6 +467,14 @@ public:
DwarfDebug(AsmPrinter *A, Module *M);
~DwarfDebug();
+ /// collectInfoFromNamedMDNodes - Collect debug info from named mdnodes such
+ /// as llvm.dbg.enum and llvm.dbg.ty
+ void collectInfoFromNamedMDNodes(Module *M);
+
+ /// collectLegacyDebugInfo - Collect debug info using DebugInfoFinder.
+ /// FIXME - Remove this when dragon-egg and llvm-gcc switch to DIBuilder.
+ bool collectLegacyDebugInfo(Module *M);
+
/// beginModule - Emit all Dwarf sections that should come prior to the
/// content.
void beginModule(Module *M);
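The net effect of this DwarfDebug.h hunk is that a half-dozen hand-maintained DbgScope maps with fuzzy ownership collapse into one LexicalScopes analysis plus a single ScopeVariables side table. A minimal standalone sketch of that ownership pattern, with hypothetical names rather than the real LLVM classes:

#include <map>
#include <memory>
#include <string>
#include <vector>

struct Scope { std::string Name; };
struct Variable { std::string Name; };

// One analysis owns every scope; clients no longer juggle parallel maps of
// raw scope pointers with unclear ownership.
class LexicalScopesSketch {
  std::map<std::string, std::unique_ptr<Scope> > Scopes;
public:
  Scope *getOrCreate(const std::string &N) {
    std::unique_ptr<Scope> &S = Scopes[N];
    if (!S) S.reset(new Scope{N});
    return S.get();
  }
};

int main() {
  LexicalScopesSketch LS;
  // Per-scope data lives in a side table keyed by non-owning pointers,
  // mirroring the ScopeVariables map added above.
  std::map<Scope *, std::vector<Variable> > ScopeVariables;
  Scope *Fn = LS.getOrCreate("function");
  ScopeVariables[Fn].push_back(Variable{"x"});
  return ScopeVariables[Fn].size() == 1 ? 0 : 1;
}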
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp
index 1f992fa..18b726b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -527,29 +526,26 @@ void DwarfException::EmitExceptionTable() {
I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
const CallSiteEntry &S = *I;
+ // Offset of the landing pad, counted in 16-byte bundles relative to the
+ // @LPStart address.
if (VerboseAsm) {
- // Emit comments that decode the call site.
Asm->OutStreamer.AddComment(Twine(">> Call Site ") +
llvm::utostr(idx) + " <<");
Asm->OutStreamer.AddComment(Twine(" On exception at call site ") +
llvm::utostr(idx));
+ }
+ Asm->EmitULEB128(idx);
+ // Offset of the first associated action record, relative to the start of
+ // the action table. This value is biased by 1 (1 indicates the start of
+ // the action table), and 0 indicates that there are no actions.
+ if (VerboseAsm) {
if (S.Action == 0)
Asm->OutStreamer.AddComment(" Action: cleanup");
else
Asm->OutStreamer.AddComment(Twine(" Action: ") +
llvm::utostr((S.Action - 1) / 2 + 1));
-
- Asm->OutStreamer.AddBlankLine();
}
-
- // Offset of the landing pad, counted in 16-byte bundles relative to the
- // @LPStart address.
- Asm->EmitULEB128(idx);
-
- // Offset of the first associated action record, relative to the start of
- // the action table. This value is biased by 1 (1 indicates the start of
- // the action table), and 0 indicates that there are no actions.
Asm->EmitULEB128(S.Action);
}
} else {
@@ -595,46 +591,43 @@ void DwarfException::EmitExceptionTable() {
if (EndLabel == 0)
EndLabel = Asm->GetTempSymbol("eh_func_end", Asm->getFunctionNumber());
- if (VerboseAsm) {
- // Emit comments that decode the call site.
- Asm->OutStreamer.AddComment(Twine(">> Call Site ") +
- llvm::utostr(++Entry) + " <<");
- Asm->OutStreamer.AddComment(Twine(" Call between ") +
- BeginLabel->getName() + " and " +
- EndLabel->getName());
-
- if (!S.PadLabel) {
- Asm->OutStreamer.AddComment(" has no landing pad");
- } else {
- Asm->OutStreamer.AddComment(Twine(" jumps to ") +
- S.PadLabel->getName());
-
- if (S.Action == 0)
- Asm->OutStreamer.AddComment(" On action: cleanup");
- else
- Asm->OutStreamer.AddComment(Twine(" On action: ") +
- llvm::utostr((S.Action - 1) / 2 + 1));
- }
-
- Asm->OutStreamer.AddBlankLine();
- }
// Offset of the call site relative to the previous call site, counted in
// number of 16-byte bundles. The first call site is counted relative to
// the start of the procedure fragment.
+ if (VerboseAsm)
+ Asm->OutStreamer.AddComment(Twine(">> Call Site ") +
+ llvm::utostr(++Entry) + " <<");
Asm->EmitLabelDifference(BeginLabel, EHFuncBeginSym, 4);
+ if (VerboseAsm)
+ Asm->OutStreamer.AddComment(Twine(" Call between ") +
+ BeginLabel->getName() + " and " +
+ EndLabel->getName());
Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
// Offset of the landing pad, counted in 16-byte bundles relative to the
// @LPStart address.
- if (!S.PadLabel)
+ if (!S.PadLabel) {
+ if (VerboseAsm)
+ Asm->OutStreamer.AddComment(" has no landing pad");
Asm->OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/);
- else
+ } else {
+ if (VerboseAsm)
+ Asm->OutStreamer.AddComment(Twine(" jumps to ") +
+ S.PadLabel->getName());
Asm->EmitLabelDifference(S.PadLabel, EHFuncBeginSym, 4);
+ }
// Offset of the first associated action record, relative to the start of
// the action table. This value is biased by 1 (1 indicates the start of
// the action table), and 0 indicates that there are no actions.
+ if (VerboseAsm) {
+ if (S.Action == 0)
+ Asm->OutStreamer.AddComment(" On action: cleanup");
+ else
+ Asm->OutStreamer.AddComment(Twine(" On action: ") +
+ llvm::utostr((S.Action - 1) / 2 + 1));
+ }
Asm->EmitULEB128(S.Action);
}
}
@@ -649,13 +642,29 @@ void DwarfException::EmitExceptionTable() {
// Emit comments that decode the action table.
Asm->OutStreamer.AddComment(Twine(">> Action Record ") +
llvm::utostr(++Entry) + " <<");
- if (Action.ValueForTypeID >= 0)
+ }
+
+ // Type Filter
+ //
+ // Used by the runtime to match the type of the thrown exception to the
+ // type of the catch clauses or the types in the exception specification.
+ if (VerboseAsm) {
+ if (Action.ValueForTypeID > 0)
Asm->OutStreamer.AddComment(Twine(" Catch TypeInfo ") +
llvm::itostr(Action.ValueForTypeID));
- else
+ else if (Action.ValueForTypeID < 0)
Asm->OutStreamer.AddComment(Twine(" Filter TypeInfo ") +
llvm::itostr(Action.ValueForTypeID));
+ else
+ Asm->OutStreamer.AddComment(" Cleanup");
+ }
+ Asm->EmitSLEB128(Action.ValueForTypeID);
+ // Action Record
+ //
+ // Self-relative signed displacement in bytes of the next action record,
+ // or 0 if there is no next action record.
+ if (VerboseAsm) {
if (Action.NextAction == 0) {
Asm->OutStreamer.AddComment(" No further actions");
} else {
@@ -663,20 +672,7 @@ void DwarfException::EmitExceptionTable() {
Asm->OutStreamer.AddComment(Twine(" Continue to action ") +
llvm::utostr(NextAction));
}
-
- Asm->OutStreamer.AddBlankLine();
}
-
- // Type Filter
- //
- // Used by the runtime to match the type of the thrown exception to the
- // type of the catch clauses or the types in the exception specification.
- Asm->EmitSLEB128(Action.ValueForTypeID);
-
- // Action Record
- //
- // Self-relative signed displacement in bytes of the next action record,
- // or 0 if there is no next action record.
Asm->EmitSLEB128(Action.NextAction);
}
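The EmitULEB128/EmitSLEB128 calls reordered above write the standard DWARF variable-length encodings used throughout the exception table. For readers unfamiliar with them, a minimal standalone sketch of both encoders (an illustration, not the MCStreamer API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Unsigned LEB128: 7 value bits per byte, high bit set on all but the last.
static void emitULEB128(uint64_t V, std::vector<uint8_t> &Out) {
  do {
    uint8_t B = V & 0x7f;
    V >>= 7;
    if (V) B |= 0x80;
    Out.push_back(B);
  } while (V);
}

// Signed LEB128: stop once the remaining bits match the sign extension.
static void emitSLEB128(int64_t V, std::vector<uint8_t> &Out) {
  bool More;
  do {
    uint8_t B = V & 0x7f;
    V >>= 7; // arithmetic shift preserves the sign on every common compiler
    More = !((V == 0 && !(B & 0x40)) || (V == -1 && (B & 0x40)));
    if (More) B |= 0x80;
    Out.push_back(B);
  } while (More);
}

int main() {
  std::vector<uint8_t> Out;
  emitULEB128(624485, Out); // the classic DWARF example: 0xE5 0x8E 0x26
  emitSLEB128(-2, Out);     // 0x7E
  for (unsigned i = 0; i != Out.size(); ++i) std::printf("%02x ", Out[i]);
  std::printf("\n");
  return 0;
}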
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
index c2ad5eb..b83aa5a 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 99090a8..75288b0 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -1624,26 +1624,29 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!TIB->isSafeToMove(TII, 0, DontMoveAcrossStore))
break;
+ // Remove kills from LocalDefsSet; these registers had short live ranges.
+ for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = TIB->getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || !MO.isKill())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || !LocalDefsSet.count(Reg))
+ continue;
+ for (const unsigned *OR = TRI->getOverlaps(Reg); *OR; ++OR)
+ LocalDefsSet.erase(*OR);
+ }
+
// Track local defs so we can update liveins.
for (unsigned i = 0, e = TIB->getNumOperands(); i != e; ++i) {
MachineOperand &MO = TIB->getOperand(i);
- if (!MO.isReg())
+ if (!MO.isReg() || !MO.isDef() || MO.isDead())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- if (MO.isDef()) {
- if (!MO.isDead()) {
- LocalDefs.push_back(Reg);
- LocalDefsSet.insert(Reg);
- for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- LocalDefsSet.insert(*SR);
- }
- } else if (MO.isKill() && LocalDefsSet.count(Reg)) {
- LocalDefsSet.erase(Reg);
- for (const unsigned *SR = TRI->getSubRegisters(Reg); *SR; ++SR)
- LocalDefsSet.erase(*SR);
- }
+ LocalDefs.push_back(Reg);
+ for (const unsigned *OR = TRI->getOverlaps(Reg); *OR; ++OR)
+ LocalDefsSet.insert(*OR);
}
HasDups = true;
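The BranchFolding change swaps getSubRegisters() for getOverlaps(), whose list includes the register itself and its super-registers as well. A small self-contained sketch of why that matters, using a made-up x86-like register file:

#include <cassert>
#include <map>
#include <set>
#include <string>
#include <vector>

typedef std::string Reg;

int main() {
  // Hypothetical aliasing table: AX, AH, AL are parts of EAX. Overlaps of a
  // register include itself plus everything sharing hardware bits with it.
  const std::map<Reg, std::vector<Reg> > Overlaps = {
      {"EAX", {"EAX", "AX", "AH", "AL"}},
      {"AX", {"AX", "EAX", "AH", "AL"}},
      {"AH", {"AH", "AX", "EAX"}},
      {"AL", {"AL", "AX", "EAX"}},
  };

  std::set<Reg> LocalDefsSet;
  // Defining AX must also mark its super-register EAX as locally defined;
  // walking only the sub-register list (AH, AL) would miss EAX.
  for (const Reg &O : Overlaps.at("AX"))
    LocalDefsSet.insert(O);
  assert(LocalDefsSet.count("EAX") && LocalDefsSet.count("AL"));
  return 0;
}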
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 06d2a95..9a5e551 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -13,7 +13,9 @@ add_llvm_library(LLVMCodeGen
EdgeBundles.cpp
ELFCodeEmitter.cpp
ELFWriter.cpp
+ ExecutionDepsFix.cpp
ExpandISelPseudos.cpp
+ ExpandPostRAPseudos.cpp
GCMetadata.cpp
GCMetadataPrinter.cpp
GCStrategy.cpp
@@ -23,17 +25,18 @@ add_llvm_library(LLVMCodeGen
IntrinsicLowering.cpp
LLVMTargetMachine.cpp
LatencyPriorityQueue.cpp
+ LexicalScopes.cpp
LiveDebugVariables.cpp
LiveInterval.cpp
LiveIntervalAnalysis.cpp
LiveIntervalUnion.cpp
LiveStackAnalysis.cpp
LiveVariables.cpp
+ LiveRangeCalc.cpp
LiveRangeEdit.cpp
LocalStackSlotAllocation.cpp
- LowerSubregs.cpp
MachineBasicBlock.cpp
- MachineBlockFrequency.cpp
+ MachineBlockFrequencyInfo.cpp
MachineBranchProbabilityInfo.cpp
MachineCSE.cpp
MachineDominators.cpp
@@ -97,5 +100,15 @@ add_llvm_library(LLVMCodeGen
VirtRegRewriter.cpp
)
+add_llvm_library_dependencies(LLVMCodeGen
+ LLVMAnalysis
+ LLVMCore
+ LLVMMC
+ LLVMScalarOpts
+ LLVMSupport
+ LLVMTarget
+ LLVMTransformUtils
+ )
+
add_subdirectory(SelectionDAG)
add_subdirectory(AsmPrinter)
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index e6b3bbc..ea16a25 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -185,35 +185,3 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
li.weight = normalizeSpillWeight(totalWeight, li.getSize());
}
-
-void VirtRegAuxInfo::CalculateRegClass(unsigned reg) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
- const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
- const TargetRegisterClass *OldRC = MRI.getRegClass(reg);
- const TargetRegisterClass *NewRC = TRI->getLargestLegalSuperClass(OldRC);
-
- // Stop early if there is no room to grow.
- if (NewRC == OldRC)
- return;
-
- // Accumulate constraints from all uses.
- for (MachineRegisterInfo::reg_nodbg_iterator I = MRI.reg_nodbg_begin(reg),
- E = MRI.reg_nodbg_end(); I != E; ++I) {
- // TRI doesn't have accurate enough information to model this yet.
- if (I.getOperand().getSubReg())
- return;
- // Inline asm instuctions don't remember their constraints.
- if (I->isInlineAsm())
- return;
- const TargetRegisterClass *OpRC =
- TII->getRegClass(I->getDesc(), I.getOperandNo(), TRI);
- if (OpRC)
- NewRC = getCommonSubClass(NewRC, OpRC);
- if (!NewRC || NewRC == OldRC)
- return;
- }
- DEBUG(dbgs() << "Inflating " << OldRC->getName() << ':' << PrintReg(reg)
- << " to " << NewRC->getName() <<".\n");
- MRI.setRegClass(reg, NewRC);
-}
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 489746c..424535b 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -27,6 +27,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeLiveIntervalsPass(Registry);
initializeLiveStacksPass(Registry);
initializeLiveVariablesPass(Registry);
+ initializeMachineBlockFrequencyInfoPass(Registry);
initializeMachineCSEPass(Registry);
initializeMachineDominatorTreePass(Registry);
initializeMachineLICMPass(Registry);
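initializeMachineBlockFrequencyInfoPass() follows LLVM's pattern of registering each pass with the PassRegistry exactly once at startup. A rough standalone sketch of such an initialize-once registry; the names here are hypothetical, not the PassRegistry API:

#include <cassert>
#include <functional>
#include <map>
#include <string>

class RegistrySketch {
  std::map<std::string, bool> Initialized;
public:
  void initializeOnce(const std::string &Name,
                      const std::function<void()> &Init) {
    if (Initialized[Name]) return; // each pass initializes exactly once
    Initialized[Name] = true;
    Init();
  }
};

int main() {
  RegistrySketch R;
  int Calls = 0;
  for (int i = 0; i < 2; ++i)
    R.initializeOnce("MachineBlockFrequencyInfo", [&Calls] { ++Calls; });
  assert(Calls == 1);
  return 0;
}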
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 03604b0..ed9e409 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -63,6 +63,8 @@ namespace {
typedef SmallPtrSet<BasicBlock*, 8> BBSet;
BBSet LandingPads;
+ bool InsertUnwindResumeCalls();
+
bool NormalizeLandingPads();
bool LowerUnwindsAndResumes();
bool MoveExceptionValueCalls();
@@ -658,13 +660,76 @@ Instruction *DwarfEHPrepare::CreateExceptionValueCall(BasicBlock *BB) {
return CallInst::Create(ExceptionValueIntrinsic, "eh.value.call", Start);
}
+/// InsertUnwindResumeCalls - Convert the ResumeInsts that are still present
+/// into calls to the appropriate _Unwind_Resume function.
+bool DwarfEHPrepare::InsertUnwindResumeCalls() {
+ bool UsesNewEH = false;
+ SmallVector<ResumeInst*, 16> Resumes;
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+ TerminatorInst *TI = I->getTerminator();
+ if (ResumeInst *RI = dyn_cast<ResumeInst>(TI))
+ Resumes.push_back(RI);
+ else if (InvokeInst *II = dyn_cast<InvokeInst>(TI))
+ UsesNewEH = II->getUnwindDest()->isLandingPad();
+ }
+
+ if (Resumes.empty())
+ return UsesNewEH;
+
+ // Find the rewind function if we haven't already.
+ if (!RewindFunction) {
+ LLVMContext &Ctx = Resumes[0]->getContext();
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
+ Type::getInt8PtrTy(Ctx), false);
+ const char *RewindName = TLI->getLibcallName(RTLIB::UNWIND_RESUME);
+ RewindFunction = F->getParent()->getOrInsertFunction(RewindName, FTy);
+ }
+
+ // Create the basic block where the _Unwind_Resume call will live.
+ LLVMContext &Ctx = F->getContext();
+ BasicBlock *UnwindBB = BasicBlock::Create(Ctx, "unwind_resume", F);
+ PHINode *PN = PHINode::Create(Type::getInt8PtrTy(Ctx), Resumes.size(),
+ "exn.obj", UnwindBB);
+
+ // Extract the exception object from the ResumeInst and add it to the PHI node
+ // that feeds the _Unwind_Resume call.
+ BasicBlock *UnwindBBDom = Resumes[0]->getParent();
+ for (SmallVectorImpl<ResumeInst*>::iterator
+ I = Resumes.begin(), E = Resumes.end(); I != E; ++I) {
+ ResumeInst *RI = *I;
+ BranchInst::Create(UnwindBB, RI->getParent());
+ ExtractValueInst *ExnObj = ExtractValueInst::Create(RI->getOperand(0),
+ 0, "exn.obj", RI);
+ PN->addIncoming(ExnObj, RI->getParent());
+ UnwindBBDom = DT->findNearestCommonDominator(RI->getParent(), UnwindBBDom);
+ RI->eraseFromParent();
+ }
+
+ // Call the function.
+ CallInst *CI = CallInst::Create(RewindFunction, PN, "", UnwindBB);
+ CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME));
+
+ // We never expect _Unwind_Resume to return.
+ new UnreachableInst(Ctx, UnwindBB);
+
+ // Now update DominatorTree analysis information.
+ DT->addNewBlock(UnwindBB, UnwindBBDom);
+ return true;
+}
+
bool DwarfEHPrepare::runOnFunction(Function &Fn) {
bool Changed = false;
// Initialize internal state.
- DT = &getAnalysis<DominatorTree>();
+ DT = &getAnalysis<DominatorTree>(); // FIXME: We won't need this with the new EH.
F = &Fn;
+ if (InsertUnwindResumeCalls()) {
+ // FIXME: The rest of this function can go once the new EH is done.
+ LandingPads.clear();
+ return true;
+ }
+
// Ensure that only unwind edges end at landing pads (a landing pad is a
// basic block where an invoke unwind edge ends).
Changed |= NormalizeLandingPads();
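InsertUnwindResumeCalls() above rewrites every resume point to feed its exception object into one PHI and branch to a single shared block, so _Unwind_Resume is materialized once per function. A C-level caricature of the shape of that rewrite, where the goto/label pair plays the role of the shared UnwindBB and its PHI:

#include <cstdio>

static void unwindResume(void *Exn) { std::printf("resume %p\n", Exn); }

// Before: every resume site calls the runtime directly, duplicating the
// call and its setup at each site.
static void before(void *A, void *B, bool TakeFirst) {
  if (TakeFirst) { unwindResume(A); return; }
  unwindResume(B);
}

// After: each site stores into Exn (the "exn.obj" PHI) and jumps to one
// shared resume block, like the UnwindBB built above.
static void after(void *A, void *B, bool TakeFirst) {
  void *Exn;
  if (TakeFirst) { Exn = A; goto unwind_resume; }
  Exn = B;
unwind_resume:
  unwindResume(Exn); // single call site; in the pass, control never returns
}

int main() {
  int X, Y;
  before(&X, &Y, true);
  after(&X, &Y, false);
  return 0;
}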
diff --git a/lib/CodeGen/ELFCodeEmitter.cpp b/lib/CodeGen/ELFCodeEmitter.cpp
index 3fb087c..660424c3 100644
--- a/lib/CodeGen/ELFCodeEmitter.cpp
+++ b/lib/CodeGen/ELFCodeEmitter.cpp
@@ -155,7 +155,7 @@ void ELFCodeEmitter::emitConstantPool(MachineConstantPool *MCP) {
CPSections.push_back(CstPool.SectionIdx);
if (CPE.isMachineConstantPoolEntry())
- assert("CPE.isMachineConstantPoolEntry not supported yet");
+ assert(0 && "CPE.isMachineConstantPoolEntry not supported yet");
// Emit the constant to constant pool section
EW.EmitGlobalConstant(CPE.Val.ConstVal, CstPool);
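The assert() fixes in this file and in ELFCodeEmitter.h below address a classic C pitfall: a string literal decays to a non-null pointer, so assert("...") is always true and can never fire. A two-line demonstration:

#include <cassert>

int main() {
  assert("always true: a string literal is a non-null pointer");
  // assert(0 && "fires: the conjunction is false, and the message prints");
  return 0;
}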
diff --git a/lib/CodeGen/ELFCodeEmitter.h b/lib/CodeGen/ELFCodeEmitter.h
index 2ec1f6e..8671c67 100644
--- a/lib/CodeGen/ELFCodeEmitter.h
+++ b/lib/CodeGen/ELFCodeEmitter.h
@@ -58,13 +58,13 @@ namespace llvm {
/// emitLabel - Emits a label
virtual void emitLabel(MCSymbol *Label) {
- assert("emitLabel not implemented");
+ assert(0 && "emitLabel not implemented");
}
/// getLabelAddress - Return the address of the specified LabelID,
/// only usable after the LabelID has been emitted.
virtual uintptr_t getLabelAddress(MCSymbol *Label) const {
- assert("getLabelAddress not implemented");
+ assert(0 && "getLabelAddress not implemented");
return 0;
}
diff --git a/lib/CodeGen/ELFWriter.cpp b/lib/CodeGen/ELFWriter.cpp
index d977651..f2c2185 100644
--- a/lib/CodeGen/ELFWriter.cpp
+++ b/lib/CodeGen/ELFWriter.cpp
@@ -45,12 +45,12 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetELFWriterInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -65,7 +65,8 @@ char ELFWriter::ID = 0;
ELFWriter::ELFWriter(raw_ostream &o, TargetMachine &tm)
: MachineFunctionPass(ID), O(o), TM(tm),
- OutContext(*new MCContext(*TM.getMCAsmInfo(), new TargetAsmInfo(tm))),
+ OutContext(*new MCContext(*TM.getMCAsmInfo(), *TM.getRegisterInfo(),
+ &TM.getTargetLowering()->getObjFileLowering())),
TLOF(TM.getTargetLowering()->getObjFileLowering()),
is64Bit(TM.getTargetData()->getPointerSizeInBits() == 64),
isLittleEndian(TM.getTargetData()->isLittleEndian()),
@@ -482,7 +483,7 @@ void ELFWriter::EmitGlobalConstant(const Constant *CV, ELFSection &GblS) {
EmitGlobalConstantLargeInt(CI, GblS);
return;
} else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) {
- const VectorType *PTy = CP->getType();
+ VectorType *PTy = CP->getType();
for (unsigned I = 0, E = PTy->getNumElements(); I < E; ++I)
EmitGlobalConstant(CP->getOperand(I), GblS);
return;
@@ -540,8 +541,7 @@ CstExprResTy ELFWriter::ResolveConstantExpr(const Constant *CV) {
case Instruction::GetElementPtr: {
const Constant *ptrVal = CE->getOperand(0);
SmallVector<Value*, 8> idxVec(CE->op_begin()+1, CE->op_end());
- int64_t Offset = TD->getIndexedOffset(ptrVal->getType(), &idxVec[0],
- idxVec.size());
+ int64_t Offset = TD->getIndexedOffset(ptrVal->getType(), idxVec);
return std::make_pair(ptrVal, Offset);
}
case Instruction::IntToPtr: {
@@ -552,7 +552,7 @@ CstExprResTy ELFWriter::ResolveConstantExpr(const Constant *CV) {
}
case Instruction::PtrToInt: {
Constant *Op = CE->getOperand(0);
- const Type *Ty = CE->getType();
+ Type *Ty = CE->getType();
// We can emit the pointer value into this slot if the slot is an
// integer slot greater than or equal to the size of the pointer.
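The getIndexedOffset() change is part of the LLVM 3.0-era migration from (pointer, length) argument pairs to ArrayRef, which converts implicitly from containers. A simplified stand-in showing why call sites get shorter (not the real llvm::ArrayRef):

#include <cstddef>
#include <vector>

// Non-owning (pointer, length) view with implicit container conversion.
template <typename T> class ArrayRefSketch {
  const T *Data;
  std::size_t Length;
public:
  ArrayRefSketch(const std::vector<T> &V) : Data(V.data()), Length(V.size()) {}
  ArrayRefSketch(const T *D, std::size_t N) : Data(D), Length(N) {}
  const T *begin() const { return Data; }
  const T *end() const { return Data + Length; }
};

static long sumIndices(ArrayRefSketch<long> Idx) {
  long Total = 0;
  for (long V : Idx) Total += V;
  return Total;
}

int main() {
  std::vector<long> IdxVec;
  IdxVec.push_back(1);
  IdxVec.push_back(2);
  // Old style: sumIndices(ArrayRefSketch<long>(&IdxVec[0], IdxVec.size()));
  return sumIndices(IdxVec) == 3 ? 0 : 1; // new style: pass the container
}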
diff --git a/lib/Target/X86/SSEDomainFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 13680c5..01dccdb 100644
--- a/lib/Target/X86/SSEDomainFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -1,4 +1,4 @@
-//===- SSEDomainFix.cpp - Use proper int/float domain for SSE ---*- C++ -*-===//
+//===- ExecutionDepsFix.cpp - Fix execution dependency issues ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,21 +7,25 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains the SSEDomainFix pass.
+// This file contains the execution dependency fix pass.
//
-// Some SSE instructions like mov, and, or, xor are available in different
+// Some X86 SSE instructions like mov, and, or, xor are available in different
// variants for different operand types. These variant instructions are
// equivalent, but on Nehalem and newer cpus there is extra latency
-// transferring data between integer and floating point domains.
+// transferring data between integer and floating point domains. ARM cores
+// have similar issues when they are configured with both VFP and NEON
+// pipelines.
//
// This pass changes the variant instructions to minimize domain crossings.
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "sse-domain-fix"
-#include "X86InstrInfo.h"
+#define DEBUG_TYPE "execution-fix"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
@@ -97,25 +101,27 @@ struct DomainValue {
};
}
-static const unsigned NumRegs = 16;
-
namespace {
-class SSEDomainFixPass : public MachineFunctionPass {
+class ExeDepsFix : public MachineFunctionPass {
static char ID;
SpecificBumpPtrAllocator<DomainValue> Allocator;
SmallVector<DomainValue*,16> Avail;
+ const TargetRegisterClass *const RC;
MachineFunction *MF;
- const X86InstrInfo *TII;
+ const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineBasicBlock *MBB;
+ std::vector<int> AliasMap;
+ const unsigned NumRegs;
DomainValue **LiveRegs;
typedef DenseMap<MachineBasicBlock*,DomainValue**> LiveOutMap;
LiveOutMap LiveOuts;
unsigned Distance;
public:
- SSEDomainFixPass() : MachineFunctionPass(ID) {}
+ ExeDepsFix(const TargetRegisterClass *rc)
+ : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
@@ -150,17 +156,16 @@ private:
};
}
-char SSEDomainFixPass::ID = 0;
+char ExeDepsFix::ID = 0;
/// Translate TRI register number to an index into our smaller tables of
/// interesting registers. Return -1 for boring registers.
-int SSEDomainFixPass::RegIndex(unsigned reg) {
- assert(X86::XMM15 == X86::XMM0+NumRegs-1 && "Unexpected sort");
- reg -= X86::XMM0;
- return reg < NumRegs ? (int) reg : -1;
+int ExeDepsFix::RegIndex(unsigned Reg) {
+ assert(Reg < AliasMap.size() && "Invalid register");
+ return AliasMap[Reg];
}
-DomainValue *SSEDomainFixPass::Alloc(int domain) {
+DomainValue *ExeDepsFix::Alloc(int domain) {
DomainValue *dv = Avail.empty() ?
new(Allocator.Allocate()) DomainValue :
Avail.pop_back_val();
@@ -170,14 +175,14 @@ DomainValue *SSEDomainFixPass::Alloc(int domain) {
return dv;
}
-void SSEDomainFixPass::Recycle(DomainValue *dv) {
+void ExeDepsFix::Recycle(DomainValue *dv) {
assert(dv && "Cannot recycle NULL");
dv->clear();
Avail.push_back(dv);
}
/// Set LiveRegs[rx] = dv, updating reference counts.
-void SSEDomainFixPass::SetLiveReg(int rx, DomainValue *dv) {
+void ExeDepsFix::SetLiveReg(int rx, DomainValue *dv) {
assert(unsigned(rx) < NumRegs && "Invalid index");
if (!LiveRegs) {
LiveRegs = new DomainValue*[NumRegs];
@@ -195,7 +200,7 @@ void SSEDomainFixPass::SetLiveReg(int rx, DomainValue *dv) {
}
// Kill register rx, recycle or collapse any DomainValue.
-void SSEDomainFixPass::Kill(int rx) {
+void ExeDepsFix::Kill(int rx) {
assert(unsigned(rx) < NumRegs && "Invalid index");
if (!LiveRegs || !LiveRegs[rx]) return;
@@ -208,7 +213,7 @@ void SSEDomainFixPass::Kill(int rx) {
}
/// Force register rx into domain.
-void SSEDomainFixPass::Force(int rx, unsigned domain) {
+void ExeDepsFix::Force(int rx, unsigned domain) {
assert(unsigned(rx) < NumRegs && "Invalid index");
DomainValue *dv;
if (LiveRegs && (dv = LiveRegs[rx])) {
@@ -217,8 +222,8 @@ void SSEDomainFixPass::Force(int rx, unsigned domain) {
else if (dv->hasDomain(domain))
Collapse(dv, domain);
else {
- // This is an incompatible open DomainValue. Collapse it to whatever and force
- // the new value into domain. This costs a domain crossing.
+ // This is an incompatible open DomainValue. Collapse it to whatever and
+ // force the new value into domain. This costs a domain crossing.
Collapse(dv, dv->getFirstDomain());
assert(LiveRegs[rx] && "Not live after collapse?");
LiveRegs[rx]->addDomain(domain);
@@ -231,12 +236,12 @@ void SSEDomainFixPass::Force(int rx, unsigned domain) {
/// Collapse open DomainValue into given domain. If there are multiple
/// registers using dv, they each get a unique collapsed DomainValue.
-void SSEDomainFixPass::Collapse(DomainValue *dv, unsigned domain) {
+void ExeDepsFix::Collapse(DomainValue *dv, unsigned domain) {
assert(dv->hasDomain(domain) && "Cannot collapse");
// Collapse all the instructions.
while (!dv->Instrs.empty())
- TII->SetSSEDomain(dv->Instrs.pop_back_val(), domain);
+ TII->setExecutionDomain(dv->Instrs.pop_back_val(), domain);
dv->setSingleDomain(domain);
// If there are multiple users, give them new, unique DomainValues.
@@ -248,7 +253,7 @@ void SSEDomainFixPass::Collapse(DomainValue *dv, unsigned domain) {
/// Merge - All instructions and registers in B are moved to A, and B is
/// released.
-bool SSEDomainFixPass::Merge(DomainValue *A, DomainValue *B) {
+bool ExeDepsFix::Merge(DomainValue *A, DomainValue *B) {
assert(!A->isCollapsed() && "Cannot merge into collapsed");
assert(!B->isCollapsed() && "Cannot merge from collapsed");
if (A == B)
@@ -266,7 +271,7 @@ bool SSEDomainFixPass::Merge(DomainValue *A, DomainValue *B) {
return true;
}
-void SSEDomainFixPass::enterBasicBlock() {
+void ExeDepsFix::enterBasicBlock() {
// Try to coalesce live-out registers from predecessors.
for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(),
e = MBB->livein_end(); i != e; ++i) {
@@ -303,7 +308,7 @@ void SSEDomainFixPass::enterBasicBlock() {
// A hard instruction only works in one domain. All input registers will be
// forced into that domain.
-void SSEDomainFixPass::visitHardInstr(MachineInstr *mi, unsigned domain) {
+void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
// Collapse all uses.
for (unsigned i = mi->getDesc().getNumDefs(),
e = mi->getDesc().getNumOperands(); i != e; ++i) {
@@ -326,7 +331,7 @@ void SSEDomainFixPass::visitHardInstr(MachineInstr *mi, unsigned domain) {
}
// A soft instruction can be changed to work in other domains given by mask.
-void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
// Bitmask of available domains for this instruction after taking collapsed
// operands into account.
unsigned available = mask;
@@ -362,7 +367,7 @@ void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) {
// If the collapsed operands force a single domain, propagate the collapse.
if (isPowerOf2_32(available)) {
unsigned domain = CountTrailingZeros_32(available);
- TII->SetSSEDomain(mi, domain);
+ TII->setExecutionDomain(mi, domain);
visitHardInstr(mi, domain);
return;
}
@@ -431,8 +436,8 @@ void SSEDomainFixPass::visitSoftInstr(MachineInstr *mi, unsigned mask) {
}
}
-void SSEDomainFixPass::visitGenericInstr(MachineInstr *mi) {
- // Process explicit defs, kill any XMM registers redefined.
+void ExeDepsFix::visitGenericInstr(MachineInstr *mi) {
+ // Process explicit defs, kill any relevant registers redefined.
for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
MachineOperand &mo = mi->getOperand(i);
if (!mo.isReg()) continue;
@@ -442,25 +447,36 @@ void SSEDomainFixPass::visitGenericInstr(MachineInstr *mi) {
}
}
-bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) {
+bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
- TII = static_cast<const X86InstrInfo*>(MF->getTarget().getInstrInfo());
+ TII = MF->getTarget().getInstrInfo();
TRI = MF->getTarget().getRegisterInfo();
MBB = 0;
LiveRegs = 0;
Distance = 0;
- assert(NumRegs == X86::VR128RegClass.getNumRegs() && "Bad regclass");
+ assert(NumRegs == RC->getNumRegs() && "Bad regclass");
- // If no XMM registers are used in the function, we can skip it completely.
+ // If no relevant registers are used in the function, we can skip it
+ // completely.
bool anyregs = false;
- for (TargetRegisterClass::const_iterator I = X86::VR128RegClass.begin(),
- E = X86::VR128RegClass.end(); I != E; ++I)
+ for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
+ I != E; ++I)
if (MF->getRegInfo().isPhysRegUsed(*I)) {
anyregs = true;
break;
}
if (!anyregs) return false;
+ // Initialize the AliasMap on the first use.
+ if (AliasMap.empty()) {
+ // Given a PhysReg, AliasMap[PhysReg] is either the relevant index into RC,
+ // or -1.
+ AliasMap.resize(TRI->getNumRegs(), -1);
+ for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
+ for (const unsigned *AI = TRI->getOverlaps(RC->getRegister(i)); *AI; ++AI)
+ AliasMap[*AI] = i;
+ }
+
MachineBasicBlock *Entry = MF->begin();
SmallPtrSet<MachineBasicBlock*, 16> Visited;
for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 16> >
@@ -473,7 +489,7 @@ bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) {
MachineInstr *mi = I;
if (mi->isDebugValue()) continue;
++Distance;
- std::pair<uint16_t, uint16_t> domp = TII->GetSSEDomain(mi);
+ std::pair<uint16_t, uint16_t> domp = TII->getExecutionDomain(mi);
if (domp.first)
if (domp.second)
visitSoftInstr(mi, domp.second);
@@ -501,6 +517,7 @@ bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) {
return false;
}
-FunctionPass *llvm::createSSEDomainFixPass() {
- return new SSEDomainFixPass();
+FunctionPass *
+llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) {
+ return new ExeDepsFix(RC);
}
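The key generalization in ExeDepsFix is the AliasMap: instead of assuming a contiguous XMM0..XMM15 numbering, the pass builds a dense table mapping every physical register number to its index in the tracked register class, or -1. A standalone sketch of that construction, with a made-up register numbering:

#include <cassert>
#include <vector>

int main() {
  const int NumPhysRegs = 8;
  // Tracked class: regs 4 and 6; reg 5 overlaps reg 4 (say, a sub-register).
  const std::vector<int> RC = {4, 6};
  const std::vector<std::vector<int> > Overlaps = {
      {0}, {1}, {2}, {3}, {4, 5}, {5, 4}, {6}, {7}};

  std::vector<int> AliasMap(NumPhysRegs, -1);
  for (int i = 0, e = (int)RC.size(); i != e; ++i)
    for (int A : Overlaps[RC[i]])
      AliasMap[A] = i; // any alias of RC[i] maps to index i

  assert(AliasMap[4] == 0 && AliasMap[5] == 0); // reg 5 aliases reg 4
  assert(AliasMap[6] == 1 && AliasMap[0] == -1);
  return 0;
}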
diff --git a/lib/CodeGen/LowerSubregs.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 7871ba9..e2a14a8 100644
--- a/lib/CodeGen/LowerSubregs.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -1,4 +1,4 @@
-//===-- LowerSubregs.cpp - Subregister Lowering instruction pass ----------===//
+//===-- ExpandPostRAPseudos.cpp - Pseudo instruction expansion pass -------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,14 +7,12 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines a MachineFunction pass which runs after register
-// allocation that turns subreg insert/extract instructions into register
-// copies, as needed. This ensures correct codegen even if the coalescer
-// isn't able to remove all subreg instructions.
+// This file defines a pass that expands COPY and SUBREG_TO_REG pseudo
+// instructions after register allocation.
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "lowersubregs"
+#define DEBUG_TYPE "postrapseudos"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -29,52 +27,51 @@
using namespace llvm;
namespace {
- struct LowerSubregsInstructionPass : public MachineFunctionPass {
- private:
- const TargetRegisterInfo *TRI;
- const TargetInstrInfo *TII;
+struct ExpandPostRA : public MachineFunctionPass {
+private:
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
- public:
- static char ID; // Pass identification, replacement for typeid
- LowerSubregsInstructionPass() : MachineFunctionPass(ID) {}
+public:
+ static char ID; // Pass identification, replacement for typeid
+ ExpandPostRA() : MachineFunctionPass(ID) {}
- const char *getPassName() const {
- return "Subregister lowering instruction pass";
- }
+ const char *getPassName() const {
+ return "Post-RA pseudo instruction expansion pass";
+ }
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- MachineFunctionPass::getAnalysisUsage(AU);
- }
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
- /// runOnMachineFunction - pass entry point
- bool runOnMachineFunction(MachineFunction&);
+ /// runOnMachineFunction - pass entry point
+ bool runOnMachineFunction(MachineFunction&);
- private:
- bool LowerSubregToReg(MachineInstr *MI);
- bool LowerCopy(MachineInstr *MI);
+private:
+ bool LowerSubregToReg(MachineInstr *MI);
+ bool LowerCopy(MachineInstr *MI);
- void TransferDeadFlag(MachineInstr *MI, unsigned DstReg,
- const TargetRegisterInfo *TRI);
- void TransferImplicitDefs(MachineInstr *MI);
- };
+ void TransferDeadFlag(MachineInstr *MI, unsigned DstReg,
+ const TargetRegisterInfo *TRI);
+ void TransferImplicitDefs(MachineInstr *MI);
+};
+} // end anonymous namespace
- char LowerSubregsInstructionPass::ID = 0;
-}
+char ExpandPostRA::ID = 0;
-FunctionPass *llvm::createLowerSubregsPass() {
- return new LowerSubregsInstructionPass();
+FunctionPass *llvm::createExpandPostRAPseudosPass() {
+ return new ExpandPostRA();
}
/// TransferDeadFlag - MI is a pseudo-instruction with DstReg dead,
/// and the lowered replacement instructions immediately precede it.
/// Mark the replacement instructions with the dead flag.
void
-LowerSubregsInstructionPass::TransferDeadFlag(MachineInstr *MI,
- unsigned DstReg,
- const TargetRegisterInfo *TRI) {
+ExpandPostRA::TransferDeadFlag(MachineInstr *MI, unsigned DstReg,
+ const TargetRegisterInfo *TRI) {
for (MachineBasicBlock::iterator MII =
prior(MachineBasicBlock::iterator(MI)); ; --MII) {
if (MII->addRegisterDead(DstReg, TRI))
@@ -88,7 +85,7 @@ LowerSubregsInstructionPass::TransferDeadFlag(MachineInstr *MI,
/// replacement instructions immediately precede it. Copy any implicit-def
/// operands from MI to the replacement instruction.
void
-LowerSubregsInstructionPass::TransferImplicitDefs(MachineInstr *MI) {
+ExpandPostRA::TransferImplicitDefs(MachineInstr *MI) {
MachineBasicBlock::iterator CopyMI = MI;
--CopyMI;
@@ -100,7 +97,7 @@ LowerSubregsInstructionPass::TransferImplicitDefs(MachineInstr *MI) {
}
}
-bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) {
+bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
MachineBasicBlock *MBB = MI->getParent();
assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) &&
MI->getOperand(1).isImm() &&
@@ -152,7 +149,7 @@ bool LowerSubregsInstructionPass::LowerSubregToReg(MachineInstr *MI) {
return true;
}
-bool LowerSubregsInstructionPass::LowerCopy(MachineInstr *MI) {
+bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
MachineOperand &DstMO = MI->getOperand(0);
MachineOperand &SrcMO = MI->getOperand(1);
@@ -191,9 +188,9 @@ bool LowerSubregsInstructionPass::LowerCopy(MachineInstr *MI) {
/// runOnMachineFunction - Reduce subregister inserts and extracts to register
/// copies.
///
-bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) {
+bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Machine Function\n"
- << "********** LOWERING SUBREG INSTRS **********\n"
+ << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
<< "********** Function: "
<< MF.getFunction()->getName() << '\n');
TRI = MF.getTarget().getRegisterInfo();
@@ -205,17 +202,34 @@ bool LowerSubregsInstructionPass::runOnMachineFunction(MachineFunction &MF) {
mbbi != mbbe; ++mbbi) {
for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
mi != me;) {
- MachineBasicBlock::iterator nmi = llvm::next(mi);
MachineInstr *MI = mi;
- assert(!MI->isInsertSubreg() && "INSERT_SUBREG should no longer appear");
- assert(MI->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
- "EXTRACT_SUBREG should no longer appear");
- if (MI->isSubregToReg()) {
+ // Advance iterator here because MI may be erased.
+ ++mi;
+
+ // Only expand pseudos.
+ if (!MI->getDesc().isPseudo())
+ continue;
+
+ // Give targets a chance to expand even standard pseudos.
+ if (TII->expandPostRAPseudo(MI)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Expand standard pseudos.
+ switch (MI->getOpcode()) {
+ case TargetOpcode::SUBREG_TO_REG:
MadeChange |= LowerSubregToReg(MI);
- } else if (MI->isCopy()) {
+ break;
+ case TargetOpcode::COPY:
MadeChange |= LowerCopy(MI);
+ break;
+ case TargetOpcode::DBG_VALUE:
+ continue;
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::EXTRACT_SUBREG:
+ llvm_unreachable("Sub-register pseudos should have been eliminated.");
}
- mi = nmi;
}
}
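The rewritten loop advances the iterator before the instruction can be erased, the standard idiom for mutating a list while walking it. The same pattern with a standard container:

#include <cassert>
#include <list>

int main() {
  std::list<int> L = {1, 2, 3, 4};
  for (std::list<int>::iterator I = L.begin(), E = L.end(); I != E;) {
    std::list<int>::iterator Cur = I++; // advance first; erasing Cur is safe
    if (*Cur % 2 == 0)
      L.erase(Cur); // invalidates only Cur, never I
  }
  assert(L.size() == 2);
  return 0;
}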
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 6cb2277..ce7ed29 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -16,14 +16,13 @@
#include "llvm/Function.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -154,7 +153,8 @@ namespace {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
const InstrItineraryData *InstrItins;
- const MachineLoopInfo *MLI;
+ const MachineBranchProbabilityInfo *MBPI;
+
bool MadeChange;
int FnNum;
public:
@@ -162,9 +162,9 @@ namespace {
IfConverter() : MachineFunctionPass(ID), FnNum(-1) {
initializeIfConverterPass(*PassRegistry::getPassRegistry());
}
-
+
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -252,7 +252,7 @@ namespace {
}
INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
FunctionPass *llvm::createIfConverterPass() { return new IfConverter(); }
@@ -261,7 +261,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
TLI = MF.getTarget().getTargetLowering();
TII = MF.getTarget().getInstrInfo();
TRI = MF.getTarget().getRegisterInfo();
- MLI = &getAnalysis<MachineLoopInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
InstrItins = MF.getTarget().getInstrItineraryData();
if (!TII) return false;
@@ -790,28 +790,9 @@ IfConverter::BBInfo &IfConverter::AnalyzeBlock(MachineBasicBlock *BB,
bool TNeedSub = TrueBBI.Predicate.size() > 0;
bool FNeedSub = FalseBBI.Predicate.size() > 0;
bool Enqueued = false;
-
- // Try to predict the branch, using loop info to guide us.
- // General heuristics are:
- // - backedge -> 90% taken
- // - early exit -> 20% taken
- // - branch predictor confidence -> 90%
- BranchProbability Prediction(5, 10);
- MachineLoop *Loop = MLI->getLoopFor(BB);
- if (Loop) {
- if (TrueBBI.BB == Loop->getHeader())
- Prediction = BranchProbability(9, 10);
- else if (FalseBBI.BB == Loop->getHeader())
- Prediction = BranchProbability(1, 10);
-
- MachineLoop *TrueLoop = MLI->getLoopFor(TrueBBI.BB);
- MachineLoop *FalseLoop = MLI->getLoopFor(FalseBBI.BB);
- if (!TrueLoop || TrueLoop->getParentLoop() == Loop)
- Prediction = BranchProbability(2, 10);
- else if (!FalseLoop || FalseLoop->getParentLoop() == Loop)
- Prediction = BranchProbability(8, 10);
- }
-
+
+ BranchProbability Prediction = MBPI->getEdgeProbability(BB, TrueBBI.BB);
+
if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) &&
MeetIfcvtSizeLimit(*TrueBBI.BB, (TrueBBI.NonPredSize - (Dups + Dups2) +
TrueBBI.ExtraCost), TrueBBI.ExtraCost2,
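The deleted block hard-coded fractions such as 9/10 for backedges; the pass now asks MachineBranchProbabilityInfo instead. BranchProbability itself is essentially an exact numerator/denominator pair; a minimal sketch of the cross-multiplication comparison such a type enables without floating point:

#include <cassert>
#include <cstdint>

struct Prob {
  uint32_t N, D; // probability = N / D
};

// Compare a/b vs. c/d exactly: a*d vs. c*b, widened to avoid overflow.
static bool lessThan(Prob A, Prob B) {
  return (uint64_t)A.N * B.D < (uint64_t)B.N * A.D;
}

int main() {
  Prob Backedge = {9, 10}, Coin = {1, 2};
  assert(lessThan(Coin, Backedge)); // 1/2 < 9/10, with no rounding error
  return 0;
}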
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 5547f73..726af46 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -17,6 +17,7 @@
#include "LiveRangeEdit.h"
#include "VirtRegMap.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
@@ -27,22 +28,26 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
STATISTIC(NumSpilledRanges, "Number of spilled live ranges");
-STATISTIC(NumSnippets, "Number of snippets included in spills");
+STATISTIC(NumSnippets, "Number of spilled snippets");
STATISTIC(NumSpills, "Number of spills inserted");
+STATISTIC(NumSpillsRemoved, "Number of spills removed");
STATISTIC(NumReloads, "Number of reloads inserted");
+STATISTIC(NumReloadsRemoved, "Number of reloads removed");
STATISTIC(NumFolded, "Number of folded stack accesses");
STATISTIC(NumFoldedLoads, "Number of folded loads");
STATISTIC(NumRemats, "Number of rematerialized defs for spilling");
-STATISTIC(NumOmitReloadSpill, "Number of omitted spills after reloads");
-STATISTIC(NumHoistLocal, "Number of locally hoisted spills");
-STATISTIC(NumHoistGlobal, "Number of globally hoisted spills");
-STATISTIC(NumRedundantSpills, "Number of redundant spills identified");
+STATISTIC(NumOmitReloadSpill, "Number of omitted spills of reloads");
+STATISTIC(NumHoists, "Number of hoisted spills");
+
+static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
+ cl::desc("Disable inline spill hoisting"));
namespace {
class InlineSpiller : public Spiller {
@@ -75,26 +80,49 @@ class InlineSpiller : public Spiller {
// Values that failed to remat at some point.
SmallPtrSet<VNInfo*, 8> UsedValues;
+public:
// Information about a value that was defined by a copy from a sibling
// register.
struct SibValueInfo {
// True when all reaching defs were reloads: No spill is necessary.
bool AllDefsAreReloads;
+ // True when value is defined by an original PHI not from splitting.
+ bool DefByOrigPHI;
+
+ // True when the COPY defining this value killed its source.
+ bool KillsSource;
+
// The preferred register to spill.
unsigned SpillReg;
// The value of SpillReg that should be spilled.
VNInfo *SpillVNI;
+ // The block where SpillVNI should be spilled. Currently, this must be the
+ // block containing SpillVNI->def.
+ MachineBasicBlock *SpillMBB;
+
// A defining instruction that is not a sibling copy or a reload, or NULL.
// This can be used as a template for rematerialization.
MachineInstr *DefMI;
+ // List of values that depend on this one. These values are actually the
+ // same, but live range splitting has placed them in different registers,
+ // or the SSA updater needed to insert PHI-defs to preserve SSA form. These
+ // are copies of the current value and phi-kills. Usually only phi-kills
+ // cause more than one dependent value.
+ TinyPtrVector<VNInfo*> Deps;
+
SibValueInfo(unsigned Reg, VNInfo *VNI)
- : AllDefsAreReloads(false), SpillReg(Reg), SpillVNI(VNI), DefMI(0) {}
+ : AllDefsAreReloads(true), DefByOrigPHI(false), KillsSource(false),
+ SpillReg(Reg), SpillVNI(VNI), SpillMBB(0), DefMI(0) {}
+
+ // Returns true when a def has been found.
+ bool hasDef() const { return DefByOrigPHI || DefMI; }
};
+private:
// Values in RegsToSpill defined by sibling copies.
typedef DenseMap<VNInfo*, SibValueInfo> SibValueMap;
SibValueMap SibValues;
@@ -134,6 +162,7 @@ private:
bool isSibling(unsigned Reg);
MachineInstr *traceSiblingValue(unsigned, VNInfo*, VNInfo*);
+ void propagateSiblingValue(SibValueMap::iterator, VNInfo *VNI = 0);
void analyzeSiblingValues();
bool hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI);
@@ -282,6 +311,156 @@ bool InlineSpiller::isSibling(unsigned Reg) {
VRM.getOriginal(Reg) == Original;
}
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS,
+ const InlineSpiller::SibValueInfo &SVI) {
+ OS << "spill " << PrintReg(SVI.SpillReg) << ':'
+ << SVI.SpillVNI->id << '@' << SVI.SpillVNI->def;
+ if (SVI.SpillMBB)
+ OS << " in BB#" << SVI.SpillMBB->getNumber();
+ if (SVI.AllDefsAreReloads)
+ OS << " all-reloads";
+ if (SVI.DefByOrigPHI)
+ OS << " orig-phi";
+ if (SVI.KillsSource)
+ OS << " kill";
+ OS << " deps[";
+ for (unsigned i = 0, e = SVI.Deps.size(); i != e; ++i)
+ OS << ' ' << SVI.Deps[i]->id << '@' << SVI.Deps[i]->def;
+ OS << " ]";
+ if (SVI.DefMI)
+ OS << " def: " << *SVI.DefMI;
+ else
+ OS << '\n';
+ return OS;
+}
+#endif
+
+/// propagateSiblingValue - Propagate the value in SVI to dependents if it is
+/// known. Otherwise remember the dependency for later.
+///
+/// @param SVI SibValues entry to propagate.
+/// @param VNI Dependent value, or NULL to propagate to all saved dependents.
+void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVI,
+ VNInfo *VNI) {
+ // When VNI is non-NULL, add it to SVI's deps, and only propagate to that.
+ TinyPtrVector<VNInfo*> FirstDeps;
+ if (VNI) {
+ FirstDeps.push_back(VNI);
+ SVI->second.Deps.push_back(VNI);
+ }
+
+ // Has the value been completely determined yet? If not, defer propagation.
+ if (!SVI->second.hasDef())
+ return;
+
+ // Work list of values to propagate. It would be nice to use a SetVector
+ // here, but then we would be forced to use a SmallSet.
+ SmallVector<SibValueMap::iterator, 8> WorkList(1, SVI);
+ SmallPtrSet<VNInfo*, 8> WorkSet;
+
+ do {
+ SVI = WorkList.pop_back_val();
+ WorkSet.erase(SVI->first);
+ TinyPtrVector<VNInfo*> *Deps = VNI ? &FirstDeps : &SVI->second.Deps;
+ VNI = 0;
+
+ SibValueInfo &SV = SVI->second;
+ if (!SV.SpillMBB)
+ SV.SpillMBB = LIS.getMBBFromIndex(SV.SpillVNI->def);
+
+ DEBUG(dbgs() << " prop to " << Deps->size() << ": "
+ << SVI->first->id << '@' << SVI->first->def << ":\t" << SV);
+
+ assert(SV.hasDef() && "Propagating undefined value");
+
+ // Should this value be propagated as a preferred spill candidate? We don't
+ // propagate values of registers that are about to spill.
+ bool PropSpill = !DisableHoisting && !isRegToSpill(SV.SpillReg);
+ unsigned SpillDepth = ~0u;
+
+ for (TinyPtrVector<VNInfo*>::iterator DepI = Deps->begin(),
+ DepE = Deps->end(); DepI != DepE; ++DepI) {
+ SibValueMap::iterator DepSVI = SibValues.find(*DepI);
+ assert(DepSVI != SibValues.end() && "Dependent value not in SibValues");
+ SibValueInfo &DepSV = DepSVI->second;
+ if (!DepSV.SpillMBB)
+ DepSV.SpillMBB = LIS.getMBBFromIndex(DepSV.SpillVNI->def);
+
+ bool Changed = false;
+
+ // Propagate defining instruction.
+ if (!DepSV.hasDef()) {
+ Changed = true;
+ DepSV.DefMI = SV.DefMI;
+ DepSV.DefByOrigPHI = SV.DefByOrigPHI;
+ }
+
+ // Propagate AllDefsAreReloads. For PHI values, this computes an AND of
+ // all predecessors.
+ if (!SV.AllDefsAreReloads && DepSV.AllDefsAreReloads) {
+ Changed = true;
+ DepSV.AllDefsAreReloads = false;
+ }
+
+ // Propagate best spill value.
+ if (PropSpill && SV.SpillVNI != DepSV.SpillVNI) {
+ if (SV.SpillMBB == DepSV.SpillMBB) {
+ // DepSV is in the same block. Hoist when dominated.
+ if (DepSV.KillsSource && SV.SpillVNI->def < DepSV.SpillVNI->def) {
+ // This is an alternative def earlier in the same MBB.
+ // Hoist the spill as far as possible in SpillMBB. This can ease
+ // register pressure:
+ //
+ // x = def
+ // y = use x
+ // s = copy x
+ //
+ // Hoisting the spill of s to immediately after the def removes the
+ // interference between x and y:
+ //
+ // x = def
+ // spill x
+ // y = use x<kill>
+ //
+ // This hoist only helps when the DepSV copy kills its source.
+ Changed = true;
+ DepSV.SpillReg = SV.SpillReg;
+ DepSV.SpillVNI = SV.SpillVNI;
+ DepSV.SpillMBB = SV.SpillMBB;
+ }
+ } else {
+ // DepSV is in a different block.
+ if (SpillDepth == ~0u)
+ SpillDepth = Loops.getLoopDepth(SV.SpillMBB);
+
+ // Also hoist spills to blocks with smaller loop depth, but make sure
+ // that the new value dominates. Non-phi dependents are always
+ // dominated, phis need checking.
+ if ((Loops.getLoopDepth(DepSV.SpillMBB) > SpillDepth) &&
+ (!DepSVI->first->isPHIDef() ||
+ MDT.dominates(SV.SpillMBB, DepSV.SpillMBB))) {
+ Changed = true;
+ DepSV.SpillReg = SV.SpillReg;
+ DepSV.SpillVNI = SV.SpillVNI;
+ DepSV.SpillMBB = SV.SpillMBB;
+ }
+ }
+ }
+
+ if (!Changed)
+ continue;
+
+ // Something changed in DepSVI. Propagate to dependents.
+ if (WorkSet.insert(DepSVI->first))
+ WorkList.push_back(DepSVI);
+
+ DEBUG(dbgs() << " update " << DepSVI->first->id << '@'
+ << DepSVI->first->def << " to:\t" << DepSV);
+ }
+ } while (!WorkList.empty());
+}
+
/// traceSiblingValue - Trace a value that is about to be spilled back to the
/// real defining instructions by looking through sibling copies. Always stay
/// within the range of OrigVNI so the registers are known to carry the same
@@ -294,84 +473,101 @@ bool InlineSpiller::isSibling(unsigned Reg) {
///
MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
VNInfo *OrigVNI) {
+ // Check if a cached value already exists.
+ SibValueMap::iterator SVI;
+ bool Inserted;
+ tie(SVI, Inserted) =
+ SibValues.insert(std::make_pair(UseVNI, SibValueInfo(UseReg, UseVNI)));
+ if (!Inserted) {
+ DEBUG(dbgs() << "Cached value " << PrintReg(UseReg) << ':'
+ << UseVNI->id << '@' << UseVNI->def << ' ' << SVI->second);
+ return SVI->second.DefMI;
+ }
+
DEBUG(dbgs() << "Tracing value " << PrintReg(UseReg) << ':'
<< UseVNI->id << '@' << UseVNI->def << '\n');
- SmallPtrSet<VNInfo*, 8> Visited;
+
+ // List of (Reg, VNI) that have been inserted into SibValues, but need to be
+ // processed.
SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList;
WorkList.push_back(std::make_pair(UseReg, UseVNI));
- // Best spill candidate seen so far. This must dominate UseVNI.
- SibValueInfo SVI(UseReg, UseVNI);
- MachineBasicBlock *UseMBB = LIS.getMBBFromIndex(UseVNI->def);
- MachineBasicBlock *SpillMBB = UseMBB;
- unsigned SpillDepth = Loops.getLoopDepth(SpillMBB);
- bool SeenOrigPHI = false; // Original PHI met.
-
do {
unsigned Reg;
VNInfo *VNI;
tie(Reg, VNI) = WorkList.pop_back_val();
- if (!Visited.insert(VNI))
- continue;
+ DEBUG(dbgs() << " " << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def
+ << ":\t");
- // Is this value a better spill candidate?
- if (!isRegToSpill(Reg)) {
- MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
- if (MBB == SpillMBB) {
- // This is an alternative def earlier in the same MBB.
- // Hoist the spill as far as possible in SpillMBB. This can ease
- // register pressure:
- //
- // x = def
- // y = use x
- // s = copy x
- //
- // Hoisting the spill of s to immediately after the def removes the
- // interference between x and y:
- //
- // x = def
- // spill x
- // y = use x<kill>
- //
- if (VNI->def < SVI.SpillVNI->def) {
- DEBUG(dbgs() << " hoist in BB#" << MBB->getNumber() << ": "
- << PrintReg(Reg) << ':' << VNI->id << '@' << VNI->def
- << '\n');
- SVI.SpillReg = Reg;
- SVI.SpillVNI = VNI;
- }
- } else if (MBB != UseMBB && MDT.dominates(MBB, UseMBB)) {
- // This is a valid spill location dominating UseVNI.
- // Prefer to spill at a smaller loop depth.
- unsigned Depth = Loops.getLoopDepth(MBB);
- if (Depth < SpillDepth) {
- DEBUG(dbgs() << " spill depth " << Depth << ": " << PrintReg(Reg)
- << ':' << VNI->id << '@' << VNI->def << '\n');
- SVI.SpillReg = Reg;
- SVI.SpillVNI = VNI;
- SpillMBB = MBB;
- SpillDepth = Depth;
- }
- }
- }
+ // First check if this value has already been computed.
+ SVI = SibValues.find(VNI);
+ assert(SVI != SibValues.end() && "Missing SibValues entry");
// Trace through PHI-defs created by live range splitting.
if (VNI->isPHIDef()) {
+ // Stop at original PHIs. We don't know the value at the predecessors.
if (VNI->def == OrigVNI->def) {
- DEBUG(dbgs() << " orig phi value " << PrintReg(Reg) << ':'
- << VNI->id << '@' << VNI->def << '\n');
- SeenOrigPHI = true;
+ DEBUG(dbgs() << "orig phi value\n");
+ SVI->second.DefByOrigPHI = true;
+ SVI->second.AllDefsAreReloads = false;
+ propagateSiblingValue(SVI);
continue;
}
- // Get values live-out of predecessors.
+
+ // This is a PHI inserted by live range splitting. We could trace the
+ // live-out value from predecessor blocks, but that search can be very
+ // expensive if there are many predecessors and many more PHIs as
+ // generated by tail-dup when it sees an indirectbr. Instead, look at
+ // all the non-PHI defs that have the same value as OrigVNI. They must
+ // jointly dominate VNI->def. This is not optimal since VNI may actually
+ // be jointly dominated by a smaller subset of defs, so there is a chance
+ // we will miss an AllDefsAreReloads optimization.
+
+ // Separate all values dominated by OrigVNI into PHIs and non-PHIs.
+ SmallVector<VNInfo*, 8> PHIs, NonPHIs;
LiveInterval &LI = LIS.getInterval(Reg);
- MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- VNInfo *PVNI = LI.getVNInfoAt(LIS.getMBBEndIdx(*PI).getPrevSlot());
- if (PVNI)
- WorkList.push_back(std::make_pair(Reg, PVNI));
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+
+ for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end();
+ VI != VE; ++VI) {
+ VNInfo *VNI2 = *VI;
+ if (VNI2->isUnused())
+ continue;
+ if (!OrigLI.containsOneValue() &&
+ OrigLI.getVNInfoAt(VNI2->def) != OrigVNI)
+ continue;
+ if (VNI2->isPHIDef() && VNI2->def != OrigVNI->def)
+ PHIs.push_back(VNI2);
+ else
+ NonPHIs.push_back(VNI2);
+ }
+ DEBUG(dbgs() << "split phi value, checking " << PHIs.size()
+ << " phi-defs, and " << NonPHIs.size()
+ << " non-phi/orig defs\n");
+
+ // Create entries for all the PHIs. Don't add them to the worklist, we
+ // are processing all of them in one go here.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ SibValues.insert(std::make_pair(PHIs[i], SibValueInfo(Reg, PHIs[i])));
+
+ // Add every PHI as a dependent of all the non-PHIs.
+ for (unsigned i = 0, e = NonPHIs.size(); i != e; ++i) {
+ VNInfo *NonPHI = NonPHIs[i];
+ // Known value? Try an insertion.
+ tie(SVI, Inserted) =
+ SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI)));
+ // Add all the PHIs as dependents of NonPHI.
+ for (unsigned pi = 0, pe = PHIs.size(); pi != pe; ++pi)
+ SVI->second.Deps.push_back(PHIs[pi]);
+ // This is the first time we see NonPHI, add it to the worklist.
+ if (Inserted)
+ WorkList.push_back(std::make_pair(Reg, NonPHI));
+ else
+ // Propagate to all inserted PHIs, not just VNI.
+ propagateSiblingValue(SVI);
}
+
+ // Next work list item.
continue;
}
@@ -382,48 +578,49 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
if (isSibling(SrcReg)) {
LiveInterval &SrcLI = LIS.getInterval(SrcReg);
- VNInfo *SrcVNI = SrcLI.getVNInfoAt(VNI->def.getUseIndex());
- assert(SrcVNI && "Copy from non-existing value");
- DEBUG(dbgs() << " copy of " << PrintReg(SrcReg) << ':'
- << SrcVNI->id << '@' << SrcVNI->def << '\n');
- WorkList.push_back(std::make_pair(SrcReg, SrcVNI));
+ LiveRange *SrcLR = SrcLI.getLiveRangeContaining(VNI->def.getUseIndex());
+ assert(SrcLR && "Copy from non-existing value");
+ // Check if this COPY kills its source.
+ SVI->second.KillsSource = (SrcLR->end == VNI->def);
+ VNInfo *SrcVNI = SrcLR->valno;
+ DEBUG(dbgs() << "copy of " << PrintReg(SrcReg) << ':'
+ << SrcVNI->id << '@' << SrcVNI->def
+ << " kill=" << unsigned(SVI->second.KillsSource) << '\n');
+ // Known sibling source value? Try an insertion.
+ tie(SVI, Inserted) = SibValues.insert(std::make_pair(SrcVNI,
+ SibValueInfo(SrcReg, SrcVNI)));
+ // This is the first time we see Src, add it to the worklist.
+ if (Inserted)
+ WorkList.push_back(std::make_pair(SrcReg, SrcVNI));
+ propagateSiblingValue(SVI, VNI);
+ // Next work list item.
continue;
}
}
// Track reachable reloads.
+ SVI->second.DefMI = MI;
+ SVI->second.SpillMBB = MI->getParent();
int FI;
if (Reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot) {
- DEBUG(dbgs() << " reload " << PrintReg(Reg) << ':'
- << VNI->id << "@" << VNI->def << '\n');
- SVI.AllDefsAreReloads = true;
+ DEBUG(dbgs() << "reload\n");
+ propagateSiblingValue(SVI);
+ // Next work list item.
continue;
}
- // We have an 'original' def. Don't record trivial cases.
- if (VNI == UseVNI) {
- DEBUG(dbgs() << "Not a sibling copy.\n");
- return MI;
- }
-
// Potential remat candidate.
- DEBUG(dbgs() << " def " << PrintReg(Reg) << ':'
- << VNI->id << '@' << VNI->def << '\t' << *MI);
- SVI.DefMI = MI;
+ DEBUG(dbgs() << "def " << *MI);
+ SVI->second.AllDefsAreReloads = false;
+ propagateSiblingValue(SVI);
} while (!WorkList.empty());
- if (SeenOrigPHI || SVI.DefMI)
- SVI.AllDefsAreReloads = false;
-
- DEBUG({
- if (SVI.AllDefsAreReloads)
- dbgs() << "All defs are reloads.\n";
- else
- dbgs() << "Prefer to spill " << PrintReg(SVI.SpillReg) << ':'
- << SVI.SpillVNI->id << '@' << SVI.SpillVNI->def << '\n';
- });
- SibValues.insert(std::make_pair(UseVNI, SVI));
- return SVI.DefMI;
+ // Look up the value we were looking for. We already did this lookup at the
+ // top of the function, but SibValues may have been invalidated.
+ SVI = SibValues.find(UseVNI);
+ assert(SVI != SibValues.end() && "Didn't compute requested info");
+ DEBUG(dbgs() << " traced to:\t" << SVI->second);
+ return SVI->second.DefMI;
}
/// analyzeSiblingValues - Trace values defined by sibling copies back to
@@ -506,6 +703,7 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
// Already spilled everywhere.
if (SVI.AllDefsAreReloads) {
+ DEBUG(dbgs() << "\tno spill needed: " << SVI);
++NumOmitReloadSpill;
return true;
}
@@ -531,10 +729,8 @@ bool InlineSpiller::hoistSpill(LiveInterval &SpillLI, MachineInstr *CopyMI) {
VRM.addSpillSlotUse(StackSlot, MII);
DEBUG(dbgs() << "\thoisted: " << SVI.SpillVNI->def << '\t' << *MII);
- if (MBB == CopyMI->getParent())
- ++NumHoistLocal;
- else
- ++NumHoistGlobal;
+ ++NumSpills;
+ ++NumHoists;
return true;
}
@@ -589,7 +785,8 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
// eliminateDeadDefs won't normally remove stores, so switch opcode.
MI->setDesc(TII.get(TargetOpcode::KILL));
DeadDefs.push_back(MI);
- ++NumRedundantSpills;
+ ++NumSpillsRemoved;
+ --NumSpills;
}
}
} while (!WorkList.empty());
@@ -637,7 +834,7 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) {
bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
MachineBasicBlock::iterator MI) {
SlotIndex UseIdx = LIS.getInstructionIndex(MI).getUseIndex();
- VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx);
+ VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex());
if (!ParentVNI) {
DEBUG(dbgs() << "\tadding <undef> flags: ");
@@ -787,10 +984,10 @@ void InlineSpiller::reMaterializeAll() {
/// If MI is a load or store of StackSlot, it can be removed.
bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
int FI = 0;
- unsigned InstrReg;
- if (!(InstrReg = TII.isLoadFromStackSlot(MI, FI)) &&
- !(InstrReg = TII.isStoreToStackSlot(MI, FI)))
- return false;
+ unsigned InstrReg = TII.isLoadFromStackSlot(MI, FI);
+ bool IsLoad = InstrReg;
+ if (!IsLoad)
+ InstrReg = TII.isStoreToStackSlot(MI, FI);
// We have a stack access. Is it the right register and slot?
if (InstrReg != Reg || FI != StackSlot)
@@ -799,6 +996,15 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
DEBUG(dbgs() << "Coalescing stack access: " << *MI);
LIS.RemoveMachineInstrFromMaps(MI);
MI->eraseFromParent();
+
+ if (IsLoad) {
+ ++NumReloadsRemoved;
+ --NumReloads;
+ } else {
+ ++NumSpillsRemoved;
+ --NumSpills;
+ }
+
return true;
}
@@ -810,6 +1016,7 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI,
const SmallVectorImpl<unsigned> &Ops,
MachineInstr *LoadMI) {
+ bool WasCopy = MI->isCopy();
// TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
// operands.
SmallVector<unsigned, 8> FoldOps;
@@ -839,7 +1046,12 @@ bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI,
VRM.addSpillSlotUse(StackSlot, FoldMI);
MI->eraseFromParent();
DEBUG(dbgs() << "\tfolded: " << *FoldMI);
- ++NumFolded;
+ if (!WasCopy)
+ ++NumFolded;
+ else if (Ops.front() == 0)
+ ++NumSpills;
+ else
+ ++NumReloads;
return true;
}
@@ -975,8 +1187,16 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI);
// FIXME: Use a second vreg if instruction has no tied ops.
- if (Writes && hasLiveDef)
+ if (Writes) {
+ if (hasLiveDef)
insertSpill(NewLI, OldLI, Idx, MI);
+ else {
+ // This instruction defines a dead value. We don't need to spill it,
+ // but do create a live range for the dead value.
+ VNInfo *VNI = NewLI.getNextValue(Idx, 0, LIS.getVNInfoAllocator());
+ NewLI.addRange(LiveRange(Idx, Idx.getNextSlot(), VNI));
+ }
+ }
DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
}
diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp
index a09bb39..29b47bd 100644
--- a/lib/CodeGen/InterferenceCache.cpp
+++ b/lib/CodeGen/InterferenceCache.cpp
@@ -18,10 +18,13 @@
using namespace llvm;
+// Static member used for null interference cursors.
+InterferenceCache::BlockInterference InterferenceCache::Cursor::NoInterference;
+
void InterferenceCache::init(MachineFunction *mf,
LiveIntervalUnion *liuarray,
SlotIndexes *indexes,
- const TargetRegisterInfo *tri) {
+ const TargetRegisterInfo *tri) {
MF = mf;
LIUArray = liuarray;
TRI = tri;
diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h
index 7f0a27a..4df0a9e 100644
--- a/lib/CodeGen/InterferenceCache.h
+++ b/lib/CodeGen/InterferenceCache.h
@@ -138,6 +138,7 @@ public:
class Cursor {
Entry *CacheEntry;
BlockInterference *Current;
+ static BlockInterference NoInterference;
void setEntry(Entry *E) {
Current = 0;
@@ -175,7 +176,7 @@ public:
/// moveTo - Move cursor to basic block MBBNum.
void moveToBlock(unsigned MBBNum) {
- Current = CacheEntry->get(MBBNum);
+ Current = CacheEntry ? CacheEntry->get(MBBNum) : &NoInterference;
}
/// hasInterference - Return true if the current block has any interference.
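
The NoInterference member introduced above is an instance of the null-object pattern: rather than make every caller of moveToBlock test CacheEntry for null, a cursor without a cache entry points at one shared, permanently empty BlockInterference. A self-contained sketch of the pattern (all names hypothetical, not the real InterferenceCache types):

    #include <map>

    struct BlockInterference { /* empty means no interference */ };

    struct Entry {
      std::map<unsigned, BlockInterference> Blocks;
      BlockInterference *get(unsigned N) { return &Blocks[N]; }
    };

    class Cursor {
      Entry *CacheEntry = nullptr;
      BlockInterference *Current = nullptr;
      static BlockInterference NoInterference;  // shared empty sentinel
    public:
      void moveToBlock(unsigned MBBNum) {
        // A null cursor quietly reports no interference instead of
        // dereferencing a null cache entry.
        Current = CacheEntry ? CacheEntry->get(MBBNum) : &NoInterference;
      }
    };

    // The sentinel needs exactly one definition, in a .cpp file.
    BlockInterference Cursor::NoInterference;
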
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index 611886f..0f92c2d 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -27,7 +27,7 @@ using namespace llvm;
template <class ArgIt>
static void EnsureFunctionExists(Module &M, const char *Name,
ArgIt ArgBegin, ArgIt ArgEnd,
- const Type *RetTy) {
+ Type *RetTy) {
// Insert a correctly-typed definition now.
std::vector<Type *> ParamTys;
for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
@@ -64,7 +64,7 @@ static void EnsureFPIntrinsicsExist(Module &M, Function *Fn,
template <class ArgIt>
static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI,
ArgIt ArgBegin, ArgIt ArgEnd,
- const Type *RetTy) {
+ Type *RetTy) {
// If we haven't already looked up this function, check to see if the
// program already contains a function with this name.
Module *M = CI->getParent()->getParent()->getParent();
@@ -462,7 +462,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
break; // Strip out annotate intrinsic
case Intrinsic::memcpy: {
- const IntegerType *IntPtr = TD.getIntPtrType(Context);
+ IntegerType *IntPtr = TD.getIntPtrType(Context);
Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
/* isSigned */ false);
Value *Ops[3];
@@ -473,7 +473,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
break;
}
case Intrinsic::memmove: {
- const IntegerType *IntPtr = TD.getIntPtrType(Context);
+ IntegerType *IntPtr = TD.getIntPtrType(Context);
Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
/* isSigned */ false);
Value *Ops[3];
@@ -484,7 +484,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
break;
}
case Intrinsic::memset: {
- const IntegerType *IntPtr = TD.getIntPtrType(Context);
+ IntegerType *IntPtr = TD.getIntPtrType(Context);
Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
/* isSigned */ false);
Value *Ops[3];
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index f985af8..80ecc22 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -27,16 +27,18 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
namespace llvm {
@@ -55,8 +57,12 @@ static cl::opt<bool> DisableCodePlace("disable-code-place", cl::Hidden,
cl::desc("Disable code placement"));
static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
cl::desc("Disable Stack Slot Coloring"));
+static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
+ cl::desc("Disable Machine Dead Code Elimination"));
static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
+ cl::desc("Disable Machine Common Subexpression Elimination"));
static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
cl::Hidden,
cl::desc("Disable Machine LICM"));
@@ -103,20 +109,17 @@ EnableFastISelOption("fast-isel", cl::Hidden,
cl::desc("Enable the \"fast\" instruction selector"));
LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
- StringRef CPU, StringRef FS)
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
: TargetMachine(T, Triple, CPU, FS) {
+ CodeGenInfo = T.createMCCodeGenInfo(Triple, RM, CM);
AsmInfo = T.createMCAsmInfo(Triple);
-}
-
-// Set the default code model for the JIT for a generic target.
-// FIXME: Is small right here? or .is64Bit() ? Large : Small?
-void LLVMTargetMachine::setCodeModelForJIT() {
- setCodeModel(CodeModel::Small);
-}
-
-// Set the default code model for static compilation for a generic target.
-void LLVMTargetMachine::setCodeModelForStatic() {
- setCodeModel(CodeModel::Small);
+ // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
+ // and if the old one gets included then MCAsmInfo will be NULL and
+ // we'll crash later.
+ // Provide the user with a useful error message about what's wrong.
+ assert(AsmInfo && "MCAsmInfo not initialized. "
+ "Make sure you include the correct TargetSelect.h!");
}
bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
@@ -134,21 +137,22 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
Context->setAllowTemporaryLabels(false);
const MCAsmInfo &MAI = *getMCAsmInfo();
+ const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
OwningPtr<MCStreamer> AsmStreamer;
switch (FileType) {
default: return true;
case CGFT_AssemblyFile: {
MCInstPrinter *InstPrinter =
- getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI);
+ getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI, STI);
// Create a code emitter if asked to show the encoding.
MCCodeEmitter *MCE = 0;
- TargetAsmBackend *TAB = 0;
+ MCAsmBackend *MAB = 0;
if (ShowMCEncoding) {
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
- MCE = getTarget().createCodeEmitter(*getInstrInfo(), STI, *Context);
- TAB = getTarget().createAsmBackend(getTargetTriple());
+ MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI, *Context);
+ MAB = getTarget().createMCAsmBackend(getTargetTriple());
}
MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
@@ -156,7 +160,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
hasMCUseLoc(),
hasMCUseCFI(),
InstPrinter,
- MCE, TAB,
+ MCE, MAB,
ShowMCInst);
AsmStreamer.reset(S);
break;
@@ -164,17 +168,16 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
case CGFT_ObjectFile: {
// Create the code emitter for the target if it exists. If not, .o file
// emission fails.
- const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
- MCCodeEmitter *MCE = getTarget().createCodeEmitter(*getInstrInfo(), STI,
- *Context);
- TargetAsmBackend *TAB = getTarget().createAsmBackend(getTargetTriple());
- if (MCE == 0 || TAB == 0)
+ MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), STI,
+ *Context);
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ if (MCE == 0 || MAB == 0)
return true;
- AsmStreamer.reset(getTarget().createObjectStreamer(getTargetTriple(),
- *Context, *TAB, Out, MCE,
- hasMCRelaxAll(),
- hasMCNoExecStack()));
+ AsmStreamer.reset(getTarget().createMCObjectStreamer(getTargetTriple(),
+ *Context, *MAB, Out,
+ MCE, hasMCRelaxAll(),
+ hasMCNoExecStack()));
AsmStreamer.get()->InitSections();
break;
}
@@ -198,8 +201,6 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
PM.add(Printer);
- // Make sure the code model is set.
- setCodeModelForStatic();
PM.add(createGCInfoDeleter());
return false;
}
@@ -214,9 +215,6 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
JITCodeEmitter &JCE,
CodeGenOpt::Level OptLevel,
bool DisableVerify) {
- // Make sure the code model is set.
- setCodeModelForJIT();
-
// Add common CodeGen passes.
MCContext *Ctx = 0;
if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Ctx))
@@ -248,16 +246,16 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
// Create the code emitter for the target if it exists. If not, .o file
// emission fails.
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
- MCCodeEmitter *MCE = getTarget().createCodeEmitter(*getInstrInfo(),STI, *Ctx);
- TargetAsmBackend *TAB = getTarget().createAsmBackend(getTargetTriple());
- if (MCE == 0 || TAB == 0)
+ MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(),STI, *Ctx);
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ if (MCE == 0 || MAB == 0)
return true;
OwningPtr<MCStreamer> AsmStreamer;
- AsmStreamer.reset(getTarget().createObjectStreamer(getTargetTriple(), *Ctx,
- *TAB, Out, MCE,
- hasMCRelaxAll(),
- hasMCNoExecStack()));
+ AsmStreamer.reset(getTarget().createMCObjectStreamer(getTargetTriple(), *Ctx,
+ *MAB, Out, MCE,
+ hasMCRelaxAll(),
+ hasMCNoExecStack()));
AsmStreamer.get()->InitSections();
// Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
@@ -270,9 +268,6 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
PM.add(Printer);
- // Make sure the code model is set.
- setCodeModelForJIT();
-
return false; // success!
}
@@ -369,8 +364,9 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
// Install a MachineModuleInfo class, which is an immutable pass that holds
// all the per-module stuff we're generating, including MCContext.
- TargetAsmInfo *TAI = new TargetAsmInfo(*this);
- MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo(), TAI);
+ MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo(),
+ *getRegisterInfo(),
+ &getTargetLowering()->getObjFileLowering());
PM.add(MMI);
OutContext = &MMI->getContext(); // Return the MCContext specifically by-ref.
@@ -412,12 +408,14 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
// there is one known exception: lowered code for arguments that are only
// used by tail calls, where the tail calls reuse the incoming stack
// arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
- PM.add(createDeadMachineInstructionElimPass());
+ if (!DisableMachineDCE)
+ PM.add(createDeadMachineInstructionElimPass());
printAndVerify(PM, "After codegen DCE pass");
if (!DisableMachineLICM)
PM.add(createMachineLICMPass());
- PM.add(createMachineCSEPass());
+ if (!DisableMachineCSE)
+ PM.add(createMachineCSEPass());
if (!DisableMachineSink)
PM.add(createMachineSinkingPass());
printAndVerify(PM, "After Machine LICM, CSE and Sinking passes");
@@ -452,8 +450,8 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
if (addPostRegAlloc(PM, OptLevel))
printAndVerify(PM, "After PostRegAlloc passes");
- PM.add(createLowerSubregsPass());
- printAndVerify(PM, "After LowerSubregs");
+ PM.add(createExpandPostRAPseudosPass());
+ printAndVerify(PM, "After ExpandPostRAPseudos");
// Insert prolog/epilog code. Eliminate abstract frame index references...
PM.add(createPrologEpilogCodeInserter());
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
new file mode 100644
index 0000000..a12e1a3
--- /dev/null
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -0,0 +1,335 @@
+//===- LexicalScopes.cpp - Collecting lexical scope info ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements LexicalScopes analysis.
+//
+// This pass collects lexical scope information and maps machine instructions
+// to respective lexical scopes.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lexicalscopes"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/Function.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+LexicalScopes::~LexicalScopes() {
+ releaseMemory();
+}
+
+/// releaseMemory - release memory.
+void LexicalScopes::releaseMemory() {
+ MF = NULL;
+ CurrentFnLexicalScope = NULL;
+ DeleteContainerSeconds(LexicalScopeMap);
+ DeleteContainerSeconds(AbstractScopeMap);
+ InlinedLexicalScopeMap.clear();
+ AbstractScopesList.clear();
+}
+
+/// initialize - Scan machine function and construct lexical scope nest.
+void LexicalScopes::initialize(const MachineFunction &Fn) {
+ releaseMemory();
+ MF = &Fn;
+ SmallVector<InsnRange, 4> MIRanges;
+ DenseMap<const MachineInstr *, LexicalScope *> MI2ScopeMap;
+ extractLexicalScopes(MIRanges, MI2ScopeMap);
+ if (CurrentFnLexicalScope) {
+ constructScopeNest(CurrentFnLexicalScope);
+ assignInstructionRanges(MIRanges, MI2ScopeMap);
+ }
+}
+
+/// extractLexicalScopes - Extract instruction ranges for each lexical scope
+/// for the given machine function.
+void LexicalScopes::
+extractLexicalScopes(SmallVectorImpl<InsnRange> &MIRanges,
+ DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
+
+ // Scan each instruction and create scopes. First build working set of scopes.
+ for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+ I != E; ++I) {
+ const MachineInstr *RangeBeginMI = NULL;
+ const MachineInstr *PrevMI = NULL;
+ DebugLoc PrevDL;
+ for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
+ II != IE; ++II) {
+ const MachineInstr *MInsn = II;
+
+ // Check if instruction has valid location information.
+ const DebugLoc MIDL = MInsn->getDebugLoc();
+ if (MIDL.isUnknown()) {
+ PrevMI = MInsn;
+ continue;
+ }
+
+ // If scope has not changed then skip this instruction.
+ if (MIDL == PrevDL) {
+ PrevMI = MInsn;
+ continue;
+ }
+
+ // Ignore DBG_VALUE. It does not contribute to any instruction in output.
+ if (MInsn->isDebugValue())
+ continue;
+
+ if (RangeBeginMI) {
+ // If we have already seen the beginning of an instruction range and the
+ // current instruction's scope does not match the scope of the first
+ // instruction in this range, then create a new instruction range.
+ InsnRange R(RangeBeginMI, PrevMI);
+ MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+ MIRanges.push_back(R);
+ }
+
+ // This is a beginning of a new instruction range.
+ RangeBeginMI = MInsn;
+
+ // Reset previous markers.
+ PrevMI = MInsn;
+ PrevDL = MIDL;
+ }
+
+ // Create last instruction range.
+ if (RangeBeginMI && PrevMI && !PrevDL.isUnknown()) {
+ InsnRange R(RangeBeginMI, PrevMI);
+ MIRanges.push_back(R);
+ MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+ }
+ }
+}
+
+/// findLexicalScope - Find lexical scope, either regular or inlined, for the
+/// given DebugLoc. Return NULL if not found.
+LexicalScope *LexicalScopes::findLexicalScope(DebugLoc DL) {
+ MDNode *Scope = NULL;
+ MDNode *IA = NULL;
+ DL.getScopeAndInlinedAt(Scope, IA, MF->getFunction()->getContext());
+ if (!Scope) return NULL;
+
+ // The scope that we were created with could have an extra file - which
+ // isn't what we care about in this case.
+ DIDescriptor D = DIDescriptor(Scope);
+ if (D.isLexicalBlockFile())
+ Scope = DILexicalBlockFile(Scope).getScope();
+
+ if (IA)
+ return InlinedLexicalScopeMap.lookup(DebugLoc::getFromDILocation(IA));
+ return LexicalScopeMap.lookup(Scope);
+}
+
+/// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If
+/// not available then create new lexical scope.
+LexicalScope *LexicalScopes::getOrCreateLexicalScope(DebugLoc DL) {
+ MDNode *Scope = NULL;
+ MDNode *InlinedAt = NULL;
+ DL.getScopeAndInlinedAt(Scope, InlinedAt, MF->getFunction()->getContext());
+
+ if (InlinedAt) {
+ // Create an abstract scope for inlined function.
+ getOrCreateAbstractScope(Scope);
+ // Create an inlined scope for inlined function.
+ return getOrCreateInlinedScope(Scope, InlinedAt);
+ }
+
+ return getOrCreateRegularScope(Scope);
+}
+
+/// getOrCreateRegularScope - Find or create a regular lexical scope.
+LexicalScope *LexicalScopes::getOrCreateRegularScope(MDNode *Scope) {
+ DIDescriptor D = DIDescriptor(Scope);
+ if (D.isLexicalBlockFile()) {
+ Scope = DILexicalBlockFile(Scope).getScope();
+ D = DIDescriptor(Scope);
+ }
+
+ LexicalScope *WScope = LexicalScopeMap.lookup(Scope);
+ if (WScope)
+ return WScope;
+
+ LexicalScope *Parent = NULL;
+ if (D.isLexicalBlock())
+ Parent = getOrCreateLexicalScope(DebugLoc::getFromDILexicalBlock(Scope));
+ WScope = new LexicalScope(Parent, DIDescriptor(Scope), NULL, false);
+ LexicalScopeMap.insert(std::make_pair(Scope, WScope));
+ if (!Parent && DIDescriptor(Scope).isSubprogram()
+ && DISubprogram(Scope).describes(MF->getFunction()))
+ CurrentFnLexicalScope = WScope;
+
+ return WScope;
+}
+
+/// getOrCreateInlinedScope - Find or create an inlined lexical scope.
+LexicalScope *LexicalScopes::getOrCreateInlinedScope(MDNode *Scope,
+ MDNode *InlinedAt) {
+ LexicalScope *InlinedScope = LexicalScopeMap.lookup(InlinedAt);
+ if (InlinedScope)
+ return InlinedScope;
+
+ DebugLoc InlinedLoc = DebugLoc::getFromDILocation(InlinedAt);
+ InlinedScope = new LexicalScope(getOrCreateLexicalScope(InlinedLoc),
+ DIDescriptor(Scope), InlinedAt, false);
+ InlinedLexicalScopeMap[InlinedLoc] = InlinedScope;
+ LexicalScopeMap[InlinedAt] = InlinedScope;
+ return InlinedScope;
+}
+
+/// getOrCreateAbstractScope - Find or create an abstract lexical scope.
+LexicalScope *LexicalScopes::getOrCreateAbstractScope(const MDNode *N) {
+ assert(N && "Invalid Scope encoding!");
+
+ DIDescriptor Scope(N);
+ if (Scope.isLexicalBlockFile())
+ Scope = DILexicalBlockFile(Scope).getScope();
+ LexicalScope *AScope = AbstractScopeMap.lookup(N);
+ if (AScope)
+ return AScope;
+
+ LexicalScope *Parent = NULL;
+ if (Scope.isLexicalBlock()) {
+ DILexicalBlock DB(N);
+ DIDescriptor ParentDesc = DB.getContext();
+ Parent = getOrCreateAbstractScope(ParentDesc);
+ }
+ AScope = new LexicalScope(Parent, DIDescriptor(N), NULL, true);
+ AbstractScopeMap[N] = AScope;
+ if (DIDescriptor(N).isSubprogram())
+ AbstractScopesList.push_back(AScope);
+ return AScope;
+}
+
+/// constructScopeNest - Assign DFS-in and DFS-out numbers to each scope with
+/// an iterative depth-first walk, so scope dominance becomes an interval test.
+void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
+ assert(Scope && "Unable to calculate scope dominance graph!");
+ SmallVector<LexicalScope *, 4> WorkStack;
+ WorkStack.push_back(Scope);
+ unsigned Counter = 0;
+ while (!WorkStack.empty()) {
+ LexicalScope *WS = WorkStack.back();
+ const SmallVector<LexicalScope *, 4> &Children = WS->getChildren();
+ bool visitedChildren = false;
+ for (SmallVector<LexicalScope *, 4>::const_iterator SI = Children.begin(),
+ SE = Children.end(); SI != SE; ++SI) {
+ LexicalScope *ChildScope = *SI;
+ if (!ChildScope->getDFSOut()) {
+ WorkStack.push_back(ChildScope);
+ visitedChildren = true;
+ ChildScope->setDFSIn(++Counter);
+ break;
+ }
+ }
+ if (!visitedChildren) {
+ WorkStack.pop_back();
+ WS->setDFSOut(++Counter);
+ }
+ }
+}
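
The explicit-stack DFS above exists to stamp every scope with DFSIn/DFSOut numbers; the payoff is that LexicalScope::dominates reduces to a constant-time interval test. A standalone sketch of the same numbering and the nesting check it enables (toy types, not LLVM's):

    #include <cassert>
    #include <vector>

    struct Scope {
      std::vector<Scope*> Children;
      unsigned DFSIn = 0, DFSOut = 0;
    };

    // Same shape as constructScopeNest: a child gets DFSIn when first
    // pushed; a node gets DFSOut once all of its children are finished.
    void number(Scope *Root) {
      std::vector<Scope*> Stack(1, Root);
      unsigned Counter = 0;
      Root->DFSIn = ++Counter;
      while (!Stack.empty()) {
        Scope *S = Stack.back();
        bool Descended = false;
        for (Scope *C : S->Children)
          if (!C->DFSOut) {           // child not finished yet
            C->DFSIn = ++Counter;
            Stack.push_back(C);
            Descended = true;
            break;
          }
        if (!Descended) {
          Stack.pop_back();
          S->DFSOut = ++Counter;
        }
      }
    }

    // A dominates B exactly when B's DFS interval nests inside A's.
    bool dominates(const Scope *A, const Scope *B) {
      return A->DFSIn <= B->DFSIn && B->DFSOut <= A->DFSOut;
    }

    int main() {
      Scope Root, Kid, GrandKid;
      Root.Children.push_back(&Kid);
      Kid.Children.push_back(&GrandKid);
      number(&Root);
      assert(dominates(&Root, &GrandKid));
      assert(!dominates(&GrandKid, &Kid));
    }
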
+
+/// assignInstructionRanges - Find ranges of instructions covered by each
+/// lexical scope.
+void LexicalScopes::
+assignInstructionRanges(SmallVectorImpl<InsnRange> &MIRanges,
+ DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap)
+{
+
+ LexicalScope *PrevLexicalScope = NULL;
+ for (SmallVectorImpl<InsnRange>::const_iterator RI = MIRanges.begin(),
+ RE = MIRanges.end(); RI != RE; ++RI) {
+ const InsnRange &R = *RI;
+ LexicalScope *S = MI2ScopeMap.lookup(R.first);
+ assert (S && "Lost LexicalScope for a machine instruction!");
+ if (PrevLexicalScope && !PrevLexicalScope->dominates(S))
+ PrevLexicalScope->closeInsnRange(S);
+ S->openInsnRange(R.first);
+ S->extendInsnRange(R.second);
+ PrevLexicalScope = S;
+ }
+
+ if (PrevLexicalScope)
+ PrevLexicalScope->closeInsnRange();
+}
+
+/// getMachineBasicBlocks - Populate the given set with the machine basic
+/// blocks that contain machine instructions belonging to the lexical scope
+/// identified by DebugLoc.
+void LexicalScopes::
+getMachineBasicBlocks(DebugLoc DL,
+ SmallPtrSet<const MachineBasicBlock*, 4> &MBBs) {
+ MBBs.clear();
+ LexicalScope *Scope = getOrCreateLexicalScope(DL);
+ if (!Scope)
+ return;
+
+ if (Scope == CurrentFnLexicalScope) {
+ for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
+ I != E; ++I)
+ MBBs.insert(I);
+ return;
+ }
+
+ SmallVector<InsnRange, 4> &InsnRanges = Scope->getRanges();
+ for (SmallVector<InsnRange, 4>::iterator I = InsnRanges.begin(),
+ E = InsnRanges.end(); I != E; ++I) {
+ InsnRange &R = *I;
+ MBBs.insert(R.first->getParent());
+ }
+}
+
+/// dominates - Return true if DebugLoc's lexical scope dominates at least one
+/// machine instruction's lexical scope in a given machine basic block.
+bool LexicalScopes::dominates(DebugLoc DL, MachineBasicBlock *MBB) {
+ LexicalScope *Scope = getOrCreateLexicalScope(DL);
+ if (!Scope)
+ return false;
+
+ // Current function scope covers all basic blocks in the function.
+ if (Scope == CurrentFnLexicalScope && MBB->getParent() == MF)
+ return true;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ DebugLoc IDL = I->getDebugLoc();
+ if (IDL.isUnknown())
+ continue;
+ if (LexicalScope *IScope = getOrCreateLexicalScope(IDL))
+ if (Scope->dominates(IScope))
+ return true;
+ }
+ return false;
+}
+
+/// dump - Print data structures.
+void LexicalScope::dump() const {
+#ifndef NDEBUG
+ raw_ostream &err = dbgs();
+ err.indent(IndentLevel);
+ err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n";
+ const MDNode *N = Desc;
+ N->dump();
+ if (AbstractScope)
+ err << "Abstract Scope\n";
+
+ IndentLevel += 2;
+ if (!Children.empty())
+ err << "Children ...\n";
+ for (unsigned i = 0, e = Children.size(); i != e; ++i)
+ if (Children[i] != this)
+ Children[i]->dump();
+
+ IndentLevel -= 2;
+#endif
+}
+
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 5d38c83..3dfe4c0 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -25,7 +25,10 @@
#include "llvm/Constants.h"
#include "llvm/Metadata.h"
#include "llvm/Value.h"
+#include "llvm/Analysis/DebugInfo.h"
#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LexicalScopes.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -44,6 +47,7 @@ static cl::opt<bool>
EnableLDV("live-debug-variables", cl::init(true),
cl::desc("Enable the live debug variables pass"), cl::Hidden);
+STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted");
char LiveDebugVariables::ID = 0;
INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars",
@@ -67,6 +71,29 @@ LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID), pImpl(0) {
/// LocMap - Map of where a user value is live, and its location.
typedef IntervalMap<SlotIndex, unsigned, 4> LocMap;
+namespace {
+/// UserValueScopes - Keeps track of lexical scopes associated with a
+/// user value's source location.
+class UserValueScopes {
+ DebugLoc DL;
+ LexicalScopes &LS;
+ SmallPtrSet<const MachineBasicBlock *, 4> LBlocks;
+
+public:
+ UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(D), LS(L) {}
+
+ /// dominates - Return true if current scope dominates at least one machine
+ /// instruction in a given machine basic block.
+ bool dominates(MachineBasicBlock *MBB) {
+ if (LBlocks.empty())
+ LS.getMachineBasicBlocks(DL, LBlocks);
+ if (LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB))
+ return true;
+ return false;
+ }
+};
+} // end anonymous namespace
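
UserValueScopes is a small memoization wrapper: the first dominance query computes the potentially expensive block set via LexicalScopes, and every later query for the same user value is answered from the cache (the real class keys off LBlocks.empty() and also falls back to LS.dominates for blocks outside the cached set). The caching shape in isolation, with toy types standing in for the LLVM ones:

    #include <set>

    struct ScopeCache {
      std::set<int> Blocks;  // stand-in for SmallPtrSet<MachineBasicBlock*>
      bool Filled = false;

      // The expensive LexicalScopes walk, stubbed out here.
      std::set<int> computeBlocks() { return {1, 2, 3}; }

      bool dominates(int MBB) {
        if (!Filled) {       // lazy: at most one expensive computation
          Blocks = computeBlocks();
          Filled = true;
        }
        return Blocks.count(MBB) != 0;
      }
    };
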
+
/// UserValue - A user value is a part of a debug info user variable.
///
/// A DBG_VALUE instruction notes that (a sub-register of) a virtual register
@@ -179,6 +206,9 @@ public:
LocMap::iterator I = locInts.find(Idx);
if (!I.valid() || I.start() != Idx)
I.insert(Idx, Idx.getNextSlot(), getLocationNo(LocMO));
+ else
+ // A later DBG_VALUE at the same SlotIndex overrides the old location.
+ I.setValue(getLocationNo(LocMO));
}
/// extendDef - Extend the current definition as far as possible down the
@@ -195,7 +225,8 @@ public:
void extendDef(SlotIndex Idx, unsigned LocNo,
LiveInterval *LI, const VNInfo *VNI,
SmallVectorImpl<SlotIndex> *Kills,
- LiveIntervals &LIS, MachineDominatorTree &MDT);
+ LiveIntervals &LIS, MachineDominatorTree &MDT,
+ UserValueScopes &UVS);
/// addDefsFromCopies - The value in LI/LocNo may be copies to other
/// registers. Determine if any of the copies are available at the kill
@@ -213,7 +244,8 @@ public:
/// computeIntervals - Compute the live intervals of all locations after
/// collecting all their def points.
void computeIntervals(MachineRegisterInfo &MRI,
- LiveIntervals &LIS, MachineDominatorTree &MDT);
+ LiveIntervals &LIS, MachineDominatorTree &MDT,
+ UserValueScopes &UVS);
/// renameRegister - Update locations to rewrite OldReg as NewReg:SubIdx.
void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx,
@@ -236,6 +268,9 @@ public:
/// Only first one needs DebugLoc to identify variable's lexical scope
/// in source file.
DebugLoc findDebugLoc();
+
+ /// getDebugLoc - Return DebugLoc of this UserValue.
+ DebugLoc getDebugLoc() { return dl; }
void print(raw_ostream&, const TargetMachine*);
};
} // namespace
@@ -247,6 +282,7 @@ class LDVImpl {
LocMap::Allocator allocator;
MachineFunction *MF;
LiveIntervals *LIS;
+ LexicalScopes LS;
MachineDominatorTree *MDT;
const TargetRegisterInfo *TRI;
@@ -312,8 +348,10 @@ public:
} // namespace
void UserValue::print(raw_ostream &OS, const TargetMachine *TM) {
- if (const MDString *MDS = dyn_cast<MDString>(variable->getOperand(2)))
- OS << "!\"" << MDS->getString() << "\"\t";
+ DIVariable DV(variable);
+ OS << "!\"";
+ DV.printExtendedName(OS);
+ OS << "\"\t";
if (offset)
OS << '+' << offset;
for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I) {
@@ -447,10 +485,10 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) {
void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
LiveInterval *LI, const VNInfo *VNI,
SmallVectorImpl<SlotIndex> *Kills,
- LiveIntervals &LIS, MachineDominatorTree &MDT) {
+ LiveIntervals &LIS, MachineDominatorTree &MDT,
+ UserValueScopes &UVS) {
SmallVector<SlotIndex, 16> Todo;
Todo.push_back(Idx);
-
do {
SlotIndex Start = Todo.pop_back_val();
MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start);
@@ -497,8 +535,11 @@ void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
continue;
const std::vector<MachineDomTreeNode*> &Children =
MDT.getNode(MBB)->getChildren();
- for (unsigned i = 0, e = Children.size(); i != e; ++i)
- Todo.push_back(LIS.getMBBStartIdx(Children[i]->getBlock()));
+ for (unsigned i = 0, e = Children.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = Children[i]->getBlock();
+ if (UVS.dominates(MBB))
+ Todo.push_back(LIS.getMBBStartIdx(MBB));
+ }
} while (!Todo.empty());
}
@@ -578,7 +619,8 @@ UserValue::addDefsFromCopies(LiveInterval *LI, unsigned LocNo,
void
UserValue::computeIntervals(MachineRegisterInfo &MRI,
LiveIntervals &LIS,
- MachineDominatorTree &MDT) {
+ MachineDominatorTree &MDT,
+ UserValueScopes &UVS) {
SmallVector<std::pair<SlotIndex, unsigned>, 16> Defs;
// Collect all defs to be extended (Skipping undefs).
@@ -597,10 +639,10 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI,
LiveInterval *LI = &LIS.getInterval(Loc.getReg());
const VNInfo *VNI = LI->getVNInfoAt(Idx);
SmallVector<SlotIndex, 16> Kills;
- extendDef(Idx, LocNo, LI, VNI, &Kills, LIS, MDT);
+ extendDef(Idx, LocNo, LI, VNI, &Kills, LIS, MDT, UVS);
addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS);
} else
- extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT);
+ extendDef(Idx, LocNo, 0, 0, 0, LIS, MDT, UVS);
}
// Finally, erase all the undefs.
@@ -613,7 +655,8 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI,
void LDVImpl::computeIntervals() {
for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
- userValues[i]->computeIntervals(MF->getRegInfo(), *LIS, *MDT);
+ UserValueScopes UVS(userValues[i]->getDebugLoc(), LS);
+ userValues[i]->computeIntervals(MF->getRegInfo(), *LIS, *MDT, UVS);
userValues[i]->mapVirtRegs(this);
}
}
@@ -624,6 +667,7 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
MDT = &pass.getAnalysis<MachineDominatorTree>();
TRI = mf.getTarget().getRegisterInfo();
clear();
+ LS.initialize(mf);
DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
<< ((Value*)mf.getFunction())->getName()
<< " **********\n");
@@ -631,6 +675,7 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
bool Changed = collectDebugValues(mf);
computeIntervals();
DEBUG(print(dbgs()));
+ LS.releaseMemory();
return Changed;
}
@@ -891,6 +936,7 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,
const TargetInstrInfo &TII) {
MachineBasicBlock::iterator I = findInsertLocation(MBB, Idx, LIS);
MachineOperand &Loc = locations[LocNo];
+ ++NumInsertedDebugValues;
// Frame index locations may require a target callback.
if (Loc.isFI()) {
@@ -921,7 +967,6 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd);
insertDebugValue(MBB, Start, LocNo, LIS, TII);
-
// This interval may span multiple basic blocks.
// Insert a DBG_VALUE into each one.
while(Stop > MBBEnd) {
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index cfade24..b69945a 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -148,7 +148,6 @@ void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
/// remaining unused values.
void LiveInterval::RenumberValues(LiveIntervals &lis) {
SmallPtrSet<VNInfo*, 8> Seen;
- bool seenPHIDef = false;
valnos.clear();
for (const_iterator I = begin(), E = end(); I != E; ++I) {
VNInfo *VNI = I->valno;
@@ -157,26 +156,6 @@ void LiveInterval::RenumberValues(LiveIntervals &lis) {
assert(!VNI->isUnused() && "Unused valno used by live range");
VNI->id = (unsigned)valnos.size();
valnos.push_back(VNI);
- VNI->setHasPHIKill(false);
- if (VNI->isPHIDef())
- seenPHIDef = true;
- }
-
- // Recompute phi kill flags.
- if (!seenPHIDef)
- return;
- for (const_vni_iterator I = vni_begin(), E = vni_end(); I != E; ++I) {
- VNInfo *VNI = *I;
- if (!VNI->isPHIDef())
- continue;
- const MachineBasicBlock *PHIBB = lis.getMBBFromIndex(VNI->def);
- assert(PHIBB && "No basic block for phi-def");
- for (MachineBasicBlock::const_pred_iterator PI = PHIBB->pred_begin(),
- PE = PHIBB->pred_end(); PI != PE; ++PI) {
- VNInfo *KVNI = getVNInfoAt(lis.getMBBEndIdx(*PI).getPrevSlot());
- if (KVNI)
- KVNI->setHasPHIKill(true);
- }
}
}
@@ -294,20 +273,20 @@ LiveInterval::addRangeFrom(LiveRange LR, iterator From) {
return ranges.insert(it, LR);
}
-/// extendInBlock - If this interval is live before UseIdx in the basic
-/// block that starts at StartIdx, extend it to be live at UseIdx and return
-/// the value. If there is no live range before UseIdx, return NULL.
-VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex UseIdx) {
+/// extendInBlock - If this interval is live before Kill in the basic
+/// block that starts at StartIdx, extend it to be live up to Kill and return
+/// the value. If there is no live range before Kill, return NULL.
+VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
if (empty())
return 0;
- iterator I = std::upper_bound(begin(), end(), UseIdx);
+ iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot());
if (I == begin())
return 0;
--I;
if (I->end <= StartIdx)
return 0;
- if (I->end <= UseIdx)
- extendIntervalEndTo(I, UseIdx.getNextSlot());
+ if (I->end < Kill)
+ extendIntervalEndTo(I, Kill);
return I->valno;
}
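
The change above moves extendInBlock from an inclusive UseIdx to an exclusive Kill slot: the range is extended up to, but not including, Kill, which is why the lookup now uses Kill.getPrevSlot() and the comparison became a strict I->end < Kill. A toy illustration of the half-open convention, with plain integers in place of SlotIndex:

    #include <cassert>

    struct Range { int start, end; };  // half-open: [start, end)

    // Mirror of the new contract: make R live up to Kill (exclusive).
    void extendTo(Range &R, int Kill) {
      if (R.end < Kill)
        R.end = Kill;
    }

    int main() {
      Range R{4, 8};           // live on slots 4..7
      extendTo(R, 12);
      assert(R.end == 12);     // now live on 4..11, not at 12 itself
      extendTo(R, 10);         // no-op: already live past 10
      assert(R.end == 12);
    }
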
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 9257191..b1e202a 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -304,8 +304,19 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
// Make sure the first definition is not a partial redefinition. Add an
// <imp-def> of the full register.
- if (MO.getSubReg())
+ // FIXME: LiveIntervals shouldn't modify the code like this. Whoever
+ // created the machine instruction should annotate it with <undef> flags
+ // as needed. Then we can simply assert here. The REG_SEQUENCE lowering
+ // is the main suspect.
+ if (MO.getSubReg()) {
mi->addRegisterDefined(interval.reg);
+ // Mark all defs of interval.reg on this instruction as reading <undef>.
+ for (unsigned i = MOIdx, e = mi->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO2 = mi->getOperand(i);
+ if (MO2.isReg() && MO2.getReg() == interval.reg && MO2.getSubReg())
+ MO2.setIsUndef();
+ }
+ }
MachineInstr *CopyMI = NULL;
if (mi->isCopyLike()) {
@@ -747,6 +758,9 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// Find all the values used, including PHI kills.
SmallVector<std::pair<SlotIndex, VNInfo*>, 16> WorkList;
+ // Blocks that have already been added to WorkList as live-out.
+ SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+
// Visit all instructions reading li->reg.
for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(li->reg);
MachineInstr *UseMI = I.skipInstruction();) {
@@ -780,8 +794,6 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
VNInfo *VNI = *I;
if (VNI->isUnused())
continue;
- // We may eliminate PHI values, so recompute PHIKill flags.
- VNI->setHasPHIKill(false);
NewLI.addRange(LiveRange(VNI->def, VNI->def.getNextSlot(), VNI));
// A use tied to an early-clobber def ends at the load slot and isn't caught
@@ -804,7 +816,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
SlotIndex BlockStart = getMBBStartIdx(MBB);
// Extend the live range for VNI to be live at Idx.
- if (VNInfo *ExtVNI = NewLI.extendInBlock(BlockStart, Idx)) {
+ if (VNInfo *ExtVNI = NewLI.extendInBlock(BlockStart, Idx.getNextSlot())) {
(void)ExtVNI;
assert(ExtVNI == VNI && "Unexpected existing value number");
// Is this a PHIDef we haven't seen before?
@@ -813,13 +825,12 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// The PHI is live, make sure the predecessors are live-out.
for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
PE = MBB->pred_end(); PI != PE; ++PI) {
+ if (!LiveOut.insert(*PI))
+ continue;
SlotIndex Stop = getMBBEndIdx(*PI).getPrevSlot();
- VNInfo *PVNI = li->getVNInfoAt(Stop);
// A predecessor is not required to have a live-out value for a PHI.
- if (PVNI) {
- PVNI->setHasPHIKill(true);
+ if (VNInfo *PVNI = li->getVNInfoAt(Stop))
WorkList.push_back(std::make_pair(Stop, PVNI));
- }
}
continue;
}
@@ -831,6 +842,8 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
// Make sure VNI is live-out from the predecessors.
for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
PE = MBB->pred_end(); PI != PE; ++PI) {
+ if (!LiveOut.insert(*PI))
+ continue;
SlotIndex Stop = getMBBEndIdx(*PI).getPrevSlot();
assert(li->getVNInfoAt(Stop) == VNI && "Wrong value out of predecessor");
WorkList.push_back(std::make_pair(Stop, VNI));
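
The LiveOut set threaded through both predecessor loops above is a standard visited set: each block is enqueued as live-out at most once, so the worklist loop stays bounded even when many uses funnel into the same predecessors. The shape in miniature, with ints standing in for basic blocks:

    #include <set>
    #include <vector>

    // insert().second is the same "first time seen" test that
    // `if (!LiveOut.insert(*PI)) continue;` performs above.
    void walkLiveOut(int Start, const std::vector<std::vector<int>> &Preds) {
      std::set<int> LiveOut;
      std::vector<int> WorkList(1, Start);
      while (!WorkList.empty()) {
        int MBB = WorkList.back();
        WorkList.pop_back();
        for (int Pred : Preds[MBB])
          if (LiveOut.insert(Pred).second)  // enqueue each block once
            WorkList.push_back(Pred);
      }
    }
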
diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp
index 70003e7..110fe1e 100644
--- a/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/lib/CodeGen/LiveIntervalUnion.cpp
@@ -91,25 +91,6 @@ LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
OS << '\n';
}
-void LiveIntervalUnion::InterferenceResult::print(raw_ostream &OS,
- const TargetRegisterInfo *TRI) const {
- OS << '[' << start() << ';' << stop() << "):"
- << PrintReg(interference()->reg, TRI);
-}
-
-void LiveIntervalUnion::Query::print(raw_ostream &OS,
- const TargetRegisterInfo *TRI) {
- OS << "Interferences with ";
- LiveUnion->print(OS, TRI);
- InterferenceResult IR = firstInterference();
- while (isInterference(IR)) {
- OS << " ";
- IR.print(OS, TRI);
- OS << '\n';
- nextInterference(IR);
- }
-}
-
#ifndef NDEBUG
// Verify the live intervals in this union and add them to the visited set.
void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
@@ -118,114 +99,6 @@ void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
}
#endif //!NDEBUG
-// Private interface accessed by Query.
-//
-// Find a pair of segments that intersect, one in the live virtual register
-// (LiveInterval), and the other in this LiveIntervalUnion. The caller (Query)
-// is responsible for advancing the LiveIntervalUnion segments to find a
-// "notable" intersection, which requires query-specific logic.
-//
-// This design assumes only a fast mechanism for intersecting a single live
-// virtual register segment with a set of LiveIntervalUnion segments. This may
-// be ok since most virtual registers have very few segments. If we had a data
-// structure that optimizd MxN intersection of segments, then we would bypass
-// the loop that advances within the LiveInterval.
-//
-// If no intersection exists, set VirtRegI = VirtRegEnd, and set SI to the first
-// segment whose start point is greater than LiveInterval's end point.
-//
-// Assumes that segments are sorted by start position in both
-// LiveInterval and LiveSegments.
-void LiveIntervalUnion::Query::findIntersection(InterferenceResult &IR) const {
- // Search until reaching the end of the LiveUnion segments.
- LiveInterval::iterator VirtRegEnd = VirtReg->end();
- if (IR.VirtRegI == VirtRegEnd)
- return;
- while (IR.LiveUnionI.valid()) {
- // Slowly advance the live virtual reg iterator until we surpass the next
- // segment in LiveUnion.
- //
- // Note: If this is ever used for coalescing of fixed registers and we have
- // a live vreg with thousands of segments, then change this code to use
- // upperBound instead.
- IR.VirtRegI = VirtReg->advanceTo(IR.VirtRegI, IR.LiveUnionI.start());
- if (IR.VirtRegI == VirtRegEnd)
- break; // Retain current (nonoverlapping) LiveUnionI
-
- // VirtRegI may have advanced far beyond LiveUnionI, catch up.
- IR.LiveUnionI.advanceTo(IR.VirtRegI->start);
-
- // Check if no LiveUnionI exists with VirtRegI->Start < LiveUnionI.end
- if (!IR.LiveUnionI.valid())
- break;
- if (IR.LiveUnionI.start() < IR.VirtRegI->end) {
- assert(overlap(*IR.VirtRegI, IR.LiveUnionI) &&
- "upperBound postcondition");
- break;
- }
- }
- if (!IR.LiveUnionI.valid())
- IR.VirtRegI = VirtRegEnd;
-}
-
-// Find the first intersection, and cache interference info
-// (retain segment iterators into both VirtReg and LiveUnion).
-const LiveIntervalUnion::InterferenceResult &
-LiveIntervalUnion::Query::firstInterference() {
- if (CheckedFirstInterference)
- return FirstInterference;
- CheckedFirstInterference = true;
- InterferenceResult &IR = FirstInterference;
- IR.LiveUnionI.setMap(LiveUnion->getMap());
-
- // Quickly skip interference check for empty sets.
- if (VirtReg->empty() || LiveUnion->empty()) {
- IR.VirtRegI = VirtReg->end();
- } else if (VirtReg->beginIndex() < LiveUnion->startIndex()) {
- // VirtReg starts first, perform double binary search.
- IR.VirtRegI = VirtReg->find(LiveUnion->startIndex());
- if (IR.VirtRegI != VirtReg->end())
- IR.LiveUnionI.find(IR.VirtRegI->start);
- } else {
- // LiveUnion starts first, perform double binary search.
- IR.LiveUnionI.find(VirtReg->beginIndex());
- if (IR.LiveUnionI.valid())
- IR.VirtRegI = VirtReg->find(IR.LiveUnionI.start());
- else
- IR.VirtRegI = VirtReg->end();
- }
- findIntersection(FirstInterference);
- assert((IR.VirtRegI == VirtReg->end() || IR.LiveUnionI.valid())
- && "Uninitialized iterator");
- return FirstInterference;
-}
-
-// Treat the result as an iterator and advance to the next interfering pair
-// of segments. This is a plain iterator with no filter.
-bool LiveIntervalUnion::Query::nextInterference(InterferenceResult &IR) const {
- assert(isInterference(IR) && "iteration past end of interferences");
-
- // Advance either the VirtReg or LiveUnion segment to ensure that we visit all
- // unique overlapping pairs.
- if (IR.VirtRegI->end < IR.LiveUnionI.stop()) {
- if (++IR.VirtRegI == VirtReg->end())
- return false;
- }
- else {
- if (!(++IR.LiveUnionI).valid()) {
- IR.VirtRegI = VirtReg->end();
- return false;
- }
- }
- // Short-circuit findIntersection() if possible.
- if (overlap(*IR.VirtRegI, IR.LiveUnionI))
- return true;
-
- // Find the next intersection.
- findIntersection(IR);
- return isInterference(IR);
-}
-
// Scan the vector of interfering virtual registers in this union. Assume it's
// quite small.
bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
@@ -234,64 +107,75 @@ bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
return I != InterferingVRegs.end();
}
-// Count the number of virtual registers in this union that interfere with this
+// Collect virtual registers in this union that interfere with this
// query's live virtual register.
//
-// The number of times that we either advance IR.VirtRegI or call
-// LiveUnion.upperBound() will be no more than the number of holes in
-// VirtReg. So each invocation of collectInterferingVRegs() takes
-// time proportional to |VirtReg Holes| * time(LiveUnion.upperBound()).
+// The query state is one of:
+//
+// 1. CheckedFirstInterference == false: Iterators are uninitialized.
+// 2. SeenAllInterferences == true: InterferingVRegs complete, iterators unused.
+// 3. Iterators left at the last seen intersection.
//
-// For comments on how to speed it up, see Query::findIntersection().
unsigned LiveIntervalUnion::Query::
collectInterferingVRegs(unsigned MaxInterferingRegs) {
- InterferenceResult IR = firstInterference();
- LiveInterval::iterator VirtRegEnd = VirtReg->end();
- LiveInterval *RecentInterferingVReg = NULL;
- if (IR.VirtRegI != VirtRegEnd) while (IR.LiveUnionI.valid()) {
- // Advance the union's iterator to reach an unseen interfering vreg.
- do {
- if (IR.LiveUnionI.value() == RecentInterferingVReg)
- continue;
+ // Fast path return if we already have the desired information.
+ if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
+
+ // Set up iterators on the first call.
+ if (!CheckedFirstInterference) {
+ CheckedFirstInterference = true;
+
+ // Quickly skip interference check for empty sets.
+ if (VirtReg->empty() || LiveUnion->empty()) {
+ SeenAllInterferences = true;
+ return 0;
+ }
- if (!isSeenInterference(IR.LiveUnionI.value()))
- break;
+ // In most cases, the union will start before VirtReg.
+ VirtRegI = VirtReg->begin();
+ LiveUnionI.setMap(LiveUnion->getMap());
+ LiveUnionI.find(VirtRegI->start);
+ }
- // Cache the most recent interfering vreg to bypass isSeenInterference.
- RecentInterferingVReg = IR.LiveUnionI.value();
+ LiveInterval::iterator VirtRegEnd = VirtReg->end();
+ LiveInterval *RecentReg = 0;
+ while (LiveUnionI.valid()) {
+ assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg");
+
+ // Check for overlapping interference.
+ while (VirtRegI->start < LiveUnionI.stop() &&
+ VirtRegI->end > LiveUnionI.start()) {
+ // This is an overlap, record the interfering register.
+ LiveInterval *VReg = LiveUnionI.value();
+ if (VReg != RecentReg && !isSeenInterference(VReg)) {
+ RecentReg = VReg;
+ InterferingVRegs.push_back(VReg);
+ if (InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
+ }
+ // This LiveUnion segment is no longer interesting.
+ if (!(++LiveUnionI).valid()) {
+ SeenAllInterferences = true;
+ return InterferingVRegs.size();
+ }
+ }
- } while ((++IR.LiveUnionI).valid());
- if (!IR.LiveUnionI.valid())
- break;
+ // The iterators no longer overlap; LiveUnionI has been advanced
+ // beyond VirtRegI.
+ assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap");
- // Advance the VirtReg iterator until surpassing the next segment in
- // LiveUnion.
- IR.VirtRegI = VirtReg->advanceTo(IR.VirtRegI, IR.LiveUnionI.start());
- if (IR.VirtRegI == VirtRegEnd)
+ // Advance the iterator that ends first.
+ VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start());
+ if (VirtRegI == VirtRegEnd)
break;
- // Check for intersection with the union's segment.
- if (overlap(*IR.VirtRegI, IR.LiveUnionI)) {
-
- if (!IR.LiveUnionI.value()->isSpillable())
- SeenUnspillableVReg = true;
-
- if (InterferingVRegs.size() == MaxInterferingRegs)
- // Leave SeenAllInterferences set to false to indicate that at least one
- // interference exists beyond those we collected.
- return MaxInterferingRegs;
-
- InterferingVRegs.push_back(IR.LiveUnionI.value());
-
- // Cache the most recent interfering vreg to bypass isSeenInterference.
- RecentInterferingVReg = IR.LiveUnionI.value();
- ++IR.LiveUnionI;
-
+ // Detect overlap; it is handled at the top of the loop.
+ if (VirtRegI->start < LiveUnionI.stop())
continue;
- }
- // VirtRegI may have advanced far beyond LiveUnionI,
- // do a fast intersection test to "catch up"
- IR.LiveUnionI.advanceTo(IR.VirtRegI->start);
+
+ // Still not overlapping. Catch up LiveUnionI.
+ LiveUnionI.advanceTo(VirtRegI->start);
}
SeenAllInterferences = true;
return InterferingVRegs.size();
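
The rewritten loop above is a two-pointer sweep over two sorted segment lists: report each overlapping pair, then advance whichever iterator's segment ends first. The core of that scan, extracted into a self-contained form:

    #include <vector>

    struct Seg { int start, end; };  // half-open, sorted, non-overlapping

    template <typename F>
    void forEachOverlap(const std::vector<Seg> &A,
                        const std::vector<Seg> &B, F Report) {
      size_t i = 0, j = 0;
      while (i < A.size() && j < B.size()) {
        // Same overlap test as VirtRegI->start < LiveUnionI.stop() &&
        // VirtRegI->end > LiveUnionI.start() above.
        if (A[i].start < B[j].end && B[j].start < A[i].end)
          Report(A[i], B[j]);
        // Advance whichever segment ends first; the other one may still
        // overlap later segments on the opposite side.
        if (A[i].end < B[j].end)
          ++i;
        else
          ++j;
      }
    }
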
diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h
index 5e78d5e..5d64d28 100644
--- a/lib/CodeGen/LiveIntervalUnion.h
+++ b/lib/CodeGen/LiveIntervalUnion.h
@@ -59,7 +59,6 @@ public:
// LiveIntervalUnions share an external allocator.
typedef LiveSegments::Allocator Allocator;
- class InterferenceResult;
class Query;
private:
@@ -106,62 +105,13 @@ public:
void verify(LiveVirtRegBitSet& VisitedVRegs);
#endif
- /// Cache a single interference test result in the form of two intersecting
- /// segments. This allows efficiently iterating over the interferences. The
- /// iteration logic is handled by LiveIntervalUnion::Query which may
- /// filter interferences depending on the type of query.
- class InterferenceResult {
- friend class Query;
-
- LiveInterval::iterator VirtRegI; // current position in VirtReg
- SegmentIter LiveUnionI; // current position in LiveUnion
-
- // Internal ctor.
- InterferenceResult(LiveInterval::iterator VRegI, SegmentIter UnionI)
- : VirtRegI(VRegI), LiveUnionI(UnionI) {}
-
- public:
- // Public default ctor.
- InterferenceResult(): VirtRegI(), LiveUnionI() {}
-
- /// start - Return the start of the current overlap.
- SlotIndex start() const {
- return std::max(VirtRegI->start, LiveUnionI.start());
- }
-
- /// stop - Return the end of the current overlap.
- SlotIndex stop() const {
- return std::min(VirtRegI->end, LiveUnionI.stop());
- }
-
- /// interference - Return the register that is interfering here.
- LiveInterval *interference() const { return LiveUnionI.value(); }
-
- // Note: this interface provides raw access to the iterators because the
- // result has no way to tell if it's valid to dereference them.
-
- // Access the VirtReg segment.
- LiveInterval::iterator virtRegPos() const { return VirtRegI; }
-
- // Access the LiveUnion segment.
- const SegmentIter &liveUnionPos() const { return LiveUnionI; }
-
- bool operator==(const InterferenceResult &IR) const {
- return VirtRegI == IR.VirtRegI && LiveUnionI == IR.LiveUnionI;
- }
- bool operator!=(const InterferenceResult &IR) const {
- return !operator==(IR);
- }
-
- void print(raw_ostream &OS, const TargetRegisterInfo *TRI) const;
- };
-
/// Query interferences between a single live virtual register and a live
/// interval union.
class Query {
LiveIntervalUnion *LiveUnion;
LiveInterval *VirtReg;
- InterferenceResult FirstInterference;
+ LiveInterval::iterator VirtRegI; // current position in VirtReg
+ SegmentIter LiveUnionI; // current position in LiveUnion
SmallVector<LiveInterval*,4> InterferingVRegs;
bool CheckedFirstInterference;
bool SeenAllInterferences;
@@ -206,26 +156,8 @@ public:
return *VirtReg;
}
- bool isInterference(const InterferenceResult &IR) const {
- if (IR.VirtRegI != VirtReg->end()) {
- assert(overlap(*IR.VirtRegI, IR.LiveUnionI) &&
- "invalid segment iterators");
- return true;
- }
- return false;
- }
-
// Does this live virtual register interfere with the union?
- bool checkInterference() { return isInterference(firstInterference()); }
-
- // Get the first pair of interfering segments, or a noninterfering result.
- // This initializes the firstInterference_ cache.
- const InterferenceResult &firstInterference();
-
- // Treat the result as an iterator and advance to the next interfering pair
- // of segments. Visiting each unique interfering pairs means that the same
- // VirtReg or LiveUnion segment may be visited multiple times.
- bool nextInterference(InterferenceResult &IR) const;
+ bool checkInterference() { return collectInterferingVRegs(1); }
// Count the virtual registers in this union that interfere with this
// query's live virtual register, up to maxInterferingRegs.
@@ -249,13 +181,9 @@ public:
/// Loop.
bool checkLoopInterference(MachineLoopRange*);
- void print(raw_ostream &OS, const TargetRegisterInfo *TRI);
private:
Query(const Query&); // DO NOT IMPLEMENT
void operator=(const Query&); // DO NOT IMPLEMENT
-
- // Private interface for queries
- void findIntersection(InterferenceResult &IR) const;
};
};
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
new file mode 100644
index 0000000..a7d5af5
--- /dev/null
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -0,0 +1,270 @@
+//===---- LiveRangeCalc.cpp - Calculate live ranges -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the LiveRangeCalc class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "regalloc"
+#include "LiveRangeCalc.h"
+#include "llvm/CodeGen/MachineDominators.h"
+
+using namespace llvm;
+
+void LiveRangeCalc::reset(const MachineFunction *MF) {
+ unsigned N = MF->getNumBlockIDs();
+ Seen.clear();
+ Seen.resize(N);
+ LiveOut.resize(N);
+ LiveIn.clear();
+}
+
+
+// Transfer information from the LiveIn vector to the live ranges.
+void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI, SlotIndexes *Indexes) {
+ for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
+ E = LiveIn.end(); I != E; ++I) {
+ if (!I->DomNode)
+ continue;
+ MachineBasicBlock *MBB = I->DomNode->getBlock();
+
+ VNInfo *VNI = OverrideVNI ? OverrideVNI : I->Value;
+ assert(VNI && "No live-in value found");
+
+ SlotIndex Start, End;
+ tie(Start, End) = Indexes->getMBBRange(MBB);
+
+ if (I->Kill.isValid())
+ I->LI->addRange(LiveRange(Start, I->Kill, VNI));
+ else {
+ I->LI->addRange(LiveRange(Start, End, VNI));
+ // The value is live-through; update LiveOut as well. Defer the Domtree
+ // lookup until it is needed.
+ assert(Seen.test(MBB->getNumber()));
+ LiveOut[MBB] = LiveOutPair(VNI, (MachineDomTreeNode *)0);
+ }
+ }
+ LiveIn.clear();
+}
+
+
+void LiveRangeCalc::extend(LiveInterval *LI,
+ SlotIndex Kill,
+ SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree,
+ VNInfo::Allocator *Alloc) {
+ assert(LI && "Missing live range");
+ assert(Kill.isValid() && "Invalid SlotIndex");
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+
+ MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill.getPrevSlot());
+ assert(KillMBB && "No MBB at Kill");
+
+ // Is there a def in the same MBB we can extend?
+ if (LI->extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
+ return;
+
+ // Find the single reaching def, or determine that Kill is jointly dominated
+ // by multiple values, in which case we may need to create phi-defs to
+ // preserve VNInfo SSA form. Perform a search for all predecessor blocks
+ // where we know the dominating VNInfo.
+ VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, Indexes, DomTree);
+
+ // When there were multiple different values, we may need new PHIs.
+ if (!VNI)
+ updateSSA(Indexes, DomTree, Alloc);
+
+ updateLiveIns(VNI, Indexes);
+}
+
+
+// This function is called by a client after using the low-level API to add
+// live-out and live-in blocks. The unique value optimization is not
+// available; SplitEditor::transferValues handles that case directly anyway.
+void LiveRangeCalc::calculateValues(SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree,
+ VNInfo::Allocator *Alloc) {
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+ updateSSA(Indexes, DomTree, Alloc);
+ updateLiveIns(0, Indexes);
+}
+
+
+VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI,
+ MachineBasicBlock *KillMBB,
+ SlotIndex Kill,
+ SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree) {
+ // Blocks where LI should be live-in.
+ SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB);
+
+ // Remember if we have seen more than one value.
+ bool UniqueVNI = true;
+ VNInfo *TheVNI = 0;
+
+ // Using Seen as a visited set, perform a BFS for all reaching defs.
+ for (unsigned i = 0; i != WorkList.size(); ++i) {
+ MachineBasicBlock *MBB = WorkList[i];
+ assert(!MBB->pred_empty() && "Value live-in to entry block?");
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock *Pred = *PI;
+
+ // Is this a known live-out block?
+ if (Seen.test(Pred->getNumber())) {
+ if (VNInfo *VNI = LiveOut[Pred].first) {
+ if (TheVNI && TheVNI != VNI)
+ UniqueVNI = false;
+ TheVNI = VNI;
+ }
+ continue;
+ }
+
+ SlotIndex Start, End;
+ tie(Start, End) = Indexes->getMBBRange(Pred);
+
+ // First time we see Pred. Try to determine the live-out value, but set
+ // it to null if Pred is live-through with an unknown value.
+ VNInfo *VNI = LI->extendInBlock(Start, End);
+ setLiveOutValue(Pred, VNI);
+ if (VNI) {
+ if (TheVNI && TheVNI != VNI)
+ UniqueVNI = false;
+ TheVNI = VNI;
+ continue;
+ }
+
+ // No, we need a live-in value for Pred as well
+ if (Pred != KillMBB)
+ WorkList.push_back(Pred);
+ else
+ // Loop back to KillMBB, so the value is really live-through.
+ Kill = SlotIndex();
+ }
+ }
+
+ // Transfer WorkList to LiveInBlocks in reverse order.
+ // This ordering works best with updateSSA().
+ LiveIn.clear();
+ LiveIn.reserve(WorkList.size());
+ while (!WorkList.empty())
+ addLiveInBlock(LI, DomTree->getNode(WorkList.pop_back_val()));
+
+ // The kill block may not be live-through.
+ assert(LiveIn.back().DomNode->getBlock() == KillMBB);
+ LiveIn.back().Kill = Kill;
+
+ return UniqueVNI ? TheVNI : 0;
+}
+
+
+// This is essentially the same iterative algorithm that SSAUpdater uses,
+// except we already have a dominator tree, so we don't have to recompute it.
+void LiveRangeCalc::updateSSA(SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree,
+ VNInfo::Allocator *Alloc) {
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+
+ // Iterate until convergence.
+ unsigned Changes;
+ do {
+ Changes = 0;
+ // Propagate live-out values down the dominator tree, inserting phi-defs
+ // when necessary.
+ for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
+ E = LiveIn.end(); I != E; ++I) {
+ MachineDomTreeNode *Node = I->DomNode;
+ // Skip block if the live-in value has already been determined.
+ if (!Node)
+ continue;
+ MachineBasicBlock *MBB = Node->getBlock();
+ MachineDomTreeNode *IDom = Node->getIDom();
+ LiveOutPair IDomValue;
+
+ // Do we need a live-in value for a block with no immediate dominator?
+ // If so, it is probably an unreachable block that has survived somehow.
+ bool needPHI = !IDom || !Seen.test(IDom->getBlock()->getNumber());
+
+ // IDom dominates all of our predecessors, but it may not be their
+ // immediate dominator. Check if any of them have live-out values that are
+ // properly dominated by IDom. If so, we need a phi-def here.
+ if (!needPHI) {
+ IDomValue = LiveOut[IDom->getBlock()];
+
+ // Cache the DomTree node that defined the value.
+ if (IDomValue.first && !IDomValue.second)
+ LiveOut[IDom->getBlock()].second = IDomValue.second =
+ DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def));
+
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ LiveOutPair &Value = LiveOut[*PI];
+ if (!Value.first || Value.first == IDomValue.first)
+ continue;
+
+ // Cache the DomTree node that defined the value.
+ if (!Value.second)
+ Value.second =
+ DomTree->getNode(Indexes->getMBBFromIndex(Value.first->def));
+
+ // This predecessor is carrying something other than IDomValue.
+ // It could be because IDomValue hasn't propagated yet, or it could be
+ // because MBB is in the dominance frontier of that value.
+ if (DomTree->dominates(IDom, Value.second)) {
+ needPHI = true;
+ break;
+ }
+ }
+ }
+
+ // The value may be live-through even if Kill is set, as can happen when
+ // we are called from extend(). In that case the Seen bit is set, and
+ // LiveOut indicates a foreign or missing value.
+ LiveOutPair &LOP = LiveOut[MBB];
+
+ // Create a phi-def if required.
+ if (needPHI) {
+ ++Changes;
+ assert(Alloc && "Need VNInfo allocator to create PHI-defs");
+ SlotIndex Start, End;
+ tie(Start, End) = Indexes->getMBBRange(MBB);
+ VNInfo *VNI = I->LI->getNextValue(Start, 0, *Alloc);
+ VNI->setIsPHIDef(true);
+ I->Value = VNI;
+ // This block is done, we know the final value.
+ I->DomNode = 0;
+
+ // Add liveness since updateLiveIns now skips this node.
+ if (I->Kill.isValid())
+ I->LI->addRange(LiveRange(Start, I->Kill, VNI));
+ else {
+ I->LI->addRange(LiveRange(Start, End, VNI));
+ LOP = LiveOutPair(VNI, Node);
+ }
+ } else if (IDomValue.first) {
+ // No phi-def here. Remember incoming value.
+ I->Value = IDomValue.first;
+
+ // If the IDomValue is killed in the block, don't propagate through.
+ if (I->Kill.isValid())
+ continue;
+
+ // Propagate IDomValue if it isn't killed:
+ // MBB is live-out and doesn't define its own value.
+ if (LOP.first == IDomValue.first)
+ continue;
+ ++Changes;
+ LOP = IDomValue;
+ }
+ }
+ } while (Changes);
+}
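
The extend() entry point above strings the three phases together:
LiveInterval::extendInBlock, findReachingDefs(), and the updateSSA() fixed
point. A hedged usage sketch follows; every object is assumed to be owned by
the caller (e.g. a live interval analysis), and the wrapper itself is
illustrative, not part of the patch.

  // Make LI live up to Kill, inserting PHI-defs if several values reach it.
  void extendToKill(LiveRangeCalc &LRCalc, LiveInterval *LI, SlotIndex Kill,
                    SlotIndexes *Indexes, MachineDominatorTree *DomTree,
                    VNInfo::Allocator *Alloc) {
    // Per the header docs, Alloc may be null when Kill is known to be
    // dominated by a single existing value; new PHI-defs need the allocator.
    LRCalc.extend(LI, Kill, Indexes, DomTree, Alloc);
  }
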
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
new file mode 100644
index 0000000..b8c8585
--- /dev/null
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -0,0 +1,226 @@
+//===---- LiveRangeCalc.h - Calculate live ranges ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeCalc class can be used to compute live ranges from scratch. It
+// caches information about values in the CFG to speed up repeated operations
+// on the same live range. The cache can be shared by non-overlapping live
+// ranges. SplitKit uses that when computing the live range of split products.
+//
+// A low-level interface is available to clients that know where a variable is
+// live, but don't know which value it has at every point. LiveRangeCalc will
+// propagate values down the dominator tree, and even insert PHI-defs where
+// needed. SplitKit uses this faster interface when possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_LIVERANGECALC_H
+#define LLVM_CODEGEN_LIVERANGECALC_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+
+namespace llvm {
+
+/// Forward declarations for MachineDominators.h:
+class MachineDominatorTree;
+template <class NodeT> class DomTreeNodeBase;
+typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode;
+
+class LiveRangeCalc {
+ /// Seen - Bit vector of active entries in LiveOut, also used as a visited
+ /// set by findReachingDefs. One entry per basic block, indexed by block
+ /// number. This is kept as a separate bit vector because it can be cleared
+ /// quickly when switching live ranges.
+ BitVector Seen;
+
+ /// LiveOutPair - A value and the block that defined it. The domtree node is
+ /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)].
+ typedef std::pair<VNInfo*, MachineDomTreeNode*> LiveOutPair;
+
+ /// LiveOutMap - Map basic blocks to the value leaving the block.
+ typedef IndexedMap<LiveOutPair, MBB2NumberFunctor> LiveOutMap;
+
+ /// LiveOut - Map each basic block where a live range is live out to the
+ /// live-out value and its defining block.
+ ///
+ /// For every basic block, MBB, one of these conditions shall be true:
+ ///
+ /// 1. !Seen.test(MBB->getNumber())
+ /// Blocks without a Seen bit are ignored.
+ /// 2. LiveOut[MBB].second.getNode() == MBB
+ /// The live-out value is defined in MBB.
+ /// 3. forall P in preds(MBB): LiveOut[P] == LiveOut[MBB]
+ /// The live-out value passes through MBB. All predecessors must carry
+ /// the same value.
+ ///
+ /// The domtree node may be null; it can be recomputed when needed.
+ ///
+ /// The map can be shared by multiple live ranges as long as no two are
+ /// live-out of the same block.
+ LiveOutMap LiveOut;
+
+ /// LiveInBlock - Information about a basic block where a live range is known
+ /// to be live-in, but the value has not yet been determined.
+ struct LiveInBlock {
+ // LI - The live range that is live-in to this block. The algorithms can
+ // handle multiple non-overlapping live ranges simultaneously.
+ LiveInterval *LI;
+
+ // DomNode - Dominator tree node for the block.
+ // Cleared when the final value has been determined and LI has been updated.
+ MachineDomTreeNode *DomNode;
+
+ // Position in block where the live-in range ends, or SlotIndex() if the
+ // range passes through the block. When the final value has been
+ // determined, the range from the block start to Kill will be added to LI.
+ SlotIndex Kill;
+
+ // Live-in value filled in by updateSSA once it is known.
+ VNInfo *Value;
+
+ LiveInBlock(LiveInterval *li, MachineDomTreeNode *node, SlotIndex kill)
+ : LI(li), DomNode(node), Kill(kill), Value(0) {}
+ };
+
+ /// LiveIn - Work list of blocks where the live-in value has yet to be
+ /// determined. This list is typically computed by findReachingDefs() and
+ /// used as a work list by updateSSA(). The low-level interface may also be
+ /// used to add entries directly.
+ SmallVector<LiveInBlock, 16> LiveIn;
+
+ /// findReachingDefs - Assuming that LI is live-in to KillMBB and killed at
+ /// Kill, search for values that can reach KillMBB. All blocks that need LI
+ /// to be live-in are added to LiveIn. If a unique reaching def is found,
+ /// its value is returned; if Kill is jointly dominated by multiple values,
+ /// NULL is returned.
+ VNInfo *findReachingDefs(LiveInterval *LI,
+ MachineBasicBlock *KillMBB,
+ SlotIndex Kill,
+ SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree);
+
+ /// updateSSA - Compute the values that will be live in to all requested
+ /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form.
+ ///
+ /// Every live-in block must be jointly dominated by the added live-out
+ /// blocks. No values are read from the live ranges.
+ void updateSSA(SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree,
+ VNInfo::Allocator *Alloc);
+
+ /// updateLiveIns - Add liveness as specified in the LiveIn vector, using VNI
+ /// as a wildcard value for LiveIn entries without a value.
+ void updateLiveIns(VNInfo *VNI, SlotIndexes*);
+
+public:
+ //===--------------------------------------------------------------------===//
+ // High-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Calculate live ranges from scratch.
+ //
+
+ /// reset - Prepare caches for a new set of non-overlapping live ranges. The
+ /// caches must be reset before attempting calculations with a live range
+ /// that may overlap a previously computed live range, and before the first
+ /// live range in a function. If live ranges are not known to be
+ /// non-overlapping, call reset before each.
+ void reset(const MachineFunction *MF);
+
+ /// calculate - Calculate the live range of a virtual register from its defs
+ /// and uses. LI must be empty with no values.
+ void calculate(LiveInterval *LI,
+ MachineRegisterInfo *MRI,
+ SlotIndexes *Indexes,
+ VNInfo::Allocator *Alloc);
+
+ //===--------------------------------------------------------------------===//
+ // Mid-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Modify existing live ranges.
+ //
+
+ /// extend - Extend the live range of LI to reach Kill.
+ ///
+ /// The existing values in LI must be live so they jointly dominate Kill. If
+ /// Kill is not dominated by a single existing value, PHI-defs are inserted
+ /// as required to preserve SSA form. If Kill is known to be dominated by a
+ /// single existing value, Alloc may be null.
+ void extend(LiveInterval *LI,
+ SlotIndex Kill,
+ SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree,
+ VNInfo::Allocator *Alloc);
+
+ /// extendToUses - Extend the live range of LI to reach all uses.
+ ///
+ /// All uses must be jointly dominated by existing liveness. PHI-defs are
+ /// inserted as needed to preserve SSA form.
+ void extendToUses(LiveInterval *LI,
+ MachineRegisterInfo *MRI,
+ SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree,
+ VNInfo::Allocator *Alloc);
+
+ //===--------------------------------------------------------------------===//
+ // Low-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // These functions can be used to compute live ranges where the live-in and
+ // live-out blocks are already known, but the SSA value in each block is
+ // unknown.
+ //
+ // After calling reset(), add known live-out values and known live-in blocks.
+ // Then call calculateValues() to compute the actual value that is
+ // live-in to each block, and add liveness to the live ranges.
+ //
+
+ /// setLiveOutValue - Indicate that VNI is live out from MBB. The
+ /// calculateValues() function will not add liveness for MBB; the caller
+ /// should take care of that.
+ ///
+ /// VNI may be null only if MBB is a live-through block also passed to
+ /// addLiveInBlock().
+ void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) {
+ Seen.set(MBB->getNumber());
+ LiveOut[MBB] = LiveOutPair(VNI, (MachineDomTreeNode *)0);
+ }
+
+ /// addLiveInBlock - Add a block with an unknown live-in value. This
+ /// function can only be called once per basic block. Once the live-in value
+ /// has been determined, calculateValues() will add liveness to LI.
+ ///
+ /// @param LI The live range that is live-in to the block.
+ /// @param DomNode The domtree node for the block.
+ /// @param Kill Index in block where LI is killed. If the value is
+ /// live-through, set Kill = SlotIndex() and also call
+ /// setLiveOutValue(MBB, 0).
+ void addLiveInBlock(LiveInterval *LI,
+ MachineDomTreeNode *DomNode,
+ SlotIndex Kill = SlotIndex()) {
+ LiveIn.push_back(LiveInBlock(LI, DomNode, Kill));
+ }
+
+ /// calculateValues - Calculate the value that will be live-in to each block
+ /// added with addLiveInBlock. Add PHI-def values as needed to preserve SSA
+ /// form. Add liveness to all live-in blocks up to the Kill point, or the
+ /// whole block for live-through blocks.
+ ///
+ /// Every predecessor of a live-in block must have been given a value with
+ /// setLiveOutValue; the value may be null for live-through blocks.
+ void calculateValues(SlotIndexes *Indexes,
+ MachineDominatorTree *DomTree,
+ VNInfo::Allocator *Alloc);
+};
+
+} // end namespace llvm
+
+#endif
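
To make the three-step low-level protocol described above concrete, here is a
hedged sketch of a client in the SplitEditor::transferValues mold. Only the
setLiveOutValue/addLiveInBlock/calculateValues calls come from this header;
the parameter types and the origin of the block/value lists are assumptions.

  // Compute liveness when the live-out values and live-in blocks are known.
  // reset(MF) is assumed to have been called earlier.
  void computeKnownLiveness(LiveRangeCalc &LRCalc, LiveInterval *LI,
                            ArrayRef<std::pair<MachineBasicBlock*, VNInfo*> > Outs,
                            ArrayRef<MachineDomTreeNode*> Ins,
                            SlotIndexes *Indexes,
                            MachineDominatorTree *DomTree,
                            VNInfo::Allocator *Alloc) {
    for (unsigned i = 0, e = Outs.size(); i != e; ++i)
      LRCalc.setLiveOutValue(Outs[i].first, Outs[i].second);
    for (unsigned i = 0, e = Ins.size(); i != e; ++i)
      LRCalc.addLiveInBlock(LI, Ins[i]);   // default Kill: live-through
    // Fill in the live-in values, inserting PHI-defs where needed.
    LRCalc.calculateValues(Indexes, DomTree, Alloc);
  }
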
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index b385fb3..b23f851 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -319,9 +319,12 @@ void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
LiveIntervals &LIS,
const MachineLoopInfo &Loops) {
VirtRegAuxInfo VRAI(MF, LIS, Loops);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
for (iterator I = begin(), E = end(); I != E; ++I) {
LiveInterval &LI = **I;
- VRAI.CalculateRegClass(LI.reg);
+ if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
+ DEBUG(dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
+ << MRI.getRegClass(LI.reg)->getName() << '\n');
VRAI.CalculateWeightAndHint(LI);
}
}
diff --git a/lib/CodeGen/LiveRangeEdit.h b/lib/CodeGen/LiveRangeEdit.h
index db6740c..9b0a671 100644
--- a/lib/CodeGen/LiveRangeEdit.h
+++ b/lib/CodeGen/LiveRangeEdit.h
@@ -115,7 +115,7 @@ public:
LiveInterval *get(unsigned idx) const { return newRegs_[idx+firstNew_]; }
ArrayRef<LiveInterval*> regs() const {
- return ArrayRef<LiveInterval*>(newRegs_).slice(firstNew_);
+ return makeArrayRef(newRegs_).slice(firstNew_);
}
/// FIXME: Temporary accessors until we can get rid of
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp
index c75196a..939e795 100644
--- a/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/lib/CodeGen/LiveStackAnalysis.cpp
@@ -44,7 +44,8 @@ void LiveStacks::releaseMemory() {
S2RCMap.clear();
}
-bool LiveStacks::runOnMachineFunction(MachineFunction &) {
+bool LiveStacks::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getTarget().getRegisterInfo();
// FIXME: No analysis is being done right now. We are relying on the
// register allocators to provide the information.
return false;
@@ -61,7 +62,7 @@ LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) {
} else {
// Use the largest common subclass register class.
const TargetRegisterClass *OldRC = S2RCMap[Slot];
- S2RCMap[Slot] = getCommonSubClass(OldRC, RC);
+ S2RCMap[Slot] = TRI->getCommonSubClass(OldRC, RC);
}
return I->second;
}
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 20bad60..2ca90f9 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -662,7 +662,7 @@ void LiveVariables::removeVirtualRegistersKilled(MachineInstr *MI) {
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
bool removed = getVarInfo(Reg).removeKill(MI);
assert(removed && "kill not in register's VarInfo?");
- removed = true;
+ (void)removed;
}
}
}
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 8f0fb46..4c5fe4c 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -571,6 +571,11 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
if (i->getOperand(ni+1).getMBB() == this)
i->getOperand(ni+1).setMBB(NMBB);
+ // Inherit live-ins from the successor
+ for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(),
+ E = Succ->livein_end(); I != E; ++I)
+ NMBB->addLiveIn(*I);
+
// Update LiveVariables.
if (LV) {
// Restore kills of virtual registers that were killed by the terminators.
diff --git a/lib/CodeGen/MachineBlockFrequency.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 893a320..b92cda9 100644
--- a/lib/CodeGen/MachineBlockFrequency.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -1,4 +1,4 @@
-//====----- MachineBlockFrequency.cpp - Machine Block Frequency Analysis ----====//
+//====----- MachineBlockFrequencyInfo.cpp - Machine Block Frequency Analysis ----====//
//
// The LLVM Compiler Infrastructure
//
@@ -13,47 +13,49 @@
#include "llvm/InitializePasses.h"
#include "llvm/Analysis/BlockFrequencyImpl.h"
-#include "llvm/CodeGen/MachineBlockFrequency.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
using namespace llvm;
-INITIALIZE_PASS_BEGIN(MachineBlockFrequency, "machine-block-freq",
+INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
"Machine Block Frequency Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
-INITIALIZE_PASS_END(MachineBlockFrequency, "machine-block-freq",
+INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
"Machine Block Frequency Analysis", true, true)
-char MachineBlockFrequency::ID = 0;
+char MachineBlockFrequencyInfo::ID = 0;
-MachineBlockFrequency::MachineBlockFrequency() : MachineFunctionPass(ID) {
- initializeMachineBlockFrequencyPass(*PassRegistry::getPassRegistry());
+MachineBlockFrequencyInfo::MachineBlockFrequencyInfo() : MachineFunctionPass(ID) {
+ initializeMachineBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
MBFI = new BlockFrequencyImpl<MachineBasicBlock, MachineFunction,
MachineBranchProbabilityInfo>();
}
-MachineBlockFrequency::~MachineBlockFrequency() {
+MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() {
delete MBFI;
}
-void MachineBlockFrequency::getAnalysisUsage(AnalysisUsage &AU) const {
+void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<MachineBranchProbabilityInfo>();
AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
-bool MachineBlockFrequency::runOnMachineFunction(MachineFunction &F) {
+bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
MachineBranchProbabilityInfo &MBPI = getAnalysis<MachineBranchProbabilityInfo>();
MBFI->doFunction(&F, &MBPI);
return false;
}
-/// getblockFreq - Return block frequency. Never return 0, value must be
-/// positive. Please note that initial frequency is equal to 1024. It means that
-/// we should not rely on the value itself, but only on the comparison to the
-/// other block frequencies. We do this to avoid using of floating points.
+/// getBlockFreq - Return the block frequency, or 0 if we don't have the
+/// information. Note that the initial frequency is 1024: the value is only
+/// meaningful relative to the other block frequencies, never by itself.
+/// We use this fixed-point scale to avoid floating point arithmetic.
///
-uint32_t MachineBlockFrequency::getBlockFreq(MachineBasicBlock *MBB) {
+BlockFrequency MachineBlockFrequencyInfo::
+getBlockFreq(MachineBasicBlock *MBB) const {
return MBFI->getBlockFreq(MBB);
}
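
Since the frequencies are scaled against an entry frequency of 1024 rather
than being absolute counts, consumers should only ever compare them. A hedged
sketch of such a consumer; the helper itself is illustrative, and MBFI is
assumed to be the analysis result fetched by the enclosing pass:

  // True when MBB is expected to execute less often than the entry block.
  bool isColderThanEntry(MachineBlockFrequencyInfo &MBFI, MachineFunction &MF,
                         MachineBasicBlock *MBB) {
    MachineBasicBlock *Entry = MF.begin();
    return MBFI.getBlockFreq(MBB).getFrequency() <
           MBFI.getBlockFreq(Entry).getFrequency();
  }
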
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index 3a60a37..7eda8c1 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -430,13 +430,24 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
unsigned NewReg = CSMI->getOperand(i).getReg();
if (OldReg == NewReg)
continue;
+
assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
TargetRegisterInfo::isVirtualRegister(NewReg) &&
"Do not CSE physical register defs!");
+
if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
DoCSE = false;
break;
}
+
+ // Don't perform CSE if the result of the old instruction cannot exist
+ // within the register class of the new instruction.
+ const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg);
+ if (!MRI->constrainRegClass(NewReg, OldRC)) {
+ DoCSE = false;
+ break;
+ }
+
CSEPairs.push_back(std::make_pair(OldReg, NewReg));
--NumDefs;
}
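
A note on the constrainRegClass() guard just added (hedged; the semantics are
summarized from MachineRegisterInfo rather than shown in this hunk):
constrainRegClass narrows NewReg's register class to a common subclass of its
current class and OldRC, and returns null, leaving the class unchanged, when
no common subclass exists. That is exactly the substitution MachineCSE must
reject, since every use of OldReg is about to be rewritten to NewReg.

  // Can uses of OldReg (class OldRC) legally be rewritten to use NewReg?
  bool canSubstitute(MachineRegisterInfo *MRI, unsigned NewReg,
                     const TargetRegisterClass *OldRC) {
    return MRI->constrainRegClass(NewReg, OldRC) != 0;
  }
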
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index cd25156..20066a0 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -619,7 +619,7 @@ void MachineJumpTableInfo::dump() const { print(dbgs()); }
// MachineConstantPool implementation
//===----------------------------------------------------------------------===//
-const Type *MachineConstantPoolEntry::getType() const {
+Type *MachineConstantPoolEntry::getType() const {
if (isMachineConstantPoolEntry())
return Val.MachineCPVal->getType();
return Val.ConstVal->getType();
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 143a29b..a240667 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -51,7 +51,7 @@ using namespace llvm;
/// explicitly nulled out.
void MachineOperand::AddRegOperandToRegInfo(MachineRegisterInfo *RegInfo) {
assert(isReg() && "Can only add reg operand to use lists");
-
+
// If the reginfo pointer is null, just explicitly null out or next/prev
// pointers, to ensure they are not garbage.
if (RegInfo == 0) {
@@ -59,23 +59,23 @@ void MachineOperand::AddRegOperandToRegInfo(MachineRegisterInfo *RegInfo) {
Contents.Reg.Next = 0;
return;
}
-
+
// Otherwise, add this operand to the head of the registers use/def list.
MachineOperand **Head = &RegInfo->getRegUseDefListHead(getReg());
-
+
// For SSA values, we prefer to keep the definition at the start of the list.
// we do this by skipping over the definition if it is at the head of the
// list.
if (*Head && (*Head)->isDef())
Head = &(*Head)->Contents.Reg.Next;
-
+
Contents.Reg.Next = *Head;
if (Contents.Reg.Next) {
assert(getReg() == Contents.Reg.Next->getReg() &&
"Different regs on the same list!");
Contents.Reg.Next->Contents.Reg.Prev = &Contents.Reg.Next;
}
-
+
Contents.Reg.Prev = Head;
*Head = this;
}
@@ -86,7 +86,7 @@ void MachineOperand::RemoveRegOperandFromRegInfo() {
assert(isOnRegUseList() && "Reg operand is not on a use list");
// Unlink this from the doubly linked list of operands.
MachineOperand *NextOp = Contents.Reg.Next;
- *Contents.Reg.Prev = NextOp;
+ *Contents.Reg.Prev = NextOp;
if (NextOp) {
assert(NextOp->getReg() == getReg() && "Corrupt reg use/def chain!");
NextOp->Contents.Reg.Prev = Contents.Reg.Prev;
@@ -97,7 +97,7 @@ void MachineOperand::RemoveRegOperandFromRegInfo() {
void MachineOperand::setReg(unsigned Reg) {
if (getReg() == Reg) return; // No change.
-
+
// Otherwise, we have to change the register. If this operand is embedded
// into a machine function, we need to update the old and new register's
// use/def lists.
@@ -109,7 +109,7 @@ void MachineOperand::setReg(unsigned Reg) {
AddRegOperandToRegInfo(&MF->getRegInfo());
return;
}
-
+
// Otherwise, just change the register, no problem. :)
SmallContents.RegNo = Reg;
}
@@ -144,7 +144,7 @@ void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
if (isReg() && getParent() && getParent()->getParent() &&
getParent()->getParent()->getParent())
RemoveRegOperandFromRegInfo();
-
+
OpKind = MO_Immediate;
Contents.ImmVal = ImmVal;
}
@@ -155,7 +155,7 @@ void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
bool isKill, bool isDead, bool isUndef,
bool isDebug) {
- // If this operand is already a register operand, use setReg to update the
+ // If this operand is already a register operand, use setReg to update the
// register's use/def lists.
if (isReg()) {
assert(!isEarlyClobber());
@@ -189,7 +189,7 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
if (getType() != Other.getType() ||
getTargetFlags() != Other.getTargetFlags())
return false;
-
+
switch (getType()) {
default: llvm_unreachable("Unrecognized operand type");
case MachineOperand::MO_Register:
@@ -322,7 +322,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
default:
llvm_unreachable("Unrecognized operand type");
}
-
+
if (unsigned TF = getTargetFlags())
OS << "[TF=" << TF << ']';
}
@@ -408,7 +408,7 @@ uint64_t MachineMemOperand::getAlignment() const {
raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
assert((MMO.isLoad() || MMO.isStore()) &&
"SV has to be a load, store or both.");
-
+
if (MMO.isVolatile())
OS << "Volatile ";
@@ -417,7 +417,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
if (MMO.isStore())
OS << "ST";
OS << MMO.getSize();
-
+
// Print the address information.
OS << "[";
if (!MMO.getValue())
@@ -464,7 +464,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineMemOperand &MMO) {
/// MachineInstr ctor - This constructor creates a dummy MachineInstr with
/// MCID NULL and no operands.
MachineInstr::MachineInstr()
- : MCID(0), NumImplicitOps(0), Flags(0), AsmPrinterFlags(0),
+ : MCID(0), Flags(0), AsmPrinterFlags(0),
MemRefs(0), MemRefsEnd(0),
Parent(0) {
// Make sure that we get added to a machine basicblock
@@ -484,8 +484,9 @@ void MachineInstr::addImplicitDefUseOperands() {
/// implicit operands. It reserves space for the number of operands specified by
/// the MCInstrDesc.
MachineInstr::MachineInstr(const MCInstrDesc &tid, bool NoImp)
- : MCID(&tid), NumImplicitOps(0), Flags(0), AsmPrinterFlags(0),
+ : MCID(&tid), Flags(0), AsmPrinterFlags(0),
MemRefs(0), MemRefsEnd(0), Parent(0) {
+ unsigned NumImplicitOps = 0;
if (!NoImp)
NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
Operands.reserve(NumImplicitOps + MCID->getNumOperands());
@@ -498,8 +499,9 @@ MachineInstr::MachineInstr(const MCInstrDesc &tid, bool NoImp)
/// MachineInstr ctor - As above, but with a DebugLoc.
MachineInstr::MachineInstr(const MCInstrDesc &tid, const DebugLoc dl,
bool NoImp)
- : MCID(&tid), NumImplicitOps(0), Flags(0), AsmPrinterFlags(0),
+ : MCID(&tid), Flags(0), AsmPrinterFlags(0),
MemRefs(0), MemRefsEnd(0), Parent(0), debugLoc(dl) {
+ unsigned NumImplicitOps = 0;
if (!NoImp)
NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
Operands.reserve(NumImplicitOps + MCID->getNumOperands());
@@ -510,13 +512,14 @@ MachineInstr::MachineInstr(const MCInstrDesc &tid, const DebugLoc dl,
}
/// MachineInstr ctor - Work exactly the same as the ctor two above, except
-/// that the MachineInstr is created and added to the end of the specified
+/// that the MachineInstr is created and added to the end of the specified
/// basic block.
MachineInstr::MachineInstr(MachineBasicBlock *MBB, const MCInstrDesc &tid)
- : MCID(&tid), NumImplicitOps(0), Flags(0), AsmPrinterFlags(0),
+ : MCID(&tid), Flags(0), AsmPrinterFlags(0),
MemRefs(0), MemRefsEnd(0), Parent(0) {
assert(MBB && "Cannot use inserting ctor with null basic block!");
- NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
+ unsigned NumImplicitOps =
+ MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
Operands.reserve(NumImplicitOps + MCID->getNumOperands());
addImplicitDefUseOperands();
// Make sure that we get added to a machine basicblock
@@ -528,10 +531,11 @@ MachineInstr::MachineInstr(MachineBasicBlock *MBB, const MCInstrDesc &tid)
///
MachineInstr::MachineInstr(MachineBasicBlock *MBB, const DebugLoc dl,
const MCInstrDesc &tid)
- : MCID(&tid), NumImplicitOps(0), Flags(0), AsmPrinterFlags(0),
+ : MCID(&tid), Flags(0), AsmPrinterFlags(0),
MemRefs(0), MemRefsEnd(0), Parent(0), debugLoc(dl) {
assert(MBB && "Cannot use inserting ctor with null basic block!");
- NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
+ unsigned NumImplicitOps =
+ MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
Operands.reserve(NumImplicitOps + MCID->getNumOperands());
addImplicitDefUseOperands();
// Make sure that we get added to a machine basicblock
@@ -542,7 +546,7 @@ MachineInstr::MachineInstr(MachineBasicBlock *MBB, const DebugLoc dl,
/// MachineInstr ctor - Copies MachineInstr arg exactly
///
MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
- : MCID(&MI.getDesc()), NumImplicitOps(0), Flags(0), AsmPrinterFlags(0),
+ : MCID(&MI.getDesc()), Flags(0), AsmPrinterFlags(0),
MemRefs(MI.MemRefs), MemRefsEnd(MI.MemRefsEnd),
Parent(0), debugLoc(MI.getDebugLoc()) {
Operands.reserve(MI.getNumOperands());
@@ -550,7 +554,6 @@ MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
// Add operands
for (unsigned i = 0; i != MI.getNumOperands(); ++i)
addOperand(MI.getOperand(i));
- NumImplicitOps = MI.NumImplicitOps;
// Copy all the flags.
Flags = MI.Flags;
@@ -605,102 +608,74 @@ void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &RegInfo) {
/// addOperand - Add the specified operand to the instruction. If it is an
/// implicit operand, it is added to the end of the operand list. If it is
/// an explicit operand it is added at the end of the explicit operand list
-/// (before the first implicit operand).
+/// (before the first implicit operand).
void MachineInstr::addOperand(const MachineOperand &Op) {
+ assert(MCID && "Cannot add operands before providing an instr descriptor");
bool isImpReg = Op.isReg() && Op.isImplicit();
- assert((isImpReg || !OperandsComplete()) &&
- "Trying to add an operand to a machine instr that is already done!");
-
MachineRegisterInfo *RegInfo = getRegInfo();
- // If we are adding the operand to the end of the list, our job is simpler.
- // This is true most of the time, so this is a reasonable optimization.
- if (isImpReg || NumImplicitOps == 0) {
- // We can only do this optimization if we know that the operand list won't
- // reallocate.
- if (Operands.empty() || Operands.size()+1 <= Operands.capacity()) {
- Operands.push_back(Op);
-
- // Set the parent of the operand.
- Operands.back().ParentMI = this;
-
- // If the operand is a register, update the operand's use list.
- if (Op.isReg()) {
- Operands.back().AddRegOperandToRegInfo(RegInfo);
- // If the register operand is flagged as early, mark the operand as such
- unsigned OpNo = Operands.size() - 1;
- if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
- Operands[OpNo].setIsEarlyClobber(true);
- }
- return;
+ // If the Operands backing store is reallocated, all register operands must
+ // be removed and re-added to RegInfo, which keeps pointers to the operands.
+ bool Reallocate = RegInfo &&
+ !Operands.empty() && Operands.size() == Operands.capacity();
+
+ // Find the insert location for the new operand. Implicit registers go at
+ // the end; everything else goes before the implicit regs.
+ unsigned OpNo = Operands.size();
+
+ // Remove all the implicit operands from RegInfo if they need to be shifted.
+ // FIXME: Allow mixed explicit and implicit operands on inline asm.
+ // InstrEmitter::EmitSpecialNode() is marking inline asm clobbers as
+ // implicit-defs, but they must not be moved around. See the FIXME in
+ // InstrEmitter.cpp.
+ if (!isImpReg && !isInlineAsm()) {
+ while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) {
+ --OpNo;
+ if (RegInfo)
+ Operands[OpNo].RemoveRegOperandFromRegInfo();
}
}
-
- // Otherwise, we have to insert a real operand before any implicit ones.
- unsigned OpNo = Operands.size()-NumImplicitOps;
- // If this instruction isn't embedded into a function, then we don't need to
- // update any operand lists.
- if (RegInfo == 0) {
- // Simple insertion, no reginfo update needed for other register operands.
- Operands.insert(Operands.begin()+OpNo, Op);
- Operands[OpNo].ParentMI = this;
-
- // Do explicitly set the reginfo for this operand though, to ensure the
- // next/prev fields are properly nulled out.
- if (Operands[OpNo].isReg()) {
- Operands[OpNo].AddRegOperandToRegInfo(0);
- // If the register operand is flagged as early, mark the operand as such
- if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
- Operands[OpNo].setIsEarlyClobber(true);
- }
+ // OpNo now points at the desired insertion point. Unless this is a variadic
+ // instruction, only implicit regs are allowed beyond MCID->getNumOperands().
+ assert((isImpReg || MCID->isVariadic() || OpNo < MCID->getNumOperands()) &&
+ "Trying to add an operand to a machine instr that is already done!");
- } else if (Operands.size()+1 <= Operands.capacity()) {
- // Otherwise, we have to remove register operands from their register use
- // list, add the operand, then add the register operands back to their use
- // list. This also must handle the case when the operand list reallocates
- // to somewhere else.
-
- // If insertion of this operand won't cause reallocation of the operand
- // list, just remove the implicit operands, add the operand, then re-add all
- // the rest of the operands.
- for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) {
- assert(Operands[i].isReg() && "Should only be an implicit reg!");
- Operands[i].RemoveRegOperandFromRegInfo();
- }
-
- // Add the operand. If it is a register, add it to the reg list.
- Operands.insert(Operands.begin()+OpNo, Op);
- Operands[OpNo].ParentMI = this;
-
- if (Operands[OpNo].isReg()) {
- Operands[OpNo].AddRegOperandToRegInfo(RegInfo);
- // If the register operand is flagged as early, mark the operand as such
- if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
- Operands[OpNo].setIsEarlyClobber(true);
- }
-
- // Re-add all the implicit ops.
- for (unsigned i = OpNo+1, e = Operands.size(); i != e; ++i) {
+ // All operands from OpNo have been removed from RegInfo. If the Operands
+ // backing store needs to be reallocated, we also need to remove any other
+ // register operands.
+ if (Reallocate)
+ for (unsigned i = 0; i != OpNo; ++i)
+ if (Operands[i].isReg())
+ Operands[i].RemoveRegOperandFromRegInfo();
+
+ // Insert the new operand at OpNo.
+ Operands.insert(Operands.begin() + OpNo, Op);
+ Operands[OpNo].ParentMI = this;
+
+ // If the Operands backing store was reallocated above, re-add the register
+ // operands that come before OpNo.
+ if (Reallocate)
+ for (unsigned i = 0; i != OpNo; ++i)
+ if (Operands[i].isReg())
+ Operands[i].AddRegOperandToRegInfo(RegInfo);
+
+ // When adding a register operand, tell RegInfo about it.
+ if (Operands[OpNo].isReg()) {
+ // Add the new operand to RegInfo, even when RegInfo is NULL.
+ // This will initialize the linked list pointers.
+ Operands[OpNo].AddRegOperandToRegInfo(RegInfo);
+ // If the register operand is flagged as early, mark the operand as such.
+ if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
+ Operands[OpNo].setIsEarlyClobber(true);
+ }
+
+ // Re-add all the implicit ops.
+ if (RegInfo) {
+ for (unsigned i = OpNo + 1, e = Operands.size(); i != e; ++i) {
assert(Operands[i].isReg() && "Should only be an implicit reg!");
Operands[i].AddRegOperandToRegInfo(RegInfo);
}
- } else {
- // Otherwise, we will be reallocating the operand list. Remove all reg
- // operands from their list, then readd them after the operand list is
- // reallocated.
- RemoveRegOperandsFromUseLists();
-
- Operands.insert(Operands.begin()+OpNo, Op);
- Operands[OpNo].ParentMI = this;
-
- // Re-add all the operands.
- AddRegOperandsToUseLists(*RegInfo);
-
- // If the register operand is flagged as early, mark the operand as such
- if (Operands[OpNo].isReg()
- && MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
- Operands[OpNo].setIsEarlyClobber(true);
}
}
@@ -709,13 +684,13 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
///
void MachineInstr::RemoveOperand(unsigned OpNo) {
assert(OpNo < Operands.size() && "Invalid operand number");
-
+
// Special case removing the last one.
if (OpNo == Operands.size()-1) {
// If needed, remove from the reg def/use list.
if (Operands.back().isReg() && Operands.back().isOnRegUseList())
Operands.back().RemoveRegOperandFromRegInfo();
-
+
Operands.pop_back();
return;
}
@@ -730,7 +705,7 @@ void MachineInstr::RemoveOperand(unsigned OpNo) {
Operands[i].RemoveRegOperandFromRegInfo();
}
}
-
+
Operands.erase(Operands.begin()+OpNo);
if (RegInfo) {
@@ -827,15 +802,6 @@ void MachineInstr::eraseFromParent() {
}
-/// OperandComplete - Return true if it's illegal to add a new operand
-///
-bool MachineInstr::OperandsComplete() const {
- unsigned short NumOperands = MCID->getNumOperands();
- if (!MCID->isVariadic() && getNumOperands()-NumImplicitOps >= NumOperands)
- return true; // Broken: we have all the operands of this instruction!
- return false;
-}
-
/// getNumExplicitOperands - Returns the number of non-implicit operands.
///
unsigned MachineInstr::getNumExplicitOperands() const {
@@ -860,6 +826,67 @@ bool MachineInstr::isStackAligningInlineAsm() const {
return false;
}
+int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
+ unsigned *GroupNo) const {
+ assert(isInlineAsm() && "Expected an inline asm instruction");
+ assert(OpIdx < getNumOperands() && "OpIdx out of range");
+
+ // Ignore queries about the initial operands.
+ if (OpIdx < InlineAsm::MIOp_FirstOperand)
+ return -1;
+
+ unsigned Group = 0;
+ unsigned NumOps;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+ i += NumOps) {
+ const MachineOperand &FlagMO = getOperand(i);
+ // If we reach the implicit register operands, stop looking.
+ if (!FlagMO.isImm())
+ return -1;
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+ if (i + NumOps > OpIdx) {
+ if (GroupNo)
+ *GroupNo = Group;
+ return i;
+ }
+ ++Group;
+ }
+ return -1;
+}
+
+const TargetRegisterClass*
+MachineInstr::getRegClassConstraint(unsigned OpIdx,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) const {
+ // Most opcodes have fixed constraints in their MCInstrDesc.
+ if (!isInlineAsm())
+ return TII->getRegClass(getDesc(), OpIdx, TRI);
+
+ if (!getOperand(OpIdx).isReg())
+ return NULL;
+
+ // For tied uses on inline asm, get the constraint from the def.
+ unsigned DefIdx;
+ if (getOperand(OpIdx).isUse() && isRegTiedToDefOperand(OpIdx, &DefIdx))
+ OpIdx = DefIdx;
+
+ // Inline asm stores register class constraints in the flag word.
+ int FlagIdx = findInlineAsmFlagIdx(OpIdx);
+ if (FlagIdx < 0)
+ return NULL;
+
+ unsigned Flag = getOperand(FlagIdx).getImm();
+ unsigned RCID;
+ if (InlineAsm::hasRegClassConstraint(Flag, RCID))
+ return TRI->getRegClass(RCID);
+
+ // Assume that all registers in a memory operand are pointers.
+ if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem)
+ return TRI->getPointerRegClass();
+
+ return NULL;
+}
+
/// findRegisterUseOperandIdx() - Returns the MachineOperand that is a use of
/// the specific register or -1 if it is not found. It further tightens
/// the search criteria to a use that kills the register if isKill is true.
@@ -901,7 +928,8 @@ MachineInstr::readsWritesVirtualRegister(unsigned Reg,
Ops->push_back(i);
if (MO.isUse())
Use |= !MO.isUndef();
- else if (MO.getSubReg())
+ else if (MO.getSubReg() && !MO.isUndef())
+ // A partial <def,undef> doesn't count as reading the register.
PartDef = true;
else
FullDef = true;
@@ -941,6 +969,10 @@ MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap,
/// operand list that is used to represent the predicate. It returns -1 if
/// none is found.
int MachineInstr::findFirstPredOperandIdx() const {
+ // Don't call MCID.findFirstPredOperandIdx() because this variant
+ // is sometimes called on an instruction that's not yet complete, and
+ // so the number of operands is less than the MCID indicates. In
+ // particular, the PTX target does this.
const MCInstrDesc &MCID = getDesc();
if (MCID.isPredicable()) {
for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
@@ -950,7 +982,7 @@ int MachineInstr::findFirstPredOperandIdx() const {
return -1;
}
-
+
/// isRegTiedToUseOperand - Given the index of a register def operand,
/// check if the register def is tied to a source operand, due to either
/// two-address elimination or inline assembly constraints. Returns the
@@ -964,23 +996,13 @@ isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const {
return false;
// Determine the actual operand index that corresponds to this index.
unsigned DefNo = 0;
- unsigned DefPart = 0;
- for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands();
- i < e; ) {
- const MachineOperand &FMO = getOperand(i);
- // After the normal asm operands there may be additional imp-def regs.
- if (!FMO.isImm())
- return false;
- // Skip over this def.
- unsigned NumOps = InlineAsm::getNumOperandRegisters(FMO.getImm());
- unsigned PrevDef = i + 1;
- i = PrevDef + NumOps;
- if (i > DefOpIdx) {
- DefPart = DefOpIdx - PrevDef;
- break;
- }
- ++DefNo;
- }
+ int FlagIdx = findInlineAsmFlagIdx(DefOpIdx, &DefNo);
+ if (FlagIdx < 0)
+ return false;
+
+ // Which part of the group is DefOpIdx?
+ unsigned DefPart = DefOpIdx - (FlagIdx + 1);
+
for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands();
i != e; ++i) {
const MachineOperand &FMO = getOperand(i);
@@ -1024,20 +1046,10 @@ isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const {
return false;
// Find the flag operand corresponding to UseOpIdx
- unsigned FlagIdx, NumOps=0;
- for (FlagIdx = InlineAsm::MIOp_FirstOperand;
- FlagIdx < UseOpIdx; FlagIdx += NumOps+1) {
- const MachineOperand &UFMO = getOperand(FlagIdx);
- // After the normal asm operands there may be additional imp-def regs.
- if (!UFMO.isImm())
- return false;
- NumOps = InlineAsm::getNumOperandRegisters(UFMO.getImm());
- assert(NumOps < getNumOperands() && "Invalid inline asm flag");
- if (UseOpIdx < FlagIdx+NumOps+1)
- break;
- }
- if (FlagIdx >= UseOpIdx)
+ int FlagIdx = findInlineAsmFlagIdx(UseOpIdx);
+ if (FlagIdx < 0)
return false;
+
const MachineOperand &UFMO = getOperand(FlagIdx);
unsigned DefNo;
if (InlineAsm::isUseOperandTiedToDef(UFMO.getImm(), DefNo)) {
@@ -1211,7 +1223,7 @@ bool MachineInstr::hasVolatileMemoryRef() const {
// conservatively assume it wasn't preserved.
if (memoperands_empty())
return true;
-
+
// Check the memory reference information for volatile references.
for (mmo_iterator I = memoperands_begin(), E = memoperands_end(); I != E; ++I)
if ((*I)->isVolatile())
@@ -1318,7 +1330,7 @@ void MachineInstr::dump() const {
dbgs() << " " << *this;
}
-static void printDebugLoc(DebugLoc DL, const MachineFunction *MF,
+static void printDebugLoc(DebugLoc DL, const MachineFunction *MF,
raw_ostream &CommentOS) {
const LLVMContext &Ctx = MF->getFunction()->getContext();
if (!DL.isUnknown()) { // Print source line info.
@@ -1380,7 +1392,7 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
unsigned AsmDescOp = ~0u;
unsigned AsmOpCount = 0;
- if (isInlineAsm()) {
+ if (isInlineAsm() && e >= InlineAsm::MIOp_FirstOperand) {
// Print asm string.
OS << " ";
getOperand(InlineAsm::MIOp_AsmString).print(OS, TM);
@@ -1451,18 +1463,28 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
OS << '$' << AsmOpCount++;
unsigned Flag = MO.getImm();
switch (InlineAsm::getKind(Flag)) {
- case InlineAsm::Kind_RegUse: OS << ":[reguse]"; break;
- case InlineAsm::Kind_RegDef: OS << ":[regdef]"; break;
- case InlineAsm::Kind_RegDefEarlyClobber: OS << ":[regdef-ec]"; break;
- case InlineAsm::Kind_Clobber: OS << ":[clobber]"; break;
- case InlineAsm::Kind_Imm: OS << ":[imm]"; break;
- case InlineAsm::Kind_Mem: OS << ":[mem]"; break;
- default: OS << ":[??" << InlineAsm::getKind(Flag) << ']'; break;
+ case InlineAsm::Kind_RegUse: OS << ":[reguse"; break;
+ case InlineAsm::Kind_RegDef: OS << ":[regdef"; break;
+ case InlineAsm::Kind_RegDefEarlyClobber: OS << ":[regdef-ec"; break;
+ case InlineAsm::Kind_Clobber: OS << ":[clobber"; break;
+ case InlineAsm::Kind_Imm: OS << ":[imm"; break;
+ case InlineAsm::Kind_Mem: OS << ":[mem"; break;
+ default: OS << ":[??" << InlineAsm::getKind(Flag); break;
+ }
+
+ unsigned RCID = 0;
+ if (InlineAsm::hasRegClassConstraint(Flag, RCID)) {
+ if (TM)
+ OS << ':' << TM->getRegisterInfo()->getRegClass(RCID)->getName();
+ else
+ OS << ":RC" << RCID;
}
unsigned TiedTo = 0;
if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
- OS << " [tiedto:$" << TiedTo << ']';
+ OS << " tiedto:$" << TiedTo;
+
+ OS << ']';
// Compute the index of the next operand descriptor.
AsmDescOp += 1 + InlineAsm::getNumOperandRegisters(Flag);
@@ -1516,7 +1538,19 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
}
// Print debug location information.
- if (!debugLoc.isUnknown() && MF) {
+ if (isDebugValue() && getOperand(e - 1).isMetadata()) {
+ if (!HaveSemi) OS << ";"; HaveSemi = true;
+ DIVariable DV(getOperand(e - 1).getMetadata());
+ OS << " line no:" << DV.getLineNumber();
+ if (MDNode *InlinedAt = DV.getInlinedAt()) {
+ DebugLoc InlinedAtDL = DebugLoc::getFromDILocation(InlinedAt);
+ if (!InlinedAtDL.isUnknown()) {
+ OS << " inlined @[ ";
+ printDebugLoc(InlinedAtDL, MF, OS);
+ OS << " ]";
+ }
+ }
+ } else if (!debugLoc.isUnknown() && MF) {
if (!HaveSemi) OS << ";"; HaveSemi = true;
OS << " dbg:";
printDebugLoc(debugLoc, MF, OS);
@@ -1627,7 +1661,7 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
// new implicit operand if required.
if (Found || !AddIfNotFound)
return Found;
-
+
addOperand(MachineOperand::CreateReg(IncomingReg,
true /*IsDef*/,
true /*IsImp*/,
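
The findInlineAsmFlagIdx()/getRegClassConstraint() pair added above share one
convention worth spelling out: an INLINEASM MachineInstr carries an immediate
flag word in front of each group of operands, with the group size encoded in
the flag. A hedged standalone walk, mirroring what the new helper does (MI is
assumed to be an inline asm instruction provided by the caller):

  // Visit each inline asm operand group, as findInlineAsmFlagIdx() does.
  void walkAsmGroups(const MachineInstr &MI) {
    for (unsigned i = InlineAsm::MIOp_FirstOperand, e = MI.getNumOperands();
         i < e; ) {
      const MachineOperand &FlagMO = MI.getOperand(i);
      if (!FlagMO.isImm())
        break;  // trailing implicit register operands; no more groups
      unsigned Flag = FlagMO.getImm();
      // Operands i+1 .. i+NumOps form this group. The flag also encodes the
      // kind (reguse, regdef, mem, ...) and, after this patch, an optional
      // register class ID readable via InlineAsm::hasRegClassConstraint().
      unsigned NumOps = InlineAsm::getNumOperandRegisters(Flag);
      i += 1 + NumOps;
    }
  }
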
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 722ceb2..a1f80d5 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -37,10 +37,16 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+static cl::opt<bool>
+AvoidSpeculation("avoid-speculation",
+ cl::desc("MachineLICM should avoid speculation"),
+ cl::init(false), cl::Hidden);
+
STATISTIC(NumHoisted,
"Number of machine instructions hoisted out of loops");
STATISTIC(NumLowRP,
@@ -91,6 +97,17 @@ namespace {
// For each opcode, keep a list of potential CSE instructions.
DenseMap<unsigned, std::vector<const MachineInstr*> > CSEMap;
+ enum {
+ SpeculateFalse = 0,
+ SpeculateTrue = 1,
+ SpeculateUnknown = 2
+ };
+
+ // If an MBB does not dominate all of the loop's exiting blocks, then it
+ // may not be safe to hoist loads from it.
+ // Tri-state: 0 - false, 1 - true, 2 - unknown
+ unsigned SpeculationState;
+
public:
static char ID; // Pass identification, replacement for typeid
MachineLICM() :
@@ -194,6 +211,10 @@ namespace {
/// hoist the given loop invariant.
bool IsProfitableToHoist(MachineInstr &MI);
+ /// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute.
+ /// If not, then a load from this mbb may not be safe to hoist.
+ bool IsGuaranteedToExecute(MachineBasicBlock *BB);
+
/// HoistRegion - Walk the specified region of the CFG (defined by all
/// blocks dominated by the specified block, and that are in the current
/// loop) in depth first order w.r.t the DominatorTree. This allows us to
@@ -202,6 +223,13 @@ namespace {
///
void HoistRegion(MachineDomTreeNode *N, bool IsHeader = false);
+ /// getRegisterClassIDAndCost - For a given MI, register, and the operand
+ /// index, return the ID and cost of its representative register class by
+ /// reference.
+ void getRegisterClassIDAndCost(const MachineInstr *MI,
+ unsigned Reg, unsigned OpIdx,
+ unsigned &RCId, unsigned &RCCost) const;
+
/// InitRegPressure - Find all virtual register references that are liveout
/// of the preheader to initialize the starting "register pressure". Note
/// this does not count live through (livein but not used) registers.
@@ -229,6 +257,10 @@ namespace {
bool EliminateCSE(MachineInstr *MI,
DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI);
+ /// MayCSE - Return true if the given instruction will be CSE'd if it's
+ /// hoisted out of the loop.
+ bool MayCSE(MachineInstr *MI);
+
/// Hoist - When an instruction is found to only use loop invariant operands
/// that is safe to hoist, this instruction is called to do the dirty work.
/// It returns true if the instruction is hoisted.
@@ -441,6 +473,12 @@ void MachineLICM::HoistRegionPostRA() {
const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
MachineBasicBlock *BB = Blocks[i];
+
+ // If the header of the loop containing this basic block is a landing pad,
+ // then don't try to hoist instructions out of this loop.
+ const MachineLoop *ML = MLI->getLoopFor(BB);
+ if (ML && ML->getHeader()->isLandingPad()) continue;
+
// Conservatively treat live-in's as an external def.
// FIXME: That means a reload that's reused in successor block(s) will not
// be LICM'ed.
@@ -452,6 +490,7 @@ void MachineLICM::HoistRegionPostRA() {
++PhysRegDefs[*AS];
}
+ SpeculationState = SpeculateUnknown;
for (MachineBasicBlock::iterator
MII = BB->begin(), E = BB->end(); MII != E; ++MII) {
MachineInstr *MI = &*MII;
@@ -545,6 +584,27 @@ void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) {
Changed = true;
}
+// IsGuaranteedToExecute - Check if this mbb is guaranteed to execute.
+// If not, then a load from this mbb may not be safe to hoist.
+bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) {
+ if (SpeculationState != SpeculateUnknown)
+ return SpeculationState == SpeculateFalse;
+
+ if (BB != CurLoop->getHeader()) {
+ // Check loop exiting blocks.
+ SmallVector<MachineBasicBlock*, 8> CurrentLoopExitingBlocks;
+ CurLoop->getExitingBlocks(CurrentLoopExitingBlocks);
+ for (unsigned i = 0, e = CurrentLoopExitingBlocks.size(); i != e; ++i)
+ if (!DT->dominates(BB, CurrentLoopExitingBlocks[i])) {
+ SpeculationState = SpeculateTrue;
+ return false;
+ }
+ }
+
+ SpeculationState = SpeculateFalse;
+ return true;
+}
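+// Illustrative example (editor's sketch, not part of the original patch):
+// in a loop shaped like
+//
+//   preheader -> header -> if.then -> latch -> header (back edge)
+//                header ------------> latch -> exit
+//
+// if.then does not dominate the exiting block latch, so iterations taking
+// the header->latch edge never execute it. Hoisting a load from if.then to
+// the preheader would speculate that load, and this function accordingly
+// returns false for if.then.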
+
/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
/// dominated by the specified block, and that are in the current loop) in depth
/// first order w.r.t the DominatorTree. This allows us to visit definitions
@@ -554,6 +614,11 @@ void MachineLICM::HoistRegion(MachineDomTreeNode *N, bool IsHeader) {
assert(N != 0 && "Null dominator tree node?");
MachineBasicBlock *BB = N->getBlock();
+ // If the header of the loop containing this basic block is a landing pad,
+ // then don't try to hoist instructions out of this loop.
+ const MachineLoop *ML = MLI->getLoopFor(BB);
+ if (ML && ML->getHeader()->isLandingPad()) return;
+
// If this subregion is not in the top level loop at all, exit.
if (!CurLoop->contains(BB)) return;
@@ -571,6 +636,7 @@ void MachineLICM::HoistRegion(MachineDomTreeNode *N, bool IsHeader) {
// Remember livein register pressure.
BackTrace.push_back(RegPressure);
+ SpeculationState = SpeculateUnknown;
for (MachineBasicBlock::iterator
MII = BB->begin(), E = BB->end(); MII != E; ) {
MachineBasicBlock::iterator NextMII = MII; ++NextMII;
@@ -596,6 +662,23 @@ static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) {
return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg());
}
+/// getRegisterClassIDAndCost - For a given MI, register, and the operand
+/// index, return the ID and cost of its representative register class.
+void
+MachineLICM::getRegisterClassIDAndCost(const MachineInstr *MI,
+ unsigned Reg, unsigned OpIdx,
+ unsigned &RCId, unsigned &RCCost) const {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ EVT VT = *RC->vt_begin();
+ if (VT == MVT::untyped) {
+ RCId = RC->getID();
+ RCCost = 1;
+ } else {
+ RCId = TLI->getRepRegClassFor(VT)->getID();
+ RCCost = TLI->getRepRegClassCostFor(VT);
+ }
+}
+
/// InitRegPressure - Find all virtual register references that are liveout of
/// the preheader to initialize the starting "register pressure". Note this
/// does not count live through (livein but not used) registers.
@@ -625,18 +708,17 @@ void MachineLICM::InitRegPressure(MachineBasicBlock *BB) {
continue;
bool isNew = RegSeen.insert(Reg);
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- EVT VT = *RC->vt_begin();
- unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ unsigned RCId, RCCost;
+ getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost);
if (MO.isDef())
- RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ RegPressure[RCId] += RCCost;
else {
bool isKill = isOperandKill(MO, MRI);
if (isNew && !isKill)
// Haven't seen this, it must be a livein.
- RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ RegPressure[RCId] += RCCost;
else if (!isNew && isKill)
- RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+ RegPressure[RCId] -= RCCost;
}
}
}
@@ -661,11 +743,8 @@ void MachineLICM::UpdateRegPressure(const MachineInstr *MI) {
if (MO.isDef())
Defs.push_back(Reg);
else if (!isNew && isOperandKill(MO, MRI)) {
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- EVT VT = *RC->vt_begin();
- unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
- unsigned RCCost = TLI->getRepRegClassCostFor(VT);
-
+ unsigned RCId, RCCost;
+ getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost);
if (RCCost > RegPressure[RCId])
RegPressure[RCId] = 0;
else
@@ -673,13 +752,13 @@ void MachineLICM::UpdateRegPressure(const MachineInstr *MI) {
}
}
+ unsigned Idx = 0;
while (!Defs.empty()) {
unsigned Reg = Defs.pop_back_val();
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- EVT VT = *RC->vt_begin();
- unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
- unsigned RCCost = TLI->getRepRegClassCostFor(VT);
+ unsigned RCId, RCCost;
+ getRegisterClassIDAndCost(MI, Reg, Idx, RCId, RCCost);
RegPressure[RCId] += RCCost;
+ ++Idx;
}
}
@@ -691,7 +770,14 @@ bool MachineLICM::IsLICMCandidate(MachineInstr &I) {
bool DontMoveAcrossStore = true;
if (!I.isSafeToMove(TII, AA, DontMoveAcrossStore))
return false;
-
+
+ // If it is a load, check that it is guaranteed to execute by making sure
+ // that it dominates all exiting blocks. If it doesn't, then there is a path
+ // out of the loop which does not execute this load, so we can't hoist it.
+ // Stores and side effects are already checked by isSafeToMove.
+ if (I.getDesc().mayLoad() && !IsGuaranteedToExecute(I.getParent()))
+ return false;
+
return true;
}
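
A hypothetical source-level loop showing the hazard this check closes: the
guarded load does not dominate the loop's exits, so hoisting it would execute
it on paths where the original program never touches p.

    // Not from the patch; sum_guarded is invented for illustration.
    int sum_guarded(const int *p, int n, bool (*cond)(int)) {
      int s = 0;
      for (int i = 0; i < n; ++i) {
        if (cond(i))
          s += *p; // conditional load: not guaranteed to execute
      }
      return s;    // if p is invalid whenever cond() is false, a hoisted
    }              // load of *p would fault where the original would not
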
@@ -879,10 +965,8 @@ void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- EVT VT = *RC->vt_begin();
- unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
- unsigned RCCost = TLI->getRepRegClassCostFor(VT);
+ unsigned RCId, RCCost;
+ getRegisterClassIDAndCost(MI, Reg, i, RCId, RCCost);
if (MO.isDef()) {
DenseMap<unsigned, int>::iterator CI = Cost.find(RCId);
if (CI != Cost.end())
@@ -941,16 +1025,15 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
unsigned Reg = MO.getReg();
if (!TargetRegisterInfo::isVirtualRegister(Reg))
continue;
+
+ unsigned RCId, RCCost;
+ getRegisterClassIDAndCost(&MI, Reg, i, RCId, RCCost);
if (MO.isDef()) {
if (HasHighOperandLatency(MI, i, Reg)) {
++NumHighLatency;
return true;
}
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- EVT VT = *RC->vt_begin();
- unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
- unsigned RCCost = TLI->getRepRegClassCostFor(VT);
DenseMap<unsigned, int>::iterator CI = Cost.find(RCId);
if (CI != Cost.end())
CI->second += RCCost;
@@ -960,10 +1043,6 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
// If a virtual register use is a kill, hoisting it out of the loop
// may actually reduce register pressure or be register pressure
// neutral.
- const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- EVT VT = *RC->vt_begin();
- unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
- unsigned RCCost = TLI->getRepRegClassCostFor(VT);
DenseMap<unsigned, int>::iterator CI = Cost.find(RCId);
if (CI != Cost.end())
CI->second -= RCCost;
@@ -979,6 +1058,13 @@ bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
return true;
}
+ // Do not "speculate" in a high register pressure situation. If an
+ // instruction is not guaranteed to be executed in the loop, it's best to be
+ // conservative.
+ if (AvoidSpeculation &&
+ (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI)))
+ return false;
+
// In a high register pressure situation, only hoist if the instruction is
// going to be remat'ed.
if (!TII->isTriviallyReMaterializable(&MI, AA) &&
@@ -1116,6 +1202,20 @@ bool MachineLICM::EliminateCSE(MachineInstr *MI,
return false;
}
+/// MayCSE - Return true if the given instruction will be CSE'd if it's
+/// hoisted out of the loop.
+bool MachineLICM::MayCSE(MachineInstr *MI) {
+ unsigned Opcode = MI->getOpcode();
+ DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+ CI = CSEMap.find(Opcode);
+ // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+ // the undef property onto uses.
+ if (CI == CSEMap.end() || MI->isImplicitDef())
+ return false;
+
+ return LookForDuplicate(MI, CI->second) != 0;
+}
+
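
Sketch (not LLVM code): MayCSE reduces to a bucket-by-opcode scan. Instr and
isDuplicate are simplified stand-ins for MachineInstr and LookForDuplicate.

    #include <unordered_map>
    #include <vector>

    struct Instr {
      unsigned Opcode;
      int Operand;
    };

    static bool isDuplicate(const Instr &A, const Instr &B) {
      return A.Opcode == B.Opcode && A.Operand == B.Operand;
    }

    // Hoisting is worthwhile when an identical instruction already sits
    // in the bucket for this opcode (i.e. it was hoisted earlier).
    static bool mayCSE(
        const Instr &MI,
        const std::unordered_map<unsigned, std::vector<Instr>> &CSEMap) {
      auto CI = CSEMap.find(MI.Opcode);
      if (CI == CSEMap.end())
        return false;
      for (const Instr &Prev : CI->second)
        if (isDuplicate(MI, Prev))
          return true;
      return false;
    }

    int main() {
      std::unordered_map<unsigned, std::vector<Instr>> CSEMap;
      CSEMap[7].push_back(Instr{7, 42});
      return mayCSE(Instr{7, 42}, CSEMap) ? 0 : 1; // exits 0
    }
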
/// Hoist - When an instruction is found to use only loop invariant operands
/// that are safe to hoist, this instruction is called to do the dirty work.
///
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index fadc594..80c4854 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -17,9 +17,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/PointerUnion.h"
#include "llvm/Support/Dwarf.h"
@@ -254,11 +252,12 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
//===----------------------------------------------------------------------===//
MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI,
- const TargetAsmInfo *TAI)
-: ImmutablePass(ID), Context(MAI, TAI),
- ObjFileMMI(0),
- CurCallSite(0), CallsEHReturn(0), CallsUnwindInit(0), DbgInfoAvailable(false),
- CallsExternalVAFunctionWithFloatingPointArguments(false) {
+ const MCRegisterInfo &MRI,
+ const MCObjectFileInfo *MOFI)
+ : ImmutablePass(ID), Context(MAI, MRI, MOFI),
+ ObjFileMMI(0), CompactUnwindEncoding(0), CurCallSite(0), CallsEHReturn(0),
+ CallsUnwindInit(0), DbgInfoAvailable(false),
+ CallsExternalVAFunctionWithFloatingPointArguments(false) {
initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
// Always emit some info, by default "no personality" info.
Personalities.push_back(NULL);
@@ -267,7 +266,8 @@ MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI,
}
MachineModuleInfo::MachineModuleInfo()
-: ImmutablePass(ID), Context(*(MCAsmInfo*)0, NULL) {
+ : ImmutablePass(ID),
+ Context(*(MCAsmInfo*)0, *(MCRegisterInfo*)0, (MCObjectFileInfo*)0) {
assert(0 && "This MachineModuleInfo constructor should never be called, MMI "
"should always be explicitly constructed by LLVMTargetMachine");
abort();
@@ -311,6 +311,7 @@ void MachineModuleInfo::EndFunction() {
FilterEnds.clear();
CallsEHReturn = 0;
CallsUnwindInit = 0;
+ CompactUnwindEncoding = 0;
VariableDbgInfo.clear();
}
@@ -426,8 +427,9 @@ void MachineModuleInfo::addPersonality(MachineBasicBlock *LandingPad,
/// addCatchTypeInfo - Provide the catch typeinfo for a landing pad.
///
-void MachineModuleInfo::addCatchTypeInfo(MachineBasicBlock *LandingPad,
- std::vector<const GlobalVariable *> &TyInfo) {
+void MachineModuleInfo::
+addCatchTypeInfo(MachineBasicBlock *LandingPad,
+ ArrayRef<const GlobalVariable *> TyInfo) {
LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
for (unsigned N = TyInfo.size(); N; --N)
LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1]));
@@ -435,8 +437,9 @@ void MachineModuleInfo::addCatchTypeInfo(MachineBasicBlock *LandingPad,
/// addFilterTypeInfo - Provide the filter typeinfo for a landing pad.
///
-void MachineModuleInfo::addFilterTypeInfo(MachineBasicBlock *LandingPad,
- std::vector<const GlobalVariable *> &TyInfo) {
+void MachineModuleInfo::
+addFilterTypeInfo(MachineBasicBlock *LandingPad,
+ ArrayRef<const GlobalVariable *> TyInfo) {
LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
std::vector<unsigned> IdsInFilter(TyInfo.size());
for (unsigned I = 0, E = TyInfo.size(); I != E; ++I)
@@ -496,6 +499,14 @@ void MachineModuleInfo::TidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
}
}
+/// setCallSiteLandingPad - Map the landing pad's EH symbol to the call site
+/// indexes.
+void MachineModuleInfo::setCallSiteLandingPad(MCSymbol *Sym,
+ ArrayRef<unsigned> Sites) {
+ for (unsigned I = 0, E = Sites.size(); I != E; ++I)
+ LPadToCallSiteMap[Sym].push_back(Sites[I]);
+}
+
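
Why the std::vector& to ArrayRef migration in this file helps, as a
standalone sketch (ArrayRefSketch is invented, not llvm::ArrayRef): a cheap
view type binds to a vector or a C array alike, so call sites no longer
materialize a std::vector just to satisfy the parameter type.

    #include <cstddef>
    #include <vector>

    template <typename T> class ArrayRefSketch {
      const T *Data;
      std::size_t Length;

    public:
      ArrayRefSketch(const std::vector<T> &V)
          : Data(V.data()), Length(V.size()) {}
      template <std::size_t N>
      ArrayRefSketch(const T (&A)[N]) : Data(A), Length(N) {}
      std::size_t size() const { return Length; }
      const T &operator[](std::size_t I) const { return Data[I]; }
    };

    static unsigned sum(ArrayRefSketch<unsigned> Sites) {
      unsigned S = 0;
      for (std::size_t I = 0, E = Sites.size(); I != E; ++I)
        S += Sites[I];
      return S;
    }

    int main() {
      std::vector<unsigned> V{1, 2, 3};
      unsigned A[] = {4, 5};
      return sum(V) + sum(A) == 15 ? 0 : 1; // both call sites compile
    }
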
/// getTypeIDFor - Return the type id for the specified typeinfo. This is
/// function wide.
unsigned MachineModuleInfo::getTypeIDFor(const GlobalVariable *TI) {
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 4b3e64c..266ebf6 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -14,10 +14,11 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI) {
+MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
+ : TRI(&TRI), IsSSA(true) {
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
UsedPhysRegs.resize(TRI.getNumRegs());
@@ -48,18 +49,47 @@ MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) {
const TargetRegisterClass *
MachineRegisterInfo::constrainRegClass(unsigned Reg,
- const TargetRegisterClass *RC) {
+ const TargetRegisterClass *RC,
+ unsigned MinNumRegs) {
const TargetRegisterClass *OldRC = getRegClass(Reg);
if (OldRC == RC)
return RC;
- const TargetRegisterClass *NewRC = getCommonSubClass(OldRC, RC);
- if (!NewRC)
+ const TargetRegisterClass *NewRC = TRI->getCommonSubClass(OldRC, RC);
+ if (!NewRC || NewRC == OldRC)
+ return NewRC;
+ if (NewRC->getNumRegs() < MinNumRegs)
return 0;
- if (NewRC != OldRC)
- setRegClass(Reg, NewRC);
+ setRegClass(Reg, NewRC);
return NewRC;
}
+bool
+MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) {
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ const TargetRegisterClass *OldRC = getRegClass(Reg);
+ const TargetRegisterClass *NewRC = TRI->getLargestLegalSuperClass(OldRC);
+
+ // Stop early if there is no room to grow.
+ if (NewRC == OldRC)
+ return false;
+
+ // Accumulate constraints from all uses.
+ for (reg_nodbg_iterator I = reg_nodbg_begin(Reg), E = reg_nodbg_end(); I != E;
+ ++I) {
+ // TRI doesn't have accurate enough information to model this yet.
+ if (I.getOperand().getSubReg())
+ return false;
+ const TargetRegisterClass *OpRC =
+ I->getRegClassConstraint(I.getOperandNo(), TII, TRI);
+ if (OpRC)
+ NewRC = TRI->getCommonSubClass(NewRC, OpRC);
+ if (!NewRC || NewRC == OldRC)
+ return false;
+ }
+ setRegClass(Reg, NewRC);
+ return true;
+}
+
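
Sketch of the constraint accumulation in recomputeRegClass, with register
classes modeled as bitmasks of allocatable registers so the common subclass
is a plain intersection. Hypothetical encoding; the real code queries TRI.

    #include <cstdint>
    #include <vector>

    typedef std::uint32_t RegClass; // bit i set => register i allocatable

    static bool recomputeClass(RegClass &RC, RegClass Largest,
                               const std::vector<RegClass> &UseConstraints) {
      RegClass NewRC = Largest;    // grow to the largest legal class...
      if (NewRC == RC)
        return false;              // ...stop early if there is no room
      for (RegClass OpRC : UseConstraints) {
        NewRC &= OpRC;             // intersect each use's constraint
        if (!NewRC || NewRC == RC)
          return false;
      }
      RC = NewRC;
      return true;
    }

    int main() {
      RegClass RC = 0x0F;
      std::vector<RegClass> Uses{0xFF, 0x3F};
      return (recomputeClass(RC, 0xFF, Uses) && RC == 0x3F) ? 0 : 1;
    }
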
/// createVirtualRegister - Create and return a new virtual register in the
/// function with the specified register class.
///
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 916dff7..29cfb49 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -382,6 +382,25 @@ static bool AvoidsSinking(MachineInstr *MI, MachineRegisterInfo *MRI) {
return MI->isInsertSubreg() || MI->isSubregToReg() || MI->isRegSequence();
}
+/// collectDebugValues - Scan instructions following MI and collect any
+/// matching DBG_VALUEs.
+static void collectDebugValues(MachineInstr *MI,
+ SmallVector<MachineInstr *, 2> & DbgValues) {
+ DbgValues.clear();
+ if (!MI->getOperand(0).isReg())
+ return;
+
+ MachineBasicBlock::iterator DI = MI; ++DI;
+ for (MachineBasicBlock::iterator DE = MI->getParent()->end();
+ DI != DE; ++DI) {
+ if (!DI->isDebugValue())
+ return;
+ if (DI->getOperand(0).isReg() &&
+ DI->getOperand(0).getReg() == MI->getOperand(0).getReg())
+ DbgValues.push_back(DI);
+ }
+}
+
/// SinkInstruction - Determine whether it is safe to sink the specified machine
/// instruction out of its current block into a successor.
bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
@@ -598,10 +617,22 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI())
++InsertPos;
+ // Collect matching debug values.
+ SmallVector<MachineInstr *, 2> DbgValuesToSink;
+ collectDebugValues(MI, DbgValuesToSink);
+
// Move the instruction.
SuccToSinkTo->splice(InsertPos, ParentBlock, MI,
++MachineBasicBlock::iterator(MI));
+ // Move debug values.
+ for (SmallVector<MachineInstr *, 2>::iterator DBI = DbgValuesToSink.begin(),
+ DBE = DbgValuesToSink.end(); DBI != DBE; ++DBI) {
+ MachineInstr *DbgMI = *DBI;
+ SuccToSinkTo->splice(InsertPos, ParentBlock, DbgMI,
+ ++MachineBasicBlock::iterator(DbgMI));
+ }
+
// Conservatively, clear any kill flags, since it's possible that they are no
// longer correct.
MI->clearKillInfo();
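
The sink-and-keep-DBG_VALUEs dance above can be modeled with
std::list::splice, which is conceptually what MachineBasicBlock::splice does
here. Standalone sketch; strings stand in for instructions.

    #include <iostream>
    #include <iterator>
    #include <list>
    #include <string>

    int main() {
      std::list<std::string> From{"def x", "DBG x", "use y"};
      std::list<std::string> To{"label:"};

      auto MI = From.begin();        // the instruction being sunk
      auto DI = std::next(MI);       // its trailing debug values
      To.splice(To.end(), From, MI); // move the instruction
      while (DI != From.end() && DI->compare(0, 3, "DBG") == 0) {
        auto Next = std::next(DI);
        To.splice(To.end(), From, DI); // keep each DBG entry adjacent
        DI = Next;
      }
      for (const auto &S : To)
        std::cout << S << '\n'; // label:, def x, DBG x
      return 0;
    }
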
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 7a55852..26847d3 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -72,6 +72,8 @@ namespace {
typedef DenseSet<unsigned> RegSet;
typedef DenseMap<unsigned, const MachineInstr*> RegMap;
+ const MachineInstr *FirstTerminator;
+
BitVector regsReserved;
RegSet regsLive;
RegVector regsDefined, regsDead, regsKilled;
@@ -389,6 +391,8 @@ static bool matchPair(MachineBasicBlock::const_succ_iterator i,
void
MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
+ FirstTerminator = 0;
+
// Count the number of landing pad successors.
SmallPtrSet<MachineBasicBlock*, 4> LandingPadSuccs;
for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
@@ -570,6 +574,18 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
}
+ // Ensure non-terminators don't follow terminators.
+ if (MCID.isTerminator()) {
+ if (!FirstTerminator)
+ FirstTerminator = MI;
+ } else if (FirstTerminator) {
+ report("Non-terminator instruction after the first terminator", MI);
+ *OS << "First terminator was:\t" << *FirstTerminator;
+ }
+
+ StringRef ErrorInfo;
+ if (!TII->verifyInstruction(MI, ErrorInfo))
+ report(ErrorInfo.data(), MI);
}
void
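
Sketch (Instr is a stand-in for MachineInstr): the FirstTerminator check
enforces that terminators form a contiguous suffix of each block.

    #include <vector>

    struct Instr {
      bool IsTerminator;
    };

    // True when no non-terminator follows the first terminator, which is
    // the invariant the verifier reports against above.
    static bool terminatorsLast(const std::vector<Instr> &Block) {
      const Instr *FirstTerminator = nullptr;
      for (const Instr &MI : Block) {
        if (MI.IsTerminator) {
          if (!FirstTerminator)
            FirstTerminator = &MI;
        } else if (FirstTerminator) {
          return false; // non-terminator after the first terminator
        }
      }
      return true;
    }

    int main() {
      std::vector<Instr> Good{{false}, {true}};
      std::vector<Instr> Bad{{true}, {false}};
      return (terminatorsLast(Good) && !terminatorsLast(Bad)) ? 0 : 1;
    }
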
@@ -686,6 +702,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
else
addRegWithSubRegs(regsDefined, Reg);
+ // Verify SSA form.
+ if (MRI->isSSA() && TargetRegisterInfo::isVirtualRegister(Reg) &&
+ llvm::next(MRI->def_begin(Reg)) != MRI->def_end())
+ report("Multiple virtual register defs in SSA form", MO, MONum);
+
// Check LiveInts for a live range, but only for virtual registers.
if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) &&
!LiveInts->isNotInMIMap(MI)) {
@@ -714,20 +735,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
unsigned SubIdx = MO->getSubReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
- unsigned sr = Reg;
if (SubIdx) {
- unsigned s = TRI->getSubReg(Reg, SubIdx);
- if (!s) {
- report("Invalid subregister index for physical register",
- MO, MONum);
- return;
- }
- sr = s;
+ report("Illegal subregister index for physical register", MO, MONum);
+ return;
}
if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) {
- if (!DRC->contains(sr)) {
+ if (!DRC->contains(Reg)) {
report("Illegal physical register for instruction", MO, MONum);
- *OS << TRI->getName(sr) << " is not a "
+ *OS << TRI->getName(Reg) << " is not a "
<< DRC->getName() << " register.\n";
}
}
@@ -735,16 +750,35 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
// Virtual register.
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
if (SubIdx) {
- const TargetRegisterClass *SRC = RC->getSubRegisterRegClass(SubIdx);
+ const TargetRegisterClass *SRC =
+ TRI->getSubClassWithSubReg(RC, SubIdx);
if (!SRC) {
report("Invalid subregister index for virtual register", MO, MONum);
*OS << "Register class " << RC->getName()
<< " does not support subreg index " << SubIdx << "\n";
return;
}
- RC = SRC;
+ if (RC != SRC) {
+ report("Invalid register class for subregister index", MO, MONum);
+ *OS << "Register class " << RC->getName()
+ << " does not fully support subreg index " << SubIdx << "\n";
+ return;
+ }
}
if (const TargetRegisterClass *DRC = TII->getRegClass(MCID,MONum,TRI)) {
+ if (SubIdx) {
+ const TargetRegisterClass *SuperRC =
+ TRI->getLargestLegalSuperClass(RC);
+ if (!SuperRC) {
+ report("No largest legal super class exists.", MO, MONum);
+ return;
+ }
+ DRC = TRI->getMatchingSuperRegClass(SuperRC, DRC, SubIdx);
+ if (!DRC) {
+ report("No matching super-reg register class.", MO, MONum);
+ return;
+ }
+ }
if (!RC->hasSuperClassEq(DRC)) {
report("Illegal virtual register for instruction", MO, MONum);
*OS << "Expected a " << DRC->getName() << " register, but got a "
@@ -1161,18 +1195,8 @@ void MachineVerifier::verifyLiveIntervals() {
SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI).getPrevSlot();
const VNInfo *PVNI = LI.getVNInfoAt(PEnd);
- if (VNI->isPHIDef() && VNI->def == LiveInts->getMBBStartIdx(MFI)) {
- if (PVNI && !PVNI->hasPHIKill()) {
- report("Value live out of predecessor doesn't have PHIKill", MF);
- *OS << "Valno #" << PVNI->id << " live out of BB#"
- << (*PI)->getNumber() << '@' << PEnd
- << " doesn't have PHIKill, but Valno #" << VNI->id
- << " is PHIDef and defined at the beginning of BB#"
- << MFI->getNumber() << '@' << LiveInts->getMBBStartIdx(MFI)
- << " in " << LI << '\n';
- }
+ if (VNI->isPHIDef() && VNI->def == LiveInts->getMBBStartIdx(MFI))
continue;
- }
if (!PVNI) {
report("Register not marked live out of predecessor", *PI);
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index af65f13..6994aa5 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -109,6 +109,9 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
+ // This pass takes the function out of SSA form.
+ MRI->leaveSSA();
+
// Split critical edges to help the coalescer
if (!DisableEdgeSplitting) {
if (LiveVariables *LV = getAnalysisIfAvailable<LiveVariables>()) {
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index c523e39..bbc7ce2 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -295,7 +295,6 @@ bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI,
if (!DefMI || !DefMI->getDesc().isBitcast())
return false;
- unsigned SrcDef = 0;
unsigned SrcSrc = 0;
NumDefs = DefMI->getDesc().getNumDefs();
NumSrcs = DefMI->getDesc().getNumOperands() - NumDefs;
@@ -308,13 +307,13 @@ bool PeepholeOptimizer::OptimizeBitcastInstr(MachineInstr *MI,
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- if (MO.isDef())
- SrcDef = Reg;
- else if (SrcSrc)
- // Multiple sources?
- return false;
- else
- SrcSrc = Reg;
+ if (!MO.isDef()) {
+ if (SrcSrc)
+ // Multiple sources?
+ return false;
+ else
+ SrcSrc = Reg;
+ }
}
if (MRI->getRegClass(SrcSrc) != MRI->getRegClass(Def))
@@ -434,6 +433,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
if (MCID.isBitcast()) {
if (OptimizeBitcastInstr(MI, MBB)) {
// MI is deleted.
+ LocalMIs.erase(MI);
Changed = true;
MII = First ? I->begin() : llvm::next(PMII);
continue;
@@ -441,6 +441,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
} else if (MCID.isCompare()) {
if (OptimizeCmpInstr(MI, MBB)) {
// MI is deleted.
+ LocalMIs.erase(MI);
Changed = true;
MII = First ? I->begin() : llvm::next(PMII);
continue;
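
A minimal reproduction (invented types) of the stale-pointer hazard the two
LocalMIs.erase() calls close: once MI is freed, a leftover entry in the side
table would later be compared or dereferenced.

    #include <set>

    struct Instr { int Opcode; };

    static void eraseInstr(std::set<Instr *> &LocalMIs, Instr *MI) {
      LocalMIs.erase(MI); // drop the cached pointer first...
      delete MI;          // ...then free the instruction
    }

    int main() {
      std::set<Instr *> LocalMIs;
      Instr *MI = new Instr();
      LocalMIs.insert(MI);
      eraseInstr(LocalMIs, MI);
      return LocalMIs.empty() ? 0 : 1; // table stays consistent
    }
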
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index c04d656..b1d8c97 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -125,8 +125,14 @@ bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &fn) {
LiveVariables::VarInfo& vi = LV->getVarInfo(MO.getReg());
vi.removeKill(MI);
}
+ unsigned Reg = MI->getOperand(0).getReg();
MI->eraseFromParent();
Changed = true;
+
+ // A REG_SEQUENCE may have been expanded into partial definitions.
+ // If this was the last one, mark Reg as implicitly defined.
+ if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI->def_empty(Reg))
+ ImpDefRegs.insert(Reg);
continue;
}
}
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index a901c5f..32c9325 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
@@ -54,6 +55,8 @@ INITIALIZE_PASS_END(PEI, "prologepilog",
STATISTIC(NumVirtualFrameRegs, "Number of virtual frame regs encountered");
STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
+STATISTIC(NumBytesStackSpace,
+ "Number of bytes used for stack in all functions");
/// createPrologEpilogCodeInserter - This function returns a pass that inserts
/// prolog and epilog code, and eliminates abstract frame references.
@@ -677,7 +680,9 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
}
// Update frame info to pretend that this is part of the stack...
- MFI->setStackSize(Offset - LocalAreaOffset);
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI->setStackSize(StackSize);
+ NumBytesStackSpace += StackSize;
}
/// insertPrologEpilogCode - Scan the function for modified callee saved
@@ -696,6 +701,13 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
if (!I->empty() && I->back().getDesc().isReturn())
TFI.emitEpilogue(Fn, *I);
}
+
+ // Emit additional code that is required to support segmented stacks, if
+ // we've been asked for it. This, when linked with a runtime with support
+ // for segmented stacks (libgcc is one), will result in allocating stack
+ // space in small chunks instead of one large contiguous block.
+ if (EnableSegmentedStacks)
+ TFI.adjustForSegmentedStacks(Fn);
}
/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index 5ea26ad..5496d69 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -20,7 +20,6 @@
#include "RenderMachineFunction.h"
#include "Spiller.h"
#include "VirtRegMap.h"
-#include "RegisterCoalescer.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -160,7 +159,7 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveDebugVariables>();
if (StrongPHIElim)
AU.addRequiredID(StrongPHIEliminationID);
- AU.addRequiredTransitive<RegisterCoalescer>();
+ AU.addRequiredTransitiveID(RegisterCoalescerPassID);
AU.addRequired<CalculateSpillWeights>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
@@ -439,6 +438,7 @@ void RegAllocBase::addMBBLiveIns(MachineFunction *MF) {
LiveIntervalUnion &LiveUnion = PhysReg2LiveUnion[PhysReg];
if (LiveUnion.empty())
continue;
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " live-in:");
MachineFunction::iterator MBB = llvm::next(MF->begin());
MachineFunction::iterator MFE = MF->end();
SlotIndex Start, Stop;
@@ -449,6 +449,8 @@ void RegAllocBase::addMBBLiveIns(MachineFunction *MF) {
if (SI.start() <= Start) {
if (!MBB->isLiveIn(PhysReg))
MBB->addLiveIn(PhysReg);
+ DEBUG(dbgs() << "\tBB#" << MBB->getNumber() << ':'
+ << PrintReg(SI.value()->reg, TRI));
} else if (SI.start() > Stop)
MBB = Indexes->getMBBFromIndex(SI.start().getPrevIndex());
if (++MBB == MFE)
@@ -456,6 +458,7 @@ void RegAllocBase::addMBBLiveIns(MachineFunction *MF) {
tie(Start, Stop) = Indexes->getMBBRange(MBB);
SI.advanceTo(Start);
}
+ DEBUG(dbgs() << '\n');
}
}
@@ -495,8 +498,9 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
// Found an available register.
return PhysReg;
}
+ Queries[interfReg].collectInterferingVRegs(1);
LiveInterval *interferingVirtReg =
- Queries[interfReg].firstInterference().liveUnionPos().value();
+ Queries[interfReg].interferingVRegs().front();
// The current VirtReg must either be spillable, or one of its interferences
// must have less spill weight.
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index e235e87..f54a2c8 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -22,7 +22,6 @@
#include "SpillPlacement.h"
#include "SplitKit.h"
#include "VirtRegMap.h"
-#include "RegisterCoalescer.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Function.h"
@@ -38,6 +37,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -51,6 +51,15 @@ STATISTIC(NumGlobalSplits, "Number of split global live ranges");
STATISTIC(NumLocalSplits, "Number of split local live ranges");
STATISTIC(NumEvicted, "Number of interferences evicted");
+static cl::opt<SplitEditor::ComplementSpillMode>
+SplitSpillMode("split-spill-mode", cl::Hidden,
+ cl::desc("Spill mode for splitting live ranges"),
+ cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+ clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+ clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+ clEnumValEnd),
+ cl::init(SplitEditor::SM_Partition));
+
static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
createGreedyRegisterAllocator);
@@ -90,12 +99,26 @@ class RAGreedy : public MachineFunctionPass,
// range splitting algorithm terminates, something that is otherwise hard to
// ensure.
enum LiveRangeStage {
- RS_New, ///< Never seen before.
- RS_First, ///< First time in the queue.
- RS_Second, ///< Second time in the queue.
- RS_Global, ///< Produced by global splitting.
- RS_Local, ///< Produced by local splitting.
- RS_Spill ///< Produced by spilling.
+ /// Newly created live range that has never been queued.
+ RS_New,
+
+ /// Only attempt assignment and eviction. Then requeue as RS_Split.
+ RS_Assign,
+
+ /// Attempt live range splitting if assignment is impossible.
+ RS_Split,
+
+ /// Attempt more aggressive live range splitting that is guaranteed to make
+ /// progress. This is used for split products that may not be making
+ /// progress.
+ RS_Split2,
+
+ /// Live range will be spilled. No more splitting will be attempted.
+ RS_Spill,
+
+ /// There is nothing more we can do to this live range. Abort compilation
+ /// if it can't be assigned.
+ RS_Done
};
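
Loosely, the allocator pushes each live range down this ladder toward
RS_Done, which is what bounds requeueing. A toy one-way progression;
advance() is invented and the real transitions are richer than this.

    enum Stage { RS_New, RS_Assign, RS_Split, RS_Split2, RS_Spill, RS_Done };

    static Stage advance(Stage S) {
      return S == RS_Done ? RS_Done : static_cast<Stage>(S + 1);
    }

    int main() {
      Stage S = RS_New;
      int Steps = 0;
      while (S != RS_Done) { // terminates after at most five steps
        S = advance(S);
        ++Steps;
      }
      return Steps == 5 ? 0 : 1;
    }
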
static const char *const StageName[];
@@ -157,17 +180,38 @@ class RAGreedy : public MachineFunctionPass,
/// Global live range splitting candidate info.
struct GlobalSplitCandidate {
+ // Register intended for assignment, or 0.
unsigned PhysReg;
+
+ // SplitKit interval index for this candidate.
+ unsigned IntvIdx;
+
+ // Interference for PhysReg.
InterferenceCache::Cursor Intf;
+
+ // Bundles where this candidate should be live.
BitVector LiveBundles;
SmallVector<unsigned, 8> ActiveBlocks;
void reset(InterferenceCache &Cache, unsigned Reg) {
PhysReg = Reg;
+ IntvIdx = 0;
Intf.setPhysReg(Cache, Reg);
LiveBundles.clear();
ActiveBlocks.clear();
}
+
+ // Set B[i] = C for every live bundle where B[i] was NoCand.
+ unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
+ unsigned Count = 0;
+ for (int i = LiveBundles.find_first(); i >= 0;
+ i = LiveBundles.find_next(i))
+ if (B[i] == NoCand) {
+ B[i] = C;
+ Count++;
+ }
+ return Count;
+ }
};
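
getBundles is a claim-if-unassigned scan over the live bundles. Sketch with
std::vector<bool> standing in for BitVector and its find_first/find_next.

    #include <vector>

    static const unsigned NoCand = ~0u;

    static unsigned getBundles(const std::vector<bool> &LiveBundles,
                               std::vector<unsigned> &B, unsigned C) {
      unsigned Count = 0;
      for (unsigned i = 0, e = LiveBundles.size(); i != e; ++i)
        if (LiveBundles[i] && B[i] == NoCand) {
          B[i] = C; // claim this bundle for candidate C
          ++Count;
        }
      return Count;
    }

    int main() {
      std::vector<bool> Live(4, false);
      Live[1] = Live[3] = true;
      std::vector<unsigned> B(4, NoCand);
      B[3] = 0; // bundle 3 already belongs to another candidate
      return getBundles(Live, B, 1) == 1 ? 0 : 1; // only bundle 1 claimed
    }
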
/// Candidate info for each PhysReg in AllocationOrder.
@@ -175,6 +219,12 @@ class RAGreedy : public MachineFunctionPass,
/// class.
SmallVector<GlobalSplitCandidate, 32> GlobalCand;
+ enum { NoCand = ~0u };
+
+ /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
+ /// NoCand which indicates the stack interval.
+ SmallVector<unsigned, 32> BundleCand;
+
public:
RAGreedy();
@@ -208,8 +258,8 @@ private:
void addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
void growRegion(GlobalSplitCandidate &Cand);
float calcGlobalSplitCost(GlobalSplitCandidate&);
- void splitAroundRegion(LiveInterval&, GlobalSplitCandidate&,
- SmallVectorImpl<LiveInterval*>&);
+ bool calcCompactRegion(GlobalSplitCandidate&);
+ void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>);
void calcGapWeights(unsigned, SmallVectorImpl<float>&);
bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&);
@@ -222,6 +272,8 @@ private:
SmallVectorImpl<LiveInterval*>&, unsigned = ~0u);
unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
SmallVectorImpl<LiveInterval*>&);
+ unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<LiveInterval*>&);
unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
SmallVectorImpl<LiveInterval*>&);
unsigned trySplit(LiveInterval&, AllocationOrder&,
@@ -233,12 +285,12 @@ char RAGreedy::ID = 0;
#ifndef NDEBUG
const char *const RAGreedy::StageName[] = {
- "RS_New",
- "RS_First",
- "RS_Second",
- "RS_Global",
- "RS_Local",
- "RS_Spill"
+ "RS_New",
+ "RS_Assign",
+ "RS_Split",
+ "RS_Split2",
+ "RS_Spill",
+ "RS_Done"
};
#endif
@@ -278,7 +330,7 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<LiveDebugVariables>();
if (StrongPHIElim)
AU.addRequiredID(StrongPHIEliminationID);
- AU.addRequiredTransitive<RegisterCoalescer>();
+ AU.addRequiredTransitiveID(RegisterCoalescerPassID);
AU.addRequired<CalculateSpillWeights>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
@@ -325,9 +377,15 @@ void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) {
}
void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) {
+ // Cloning a register we haven't even heard about yet? Just ignore it.
+ if (!ExtraRegInfo.inBounds(Old))
+ return;
+
// LRE may clone a virtual register because dead code elimination causes it to
- // be split into connected components. Ensure that the new register gets the
- // same stage as the parent.
+ // be split into connected components. The new components are much smaller
+ // than the original, so they should get a new chance at being assigned.
+ ExtraRegInfo[Old].Stage = RS_Assign;
ExtraRegInfo.grow(New);
ExtraRegInfo[New] = ExtraRegInfo[Old];
}
@@ -350,16 +408,15 @@ void RAGreedy::enqueue(LiveInterval *LI) {
ExtraRegInfo.grow(Reg);
if (ExtraRegInfo[Reg].Stage == RS_New)
- ExtraRegInfo[Reg].Stage = RS_First;
+ ExtraRegInfo[Reg].Stage = RS_Assign;
- if (ExtraRegInfo[Reg].Stage == RS_Second)
+ if (ExtraRegInfo[Reg].Stage == RS_Split) {
// Unsplit ranges that couldn't be allocated immediately are deferred until
- // everything else has been allocated. Long ranges are allocated last so
- // they are split against realistic interference.
- Prio = (1u << 31) - Size;
- else {
- // Everything else is allocated in long->short order. Long ranges that don't
- // fit should be spilled ASAP so they don't create interference.
+ // everything else has been allocated.
+ Prio = Size;
+ } else {
+ // Everything is allocated in long->short order. Long ranges that don't fit
+ // should be spilled (or split) ASAP so they don't create interference.
Prio = (1u << 31) + Size;
// Boost ranges that have a physical register hint.
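
The new priority encoding, as a sketch (the real code also boosts hinted
ranges): bit 31 makes every non-deferred range outrank every deferred
RS_Split range, and within each group longer ranges come out of the priority
queue first.

    #include <cstdio>

    static unsigned priority(bool Deferred, unsigned Size) {
      return Deferred ? Size : (1u << 31) + Size;
    }

    int main() {
      unsigned Hot = priority(false, 16);          // allocated early
      unsigned Waiting = priority(true, 1u << 30); // second round
      std::printf("%d\n", Hot > Waiting);          // prints 1
      return 0;
    }
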
@@ -442,7 +499,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
/// @param BreaksHint True when B is already assigned to its preferred register.
bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
LiveInterval &B, bool BreaksHint) {
- bool CanSplit = getStage(B) <= RS_Second;
+ bool CanSplit = getStage(B) < RS_Spill;
// Be fairly aggressive about following hints as long as the evictee can be
// split.
@@ -487,7 +544,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
if (TargetRegisterInfo::isPhysicalRegister(Intf->reg))
return false;
// Never evict spill products. They cannot split or spill.
- if (getStage(*Intf) == RS_Spill)
+ if (getStage(*Intf) == RS_Done)
return false;
// Once a live range becomes small enough, it is urgent that we find a
// register for it. This is indicated by an infinite spill weight. These
@@ -627,6 +684,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
Intf.moveToBlock(BC.Number);
BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
BC.Exit = BI.LiveOut ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
+ BC.ChangesValue = BI.FirstDef;
if (!Intf.hasInterference())
continue;
@@ -638,9 +696,9 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
if (BI.LiveIn) {
if (Intf.first() <= Indexes->getMBBStartIdx(BC.Number))
BC.Entry = SpillPlacement::MustSpill, ++Ins;
- else if (Intf.first() < BI.FirstUse)
+ else if (Intf.first() < BI.FirstInstr)
BC.Entry = SpillPlacement::PrefSpill, ++Ins;
- else if (Intf.first() < BI.LastUse)
+ else if (Intf.first() < BI.LastInstr)
++Ins;
}
@@ -648,9 +706,9 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
if (BI.LiveOut) {
if (Intf.last() >= SA->getLastSplitPoint(BC.Number))
BC.Exit = SpillPlacement::MustSpill, ++Ins;
- else if (Intf.last() > BI.LastUse)
+ else if (Intf.last() > BI.LastInstr)
BC.Exit = SpillPlacement::PrefSpill, ++Ins;
- else if (Intf.last() > BI.FirstUse)
+ else if (Intf.last() > BI.FirstInstr)
++Ins;
}
@@ -684,7 +742,7 @@ void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
assert(T < GroupSize && "Array overflow");
TBS[T] = Number;
if (++T == GroupSize) {
- SpillPlacer->addLinks(ArrayRef<unsigned>(TBS, T));
+ SpillPlacer->addLinks(makeArrayRef(TBS, T));
T = 0;
}
continue;
@@ -714,7 +772,7 @@ void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
ArrayRef<SpillPlacement::BlockConstraint> Array(BCS, B);
SpillPlacer->addConstraints(Array);
- SpillPlacer->addLinks(ArrayRef<unsigned>(TBS, T));
+ SpillPlacer->addLinks(makeArrayRef(TBS, T));
}
void RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
@@ -749,8 +807,16 @@ void RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
// Any new blocks to add?
if (ActiveBlocks.size() == AddedTo)
break;
- addThroughConstraints(Cand.Intf,
- ArrayRef<unsigned>(ActiveBlocks).slice(AddedTo));
+
+ // Compute through constraints from the interference, or assume that all
+ // through blocks prefer spilling when forming compact regions.
+ ArrayRef<unsigned> NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo);
+ if (Cand.PhysReg)
+ addThroughConstraints(Cand.Intf, NewBlocks);
+ else
+ // Provide a strong negative bias on through blocks to prevent unwanted
+ // liveness on loop backedges.
+ SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true);
AddedTo = ActiveBlocks.size();
// Perhaps iterating can enable more bundles?
@@ -759,11 +825,55 @@ void RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
DEBUG(dbgs() << ", v=" << Visited);
}
+/// calcCompactRegion - Compute the set of edge bundles that should be live
+/// when splitting the current live range into compact regions. Compact
+/// regions can be computed without looking at interference. They are the
+/// regions formed by removing all the live-through blocks from the live range.
+///
+/// Returns false if the current live range is already compact, or if the
+/// compact regions would form single block regions anyway.
+bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
+ // Without any through blocks, the live range is already compact.
+ if (!SA->getNumThroughBlocks())
+ return false;
+
+ // Compact regions don't correspond to any physreg.
+ Cand.reset(IntfCache, 0);
+
+ DEBUG(dbgs() << "Compact region bundles");
+
+ // Use the spill placer to determine the live bundles. GrowRegion pretends
+ // that all the through blocks have interference when PhysReg is unset.
+ SpillPlacer->prepare(Cand.LiveBundles);
+
+ // The static split cost will be zero since Cand.Intf reports no interference.
+ float Cost;
+ if (!addSplitConstraints(Cand.Intf, Cost)) {
+ DEBUG(dbgs() << ", none.\n");
+ return false;
+ }
+
+ growRegion(Cand);
+ SpillPlacer->finish();
+
+ if (!Cand.LiveBundles.any()) {
+ DEBUG(dbgs() << ", none.\n");
+ return false;
+ }
+
+ DEBUG({
+ for (int i = Cand.LiveBundles.find_first(); i>=0;
+ i = Cand.LiveBundles.find_next(i))
+ dbgs() << " EB#" << i;
+ dbgs() << ".\n";
+ });
+ return true;
+}
+
/// calcSpillCost - Compute how expensive it would be to split the live range in
/// SA around all use blocks instead of forming bundle regions.
float RAGreedy::calcSpillCost() {
float Cost = 0;
- const LiveInterval &LI = SA->getParent();
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (unsigned i = 0; i != UseBlocks.size(); ++i) {
const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
@@ -772,16 +882,8 @@ float RAGreedy::calcSpillCost() {
Cost += SpillPlacer->getBlockFrequency(Number);
// Unless the value is redefined in the block.
- if (BI.LiveIn && BI.LiveOut) {
- SlotIndex Start, Stop;
- tie(Start, Stop) = Indexes->getMBBRange(Number);
- LiveInterval::const_iterator I = LI.find(Start);
- assert(I != LI.end() && "Expected live-in value");
- // Is there a different live-out value? If so, we need an extra spill
- // instruction.
- if (I->end < Stop)
- Cost += SpillPlacer->getBlockFrequency(Number);
- }
+ if (BI.LiveIn && BI.LiveOut && BI.FirstDef)
+ Cost += SpillPlacer->getBlockFrequency(Number);
}
return Cost;
}
@@ -828,81 +930,115 @@ float RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
return GlobalCost;
}
-/// splitAroundRegion - Split VirtReg around the region determined by
-/// LiveBundles. Make an effort to avoid interference from PhysReg.
+/// splitAroundRegion - Split the current live range around the regions
+/// determined by BundleCand and GlobalCand.
///
-/// The 'register' interval is going to contain as many uses as possible while
-/// avoiding interference. The 'stack' interval is the complement constructed by
-/// SplitEditor. It will contain the rest.
+/// Before calling this function, GlobalCand and BundleCand must be initialized
+/// so each bundle is assigned to a valid candidate, or NoCand for the
+/// stack-bound bundles. The shared SA/SE SplitAnalysis and SplitEditor
+/// objects must be initialized for the current live range, and intervals
+/// created for the used candidates.
///
-void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
- GlobalSplitCandidate &Cand,
- SmallVectorImpl<LiveInterval*> &NewVRegs) {
- const BitVector &LiveBundles = Cand.LiveBundles;
-
- DEBUG({
- dbgs() << "Splitting around region for " << PrintReg(Cand.PhysReg, TRI)
- << " with bundles";
- for (int i = LiveBundles.find_first(); i>=0; i = LiveBundles.find_next(i))
- dbgs() << " EB#" << i;
- dbgs() << ".\n";
- });
-
- InterferenceCache::Cursor &Intf = Cand.Intf;
- LiveRangeEdit LREdit(VirtReg, NewVRegs, this);
- SE->reset(LREdit);
-
- // Create the main cross-block interval.
- const unsigned MainIntv = SE->openIntv();
+/// @param LREdit The LiveRangeEdit object handling the current split.
+/// @param UsedCands List of used GlobalCand entries. Every BundleCand value
+/// must appear in this list.
+void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
+ ArrayRef<unsigned> UsedCands) {
+ // These are the intervals created for new global ranges. We may create more
+ // intervals for local ranges.
+ const unsigned NumGlobalIntvs = LREdit.size();
+ DEBUG(dbgs() << "splitAroundRegion with " << NumGlobalIntvs << " globals.\n");
+ assert(NumGlobalIntvs && "No global intervals configured");
+
+ // Isolate even single instructions when dealing with a proper sub-class.
+ // That guarantees register class inflation for the stack interval because it
+ // is all copies.
+ unsigned Reg = SA->getParent().reg;
+ bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
// First handle all the blocks with uses.
ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
for (unsigned i = 0; i != UseBlocks.size(); ++i) {
const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
- bool RegIn = BI.LiveIn &&
- LiveBundles[Bundles->getBundle(BI.MBB->getNumber(), 0)];
- bool RegOut = BI.LiveOut &&
- LiveBundles[Bundles->getBundle(BI.MBB->getNumber(), 1)];
+ unsigned Number = BI.MBB->getNumber();
+ unsigned IntvIn = 0, IntvOut = 0;
+ SlotIndex IntfIn, IntfOut;
+ if (BI.LiveIn) {
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+ if (CandIn != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+ IntvIn = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfIn = Cand.Intf.first();
+ }
+ }
+ if (BI.LiveOut) {
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+ if (CandOut != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+ IntvOut = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfOut = Cand.Intf.last();
+ }
+ }
// Create separate intervals for isolated blocks with multiple uses.
- if (!RegIn && !RegOut) {
+ if (!IntvIn && !IntvOut) {
DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " isolated.\n");
- if (!BI.isOneInstr()) {
+ if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
SE->splitSingleBlock(BI);
- SE->selectIntv(MainIntv);
- }
continue;
}
- Intf.moveToBlock(BI.MBB->getNumber());
-
- if (RegIn && RegOut)
- SE->splitLiveThroughBlock(BI.MBB->getNumber(),
- MainIntv, Intf.first(),
- MainIntv, Intf.last());
- else if (RegIn)
- SE->splitRegInBlock(BI, MainIntv, Intf.first());
+ if (IntvIn && IntvOut)
+ SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+ else if (IntvIn)
+ SE->splitRegInBlock(BI, IntvIn, IntfIn);
else
- SE->splitRegOutBlock(BI, MainIntv, Intf.last());
+ SE->splitRegOutBlock(BI, IntvOut, IntfOut);
}
- // Handle live-through blocks.
- for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) {
- unsigned Number = Cand.ActiveBlocks[i];
- bool RegIn = LiveBundles[Bundles->getBundle(Number, 0)];
- bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)];
- if (!RegIn && !RegOut)
- continue;
- Intf.moveToBlock(Number);
- SE->splitLiveThroughBlock(Number, RegIn ? MainIntv : 0, Intf.first(),
- RegOut ? MainIntv : 0, Intf.last());
+ // Handle live-through blocks. The relevant live-through blocks are stored in
+ // the ActiveBlocks list with each candidate. We need to filter out
+ // duplicates.
+ BitVector Todo = SA->getThroughBlocks();
+ for (unsigned c = 0; c != UsedCands.size(); ++c) {
+ ArrayRef<unsigned> Blocks = GlobalCand[UsedCands[c]].ActiveBlocks;
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ unsigned Number = Blocks[i];
+ if (!Todo.test(Number))
+ continue;
+ Todo.reset(Number);
+
+ unsigned IntvIn = 0, IntvOut = 0;
+ SlotIndex IntfIn, IntfOut;
+
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+ if (CandIn != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+ IntvIn = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfIn = Cand.Intf.first();
+ }
+
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+ if (CandOut != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+ IntvOut = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfOut = Cand.Intf.last();
+ }
+ if (!IntvIn && !IntvOut)
+ continue;
+ SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+ }
}
++NumGlobalSplits;
SmallVector<unsigned, 8> IntvMap;
SE->finish(&IntvMap);
- DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+ DebugVars->splitRegister(Reg, LREdit.regs());
ExtraRegInfo.resize(MRI->getNumVirtRegs());
unsigned OrigBlocks = SA->getNumLiveBlocks();
@@ -922,18 +1058,18 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
// Remainder interval. Don't try splitting again, spill if it doesn't
// allocate.
if (IntvMap[i] == 0) {
- setStage(Reg, RS_Global);
+ setStage(Reg, RS_Spill);
continue;
}
- // Main interval. Allow repeated splitting as long as the number of live
+ // Global intervals. Allow repeated splitting as long as the number of live
// blocks is strictly decreasing.
- if (IntvMap[i] == MainIntv) {
+ if (IntvMap[i] < NumGlobalIntvs) {
if (SA->countLiveBlocks(&Reg) >= OrigBlocks) {
DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
<< " blocks as original.\n");
// Don't allow repeated splitting as a safe guard against looping.
- setStage(Reg, RS_Global);
+ setStage(Reg, RS_Split2);
}
continue;
}
@@ -948,11 +1084,23 @@ void RAGreedy::splitAroundRegion(LiveInterval &VirtReg,
unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
SmallVectorImpl<LiveInterval*> &NewVRegs) {
- float BestCost = Hysteresis * calcSpillCost();
- DEBUG(dbgs() << "Cost of isolating all blocks = " << BestCost << '\n');
- const unsigned NoCand = ~0u;
- unsigned BestCand = NoCand;
unsigned NumCands = 0;
+ unsigned BestCand = NoCand;
+ float BestCost;
+ SmallVector<unsigned, 8> UsedCands;
+
+ // Check if we can split this live range around a compact region.
+ bool HasCompact = calcCompactRegion(GlobalCand.front());
+ if (HasCompact) {
+ // Yes, keep GlobalCand[0] as the compact region candidate.
+ NumCands = 1;
+ BestCost = HUGE_VALF;
+ } else {
+ // No benefit from the compact region, our fallback will be per-block
+ // splitting. Make sure we find a solution that is cheaper than spilling.
+ BestCost = Hysteresis * calcSpillCost();
+ DEBUG(dbgs() << "Cost of isolating all blocks = " << BestCost << '\n');
+ }
Order.rewind();
while (unsigned PhysReg = Order.next()) {
@@ -962,7 +1110,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
unsigned WorstCount = ~0u;
unsigned Worst = 0;
for (unsigned i = 0; i != NumCands; ++i) {
- if (i == BestCand)
+ if (i == BestCand || !GlobalCand[i].PhysReg)
continue;
unsigned Count = GlobalCand[i].LiveBundles.count();
if (Count < WorstCount)
@@ -1019,15 +1167,94 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
++NumCands;
}
- if (BestCand == NoCand)
+ // No solutions found, fall back to single block splitting.
+ if (!HasCompact && BestCand == NoCand)
return 0;
- splitAroundRegion(VirtReg, GlobalCand[BestCand], NewVRegs);
+ // Prepare split editor.
+ LiveRangeEdit LREdit(VirtReg, NewVRegs, this);
+ SE->reset(LREdit, SplitSpillMode);
+
+ // Assign all edge bundles to the preferred candidate, or NoCand.
+ BundleCand.assign(Bundles->getNumBundles(), NoCand);
+
+ // Assign bundles for the best candidate region.
+ if (BestCand != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[BestCand];
+ if (unsigned B = Cand.getBundles(BundleCand, BestCand)) {
+ UsedCands.push_back(BestCand);
+ Cand.IntvIdx = SE->openIntv();
+ DEBUG(dbgs() << "Split for " << PrintReg(Cand.PhysReg, TRI) << " in "
+ << B << " bundles, intv " << Cand.IntvIdx << ".\n");
+ (void)B;
+ }
+ }
+
+ // Assign bundles for the compact region.
+ if (HasCompact) {
+ GlobalSplitCandidate &Cand = GlobalCand.front();
+ assert(!Cand.PhysReg && "Compact region has no physreg");
+ if (unsigned B = Cand.getBundles(BundleCand, 0)) {
+ UsedCands.push_back(0);
+ Cand.IntvIdx = SE->openIntv();
+ DEBUG(dbgs() << "Split for compact region in " << B << " bundles, intv "
+ << Cand.IntvIdx << ".\n");
+ (void)B;
+ }
+ }
+
+ splitAroundRegion(LREdit, UsedCands);
return 0;
}
//===----------------------------------------------------------------------===//
+// Per-Block Splitting
+//===----------------------------------------------------------------------===//
+
+/// tryBlockSplit - Split a global live range around every block with uses. This
+/// creates a lot of local live ranges that will be split by tryLocalSplit if
+/// they don't allocate.
+unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<LiveInterval*> &NewVRegs) {
+ assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
+ unsigned Reg = VirtReg.reg;
+ bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
+ LiveRangeEdit LREdit(VirtReg, NewVRegs, this);
+ SE->reset(LREdit, SplitSpillMode);
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+ if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
+ SE->splitSingleBlock(BI);
+ }
+ // No blocks were split.
+ if (LREdit.empty())
+ return 0;
+
+ // We did split for some blocks.
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+
+ // Tell LiveDebugVariables about the new ranges.
+ DebugVars->splitRegister(Reg, LREdit.regs());
+
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+
+ // Sort out the new intervals created by splitting. The remainder interval
+ // goes straight to spilling, the new local ranges get to stay RS_New.
+ for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
+ LiveInterval &LI = *LREdit.get(i);
+ if (getStage(LI) == RS_New && IntvMap[i] == 0)
+ setStage(LI, RS_Spill);
+ }
+
+ if (VerifyEnabled)
+ MF->verify(this, "After splitting live range around basic blocks");
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
// Local Splitting
//===----------------------------------------------------------------------===//
@@ -1045,8 +1272,10 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
const unsigned NumGaps = Uses.size()-1;
// Start and end points for the interference check.
- SlotIndex StartIdx = BI.LiveIn ? BI.FirstUse.getBaseIndex() : BI.FirstUse;
- SlotIndex StopIdx = BI.LiveOut ? BI.LastUse.getBoundaryIndex() : BI.LastUse;
+ SlotIndex StartIdx =
+ BI.LiveIn ? BI.FirstInstr.getBaseIndex() : BI.FirstInstr;
+ SlotIndex StopIdx =
+ BI.LiveOut ? BI.LastInstr.getBoundaryIndex() : BI.LastInstr;
GapWeight.assign(NumGaps, 0.0f);
@@ -1056,8 +1285,8 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
.checkInterference())
continue;
- // We know that VirtReg is a continuous interval from FirstUse to LastUse,
- // so we don't need InterferenceQuery.
+ // We know that VirtReg is a continuous interval from FirstInstr to
+ // LastInstr, so we don't need InterferenceQuery.
//
// Interference that overlaps an instruction is counted in both gaps
// surrounding the instruction. The exception is interference before
@@ -1097,8 +1326,8 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
// while only covering a single block - A phi-def can use undef values from
// predecessors, and the block could be a single-block loop.
// We don't bother doing anything clever about such a case, we simply assume
- // that the interval is continuous from FirstUse to LastUse. We should make
- // sure that we don't do anything illegal to such an interval, though.
+ // that the interval is continuous from FirstInstr to LastInstr. We should
+ // make sure that we don't do anything illegal to such an interval, though.
const SmallVectorImpl<SlotIndex> &Uses = SA->UseSlots;
if (Uses.size() <= 2)
@@ -1120,17 +1349,17 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
//
// Instead we use these rules:
//
- // 1. Allow any split for ranges with getStage() < RS_Local. (Except for the
+ // 1. Allow any split for ranges with getStage() < RS_Split2. (Except for the
// noop split, of course).
- // 2. Require progress be made for ranges with getStage() >= RS_Local. All
+ // 2. Require progress be made for ranges with getStage() == RS_Split2. All
// the new ranges must have fewer instructions than before the split.
- // 3. New ranges with the same number of instructions are marked RS_Local,
+ // 3. New ranges with the same number of instructions are marked RS_Split2,
// smaller ranges are marked RS_New.
//
// These rules allow a 3 -> 2+3 split once, which we need. They also prevent
// excessive splitting and infinite loops.
//
- bool ProgressRequired = getStage(VirtReg) >= RS_Local;
+ bool ProgressRequired = getStage(VirtReg) >= RS_Split2;
// Best split candidate.
unsigned BestBefore = NumGaps;
@@ -1249,7 +1478,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
// If the new range has the same number of instructions as before, mark it as
- // RS_Local so the next split will be forced to make progress. Otherwise,
+ // RS_Split2 so the next split will be forced to make progress. Otherwise,
// leave the new intervals as RS_New so they can compete.
bool LiveBefore = BestBefore != 0 || BI.LiveIn;
bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
@@ -1259,7 +1488,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
assert(!ProgressRequired && "Didn't make progress when it was required.");
for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
if (IntvMap[i] == 1) {
- setStage(*LREdit.get(i), RS_Local);
+ setStage(*LREdit.get(i), RS_Split2);
DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg));
}
DEBUG(dbgs() << '\n');
@@ -1278,6 +1507,10 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
/// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
SmallVectorImpl<LiveInterval*>&NewVRegs) {
+ // Ranges must be Split2 or less.
+ if (getStage(VirtReg) >= RS_Spill)
+ return 0;
+
// Local intervals are handled separately.
if (LIS->intervalIsInOneMBB(VirtReg)) {
NamedRegionTimer T("Local Splitting", TimerGroupName, TimePassesIsEnabled);
@@ -1287,11 +1520,6 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
NamedRegionTimer T("Global Splitting", TimerGroupName, TimePassesIsEnabled);
- // Don't iterate global splitting.
- // Move straight to spilling if this range was produced by a global split.
- if (getStage(VirtReg) >= RS_Global)
- return 0;
-
SA->analyze(&VirtReg);
// FIXME: SplitAnalysis may repair broken live ranges coming from the
@@ -1305,24 +1533,17 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
return PhysReg;
}
- // First try to split around a region spanning multiple blocks.
- unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
- if (PhysReg || !NewVRegs.empty())
- return PhysReg;
-
- // Then isolate blocks with multiple uses.
- SplitAnalysis::BlockPtrSet Blocks;
- if (SA->getMultiUseBlocks(Blocks)) {
- LiveRangeEdit LREdit(VirtReg, NewVRegs, this);
- SE->reset(LREdit);
- SE->splitSingleBlocks(Blocks);
- setStage(NewVRegs.begin(), NewVRegs.end(), RS_Global);
- if (VerifyEnabled)
- MF->verify(this, "After splitting live range around basic blocks");
+ // First try to split around a region spanning multiple blocks. RS_Split2
+ // ranges already made dubious progress with region splitting, so they go
+ // straight to single block splitting.
+ if (getStage(VirtReg) < RS_Split2) {
+ unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
+ if (PhysReg || !NewVRegs.empty())
+ return PhysReg;
}
- // Don't assign any physregs.
- return 0;
+ // Then isolate blocks.
+ return tryBlockSplit(VirtReg, Order, NewVRegs);
}
@@ -1342,9 +1563,9 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
<< " Cascade " << ExtraRegInfo[VirtReg.reg].Cascade << '\n');
// Try to evict a less worthy live range, but only for ranges from the primary
- // queue. The RS_Second ranges already failed to do this, and they should not
+ // queue. The RS_Split ranges already failed to do this, and they should not
// get a second chance until they have been split.
- if (Stage != RS_Second)
+ if (Stage != RS_Split)
if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs))
return PhysReg;
@@ -1353,8 +1574,8 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
// The first time we see a live range, don't try to split or spill.
// Wait until the second time, when all smaller ranges have been allocated.
// This gives a better picture of the interference to split around.
- if (Stage == RS_First) {
- setStage(VirtReg, RS_Second);
+ if (Stage < RS_Split) {
+ setStage(VirtReg, RS_Split);
DEBUG(dbgs() << "wait for second round\n");
NewVRegs.push_back(&VirtReg);
return 0;
@@ -1362,7 +1583,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
// If we couldn't allocate a register from spilling, there is probably some
// invalid inline assembly. The base class wil report it.
- if (Stage >= RS_Spill || !VirtReg.isSpillable())
+ if (Stage >= RS_Done || !VirtReg.isSpillable())
return ~0u;
// Try splitting VirtReg or interferences.
@@ -1374,7 +1595,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
LiveRangeEdit LRE(VirtReg, NewVRegs, this);
spiller().spill(LRE);
- setStage(NewVRegs.begin(), NewVRegs.end(), RS_Spill);
+ setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
if (VerifyEnabled)
MF->verify(this, "After spilling");
@@ -1408,6 +1629,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
ExtraRegInfo.resize(MRI->getNumVirtRegs());
NextCascade = 1;
IntfCache.init(MF, &PhysReg2LiveUnion[0], Indexes, TRI);
+ GlobalCand.resize(32); // This will grow as needed.
allocatePhysRegs();
addMBBLiveIns(MF);
@@ -1420,7 +1642,10 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
}
// Write out new DBG_VALUE instructions.
- DebugVars->emitDebugValues(VRM);
+ {
+ NamedRegionTimer T("Emit Debug Info", TimerGroupName, TimePassesIsEnabled);
+ DebugVars->emitDebugValues(VRM);
+ }
// The pass output is in VirtRegMap. Release all the transient data.
releaseMemory();
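
The braces added around emitDebugValues above are the usual RAII timing idiom: the timer object starts in its constructor and stops when the scope closes. A generic stand-in (not LLVM's NamedRegionTimer) for readers unfamiliar with the pattern:

    #include <chrono>
    #include <cstdio>

    struct ScopedTimer {
      const char *Name;
      std::chrono::steady_clock::time_point Start;
      explicit ScopedTimer(const char *N)
          : Name(N), Start(std::chrono::steady_clock::now()) {}
      ~ScopedTimer() {
        long long Us = std::chrono::duration_cast<std::chrono::microseconds>(
                           std::chrono::steady_clock::now() - Start).count();
        std::printf("%s: %lld us\n", Name, Us);
      }
    };

    int main() {
      {
        ScopedTimer T("Emit Debug Info"); // only this block is measured
        // ... work to be timed ...
      } // ~ScopedTimer prints here
    }
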
diff --git a/lib/CodeGen/RegAllocLinearScan.cpp b/lib/CodeGen/RegAllocLinearScan.cpp
index 0dd3c598..ce3fb90 100644
--- a/lib/CodeGen/RegAllocLinearScan.cpp
+++ b/lib/CodeGen/RegAllocLinearScan.cpp
@@ -18,7 +18,6 @@
#include "VirtRegRewriter.h"
#include "RegisterClassInfo.h"
#include "Spiller.h"
-#include "RegisterCoalescer.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
@@ -209,7 +208,7 @@ namespace {
AU.addRequiredID(StrongPHIEliminationID);
// Make sure PassManager knows which analyses to make available
// to coalescing and which analyses coalescing invalidates.
- AU.addRequiredTransitive<RegisterCoalescer>();
+ AU.addRequiredTransitiveID(RegisterCoalescerPassID);
AU.addRequired<CalculateSpillWeights>();
AU.addRequiredID(LiveStacksID);
AU.addPreservedID(LiveStacksID);
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 72230d4..0d2cf2d 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -450,7 +450,7 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
au.addPreserved<SlotIndexes>();
au.addRequired<LiveIntervals>();
//au.addRequiredID(SplitCriticalEdgesID);
- au.addRequired<RegisterCoalescer>();
+ au.addRequiredID(RegisterCoalescerPassID);
if (customPassID)
au.addRequiredID(*customPassID);
au.addRequired<CalculateSpillWeights>();
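
Both hunks above switch clients from naming the RegisterCoalescer type to depending on an exported pass ID, which is what lets the class itself move into an anonymous namespace (see the RegisterCoalescer.cpp changes below). A minimal sketch of that pattern, with illustrative names:

    #include <cstdio>

    namespace {
    struct HiddenPass {
      static char ID; // the *address* of ID identifies the pass
    };
    char HiddenPass::ID = 0;
    } // end anonymous namespace

    // The only symbol clients ever see:
    char &HiddenPassID = HiddenPass::ID;

    // A client can record the dependency without HiddenPass's definition.
    void addRequiredTransitiveID(char &ID) {
      std::printf("depends on pass @%p\n", (void *)&ID);
    }

    int main() { addRequiredTransitiveID(HiddenPassID); }
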
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 5a77e47..786d279 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -99,11 +99,16 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
// CSR aliases go after the volatile registers, preserve the target's order.
std::copy(CSRAlias.begin(), CSRAlias.end(), &RCI.Order[N]);
+ // Check if RC is a proper sub-class.
+ if (const TargetRegisterClass *Super = TRI->getLargestLegalSuperClass(RC))
+ if (Super != RC && getNumAllocatableRegs(Super) > RCI.NumRegs)
+ RCI.ProperSubClass = true;
+
DEBUG({
dbgs() << "AllocationOrder(" << RC->getName() << ") = [";
for (unsigned I = 0; I != RCI.NumRegs; ++I)
dbgs() << ' ' << PrintReg(RCI.Order[I], TRI);
- dbgs() << " ]\n";
+ dbgs() << (RCI.ProperSubClass ? " ] (sub-class)\n" : " ]\n");
});
// RCI is now up-to-date.
diff --git a/lib/CodeGen/RegisterClassInfo.h b/lib/CodeGen/RegisterClassInfo.h
index d21fd67..2c14070 100644
--- a/lib/CodeGen/RegisterClassInfo.h
+++ b/lib/CodeGen/RegisterClassInfo.h
@@ -28,11 +28,12 @@ class RegisterClassInfo {
struct RCInfo {
unsigned Tag;
unsigned NumRegs;
+ bool ProperSubClass;
OwningArrayPtr<unsigned> Order;
- RCInfo() : Tag(0), NumRegs(0) {}
+ RCInfo() : Tag(0), NumRegs(0), ProperSubClass(false) {}
operator ArrayRef<unsigned>() const {
- return ArrayRef<unsigned>(Order.get(), NumRegs);
+ return makeArrayRef(Order.get(), NumRegs);
}
};
@@ -87,6 +88,16 @@ public:
return get(RC);
}
+ /// isProperSubClass - Returns true if RC has a legal super-class with more
+ /// allocatable registers.
+ ///
+ /// Register classes like GR32_NOSP are not proper sub-classes because %esp
+ /// is not allocatable. Similarly, tGPR is not a proper sub-class in Thumb
+ /// mode because the GPR super-class is not legal.
+ bool isProperSubClass(const TargetRegisterClass *RC) const {
+ return get(RC).ProperSubClass;
+ }
+
/// getLastCalleeSavedAlias - Returns the last callee saved register that
/// overlaps PhysReg, or 0 if Reg doesn't overlap a CSR.
unsigned getLastCalleeSavedAlias(unsigned PhysReg) const {
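
To make the GR32_NOSP example above concrete, here is a toy version of the proper-sub-class test added in compute(): a class is a proper sub-class only when some legal super-class actually allocates more registers. All names and counts here are illustrative, not target data.

    #include <cstdio>

    struct ToyRegClass {
      const char *Name;
      unsigned NumAllocatable;               // reserved regs already excluded
      const ToyRegClass *LargestLegalSuper;  // null if none
    };

    bool isProperSubClass(const ToyRegClass &RC) {
      const ToyRegClass *Super = RC.LargestLegalSuper;
      return Super && Super != &RC && Super->NumAllocatable > RC.NumAllocatable;
    }

    int main() {
      // %esp is reserved, so GR32 allocates the same 7 registers as
      // GR32_NOSP; the sub-class is therefore *not* proper.
      ToyRegClass GR32 = {"GR32", 7, 0};
      GR32.LargestLegalSuper = &GR32;
      ToyRegClass GR32_NOSP = {"GR32_NOSP", 7, &GR32};
      std::printf("%d\n", isProperSubClass(GR32_NOSP)); // 0
    }
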
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index b91f92c..9b414d6 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -15,8 +15,9 @@
#define DEBUG_TYPE "regcoalescing"
#include "RegisterCoalescer.h"
-#include "VirtRegMap.h"
#include "LiveDebugVariables.h"
+#include "RegisterClassInfo.h"
+#include "VirtRegMap.h"
#include "llvm/Pass.h"
#include "llvm/Value.h"
@@ -54,6 +55,7 @@ STATISTIC(numExtends , "Number of copies extended");
STATISTIC(NumReMats , "Number of instructions re-materialized");
STATISTIC(numPeep , "Number of identity moves eliminated after coalescing");
STATISTIC(numAborts , "Number of times interval joining aborted");
+STATISTIC(NumInflated , "Number of register classes inflated");
static cl::opt<bool>
EnableJoining("join-liveintervals",
@@ -75,6 +77,128 @@ VerifyCoalescing("verify-coalescing",
cl::desc("Verify machine instrs before and after register coalescing"),
cl::Hidden);
+namespace {
+ class RegisterCoalescer : public MachineFunctionPass {
+ MachineFunction* MF;
+ MachineRegisterInfo* MRI;
+ const TargetMachine* TM;
+ const TargetRegisterInfo* TRI;
+ const TargetInstrInfo* TII;
+ LiveIntervals *LIS;
+ LiveDebugVariables *LDV;
+ const MachineLoopInfo* Loops;
+ AliasAnalysis *AA;
+ RegisterClassInfo RegClassInfo;
+
+ /// JoinedCopies - Keep track of copies eliminated due to coalescing.
+ ///
+ SmallPtrSet<MachineInstr*, 32> JoinedCopies;
+
+ /// ReMatCopies - Keep track of copies eliminated due to remat.
+ ///
+ SmallPtrSet<MachineInstr*, 32> ReMatCopies;
+
+ /// ReMatDefs - Keep track of definition instructions which have
+ /// been remat'ed.
+ SmallPtrSet<MachineInstr*, 8> ReMatDefs;
+
+ /// joinIntervals - join compatible live intervals
+ void joinIntervals();
+
+ /// CopyCoalesceInMBB - Coalesce copies in the specified MBB, putting
+ /// copies that cannot yet be coalesced into the "TryAgain" list.
+ void CopyCoalesceInMBB(MachineBasicBlock *MBB,
+ std::vector<MachineInstr*> &TryAgain);
+
+ /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
+ /// which are the src/dst of the copy instruction CopyMI. This returns
+ /// true if the copy was successfully coalesced away. If it is not
+ /// currently possible to coalesce this interval, but it may be possible if
+ /// other things get coalesced, then it returns true by reference in
+ /// 'Again'.
+ bool JoinCopy(MachineInstr *TheCopy, bool &Again);
+
+ /// JoinIntervals - Attempt to join these two intervals. On failure, this
+ /// returns false. The output "SrcInt" will not have been modified, so we
+ /// can use this information below to update aliases.
+ bool JoinIntervals(CoalescerPair &CP);
+
+ /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
+ /// the source value number is defined by a copy from the destination reg,
+ /// see if we can merge these two destination reg valno# into a single
+ /// value number, eliminating a copy.
+ bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
+
+ /// HasOtherReachingDefs - Return true if there are definitions of IntB
+ /// other than BValNo val# that can reach uses of AValNo val# of IntA.
+ bool HasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
+ VNInfo *AValNo, VNInfo *BValNo);
+
+ /// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy.
+ /// If the source value number is defined by a commutable instruction and
+ /// its other operand is coalesced to the copy dest register, see if we
+ /// can transform the copy into a noop by commuting the definition.
+ bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
+
+ /// ReMaterializeTrivialDef - If the source of a copy is defined by a
+ /// trivial computation, replace the copy by rematerializing the definition.
+ /// If PreserveSrcInt is true, make sure SrcInt is valid after the call.
+ bool ReMaterializeTrivialDef(LiveInterval &SrcInt, bool PreserveSrcInt,
+ unsigned DstReg, MachineInstr *CopyMI);
+
+ /// shouldJoinPhys - Return true if a physreg copy should be joined.
+ bool shouldJoinPhys(CoalescerPair &CP);
+
+ /// isWinToJoinCrossClass - Return true if it's profitable to coalesce
+ /// two virtual registers from different register classes.
+ bool isWinToJoinCrossClass(unsigned SrcReg,
+ unsigned DstReg,
+ const TargetRegisterClass *SrcRC,
+ const TargetRegisterClass *DstRC,
+ const TargetRegisterClass *NewRC);
+
+ /// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
+ /// update the subregister number if it is not zero. If DstReg is a
+ /// physical register and the existing subregister number of the def / use
+ /// being updated is not zero, make sure to set it to the correct physical
+ /// subregister.
+ void UpdateRegDefsUses(const CoalescerPair &CP);
+
+ /// RemoveDeadDef - If a def of a live interval is now determined dead,
+ /// remove the val# it defines. If the live interval becomes empty, remove
+ /// it as well.
+ bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI);
+
+ /// RemoveCopyFlag - If DstReg is no longer defined by CopyMI, clear the
+ /// VNInfo copy flag for DstReg and all aliases.
+ void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI);
+
+ /// markAsJoined - Remember that CopyMI has already been joined.
+ void markAsJoined(MachineInstr *CopyMI);
+
+ /// eliminateUndefCopy - Handle copies of undef values.
+ bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP);
+
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ RegisterCoalescer() : MachineFunctionPass(ID) {
+ initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ virtual void releaseMemory();
+
+ /// runOnMachineFunction - pass entry point
+ virtual bool runOnMachineFunction(MachineFunction&);
+
+ /// print - Implement the dump method.
+ virtual void print(raw_ostream &O, const Module* = 0) const;
+ };
+} // end anonymous namespace
+
+char &llvm::RegisterCoalescerPassID = RegisterCoalescer::ID;
+
INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing",
"Simple Register Coalescing", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
@@ -116,14 +240,14 @@ static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
}
bool CoalescerPair::setRegisters(const MachineInstr *MI) {
- srcReg_ = dstReg_ = subIdx_ = 0;
- newRC_ = 0;
- flipped_ = crossClass_ = false;
+ SrcReg = DstReg = SubIdx = 0;
+ NewRC = 0;
+ Flipped = CrossClass = false;
unsigned Src, Dst, SrcSub, DstSub;
- if (!isMoveInstr(tri_, MI, Src, Dst, SrcSub, DstSub))
+ if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
return false;
- partial_ = SrcSub || DstSub;
+ Partial = SrcSub || DstSub;
// If one register is a physreg, it must be Dst.
if (TargetRegisterInfo::isPhysicalRegister(Src)) {
@@ -131,7 +255,7 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
return false;
std::swap(Src, Dst);
std::swap(SrcSub, DstSub);
- flipped_ = true;
+ Flipped = true;
}
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
@@ -139,14 +263,14 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
// Eliminate DstSub on a physreg.
if (DstSub) {
- Dst = tri_.getSubReg(Dst, DstSub);
+ Dst = TRI.getSubReg(Dst, DstSub);
if (!Dst) return false;
DstSub = 0;
}
// Eliminate SrcSub by picking a corresponding Dst superregister.
if (SrcSub) {
- Dst = tri_.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
+ Dst = TRI.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
if (!Dst) return false;
SrcSub = 0;
} else if (!MRI.getRegClass(Src)->contains(Dst)) {
@@ -164,7 +288,7 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
return false;
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
- if (!getCommonSubClass(DstRC, SrcRC))
+ if (!TRI.getCommonSubClass(DstRC, SrcRC))
return false;
SrcSub = DstSub = 0;
}
@@ -174,36 +298,36 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
std::swap(Src, Dst);
DstSub = SrcSub;
SrcSub = 0;
- assert(!flipped_ && "Unexpected flip");
- flipped_ = true;
+ assert(!Flipped && "Unexpected flip");
+ Flipped = true;
}
// Find the new register class.
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
if (DstSub)
- newRC_ = tri_.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
else
- newRC_ = getCommonSubClass(DstRC, SrcRC);
- if (!newRC_)
+ NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
+ if (!NewRC)
return false;
- crossClass_ = newRC_ != DstRC || newRC_ != SrcRC;
+ CrossClass = NewRC != DstRC || NewRC != SrcRC;
}
// Check our invariants
assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
"Cannot have a physical SubIdx");
- srcReg_ = Src;
- dstReg_ = Dst;
- subIdx_ = DstSub;
+ SrcReg = Src;
+ DstReg = Dst;
+ SubIdx = DstSub;
return true;
}
bool CoalescerPair::flip() {
- if (subIdx_ || TargetRegisterInfo::isPhysicalRegister(dstReg_))
+ if (SubIdx || TargetRegisterInfo::isPhysicalRegister(DstReg))
return false;
- std::swap(srcReg_, dstReg_);
- flipped_ = !flipped_;
+ std::swap(SrcReg, DstReg);
+ Flipped = !Flipped;
return true;
}
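
A toy model of the normalization that setRegisters performs before flip() can be used: whenever one side of the copy is physical it is moved to Dst and the pair is marked Flipped. This is simplified; the real code also handles sub-register indices and register classes.

    #include <cstdio>

    struct ToyPair { unsigned Src, Dst; bool Flipped; };

    bool isPhysical(unsigned Reg) { return Reg < 256; } // toy numbering

    // Returns false when both sides are physical: such copies are not joined.
    bool setRegistersSketch(ToyPair &P) {
      if (isPhysical(P.Src)) {
        if (isPhysical(P.Dst))
          return false;
        unsigned Tmp = P.Src; P.Src = P.Dst; P.Dst = Tmp;
        P.Flipped = true; // remember the swap for later queries
      }
      return true;
    }

    int main() {
      ToyPair P = {42, 1000, false}; // physical source, virtual destination
      setRegistersSketch(P);
      std::printf("Src=%u Dst=%u Flipped=%d\n", P.Src, P.Dst, P.Flipped);
    }
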
@@ -211,36 +335,36 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
if (!MI)
return false;
unsigned Src, Dst, SrcSub, DstSub;
- if (!isMoveInstr(tri_, MI, Src, Dst, SrcSub, DstSub))
+ if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
return false;
- // Find the virtual register that is srcReg_.
- if (Dst == srcReg_) {
+ // Find the virtual register that is SrcReg.
+ if (Dst == SrcReg) {
std::swap(Src, Dst);
std::swap(SrcSub, DstSub);
- } else if (Src != srcReg_) {
+ } else if (Src != SrcReg) {
return false;
}
- // Now check that Dst matches dstReg_.
- if (TargetRegisterInfo::isPhysicalRegister(dstReg_)) {
+ // Now check that Dst matches DstReg.
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
if (!TargetRegisterInfo::isPhysicalRegister(Dst))
return false;
- assert(!subIdx_ && "Inconsistent CoalescerPair state.");
+ assert(!SubIdx && "Inconsistent CoalescerPair state.");
// DstSub could be set for a physreg from INSERT_SUBREG.
if (DstSub)
- Dst = tri_.getSubReg(Dst, DstSub);
+ Dst = TRI.getSubReg(Dst, DstSub);
// Full copy of Src.
if (!SrcSub)
- return dstReg_ == Dst;
+ return DstReg == Dst;
// This is a partial register copy. Check that the parts match.
- return tri_.getSubReg(dstReg_, SrcSub) == Dst;
+ return TRI.getSubReg(DstReg, SrcSub) == Dst;
} else {
- // dstReg_ is virtual.
- if (dstReg_ != Dst)
+ // DstReg is virtual.
+ if (DstReg != Dst)
return false;
// Registers match, do the subregisters line up?
- return compose(tri_, subIdx_, SrcSub) == DstSub;
+ return compose(TRI, SubIdx, SrcSub) == DstSub;
}
}
@@ -292,14 +416,14 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
MachineInstr *CopyMI) {
// Bail if there is no dst interval - can happen when merging physical subreg
// operations.
- if (!li_->hasInterval(CP.getDstReg()))
+ if (!LIS->hasInterval(CP.getDstReg()))
return false;
LiveInterval &IntA =
- li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
LiveInterval &IntB =
- li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
- SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI).getDefIndex();
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+ SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getDefIndex();
// BValNo is a value number in B that is defined by a copy from A. 'B3' in
// the example above.
@@ -355,7 +479,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
// Make sure that the end of the live range is inside the same block as
// CopyMI.
MachineInstr *ValLREndInst =
- li_->getInstructionFromIndex(ValLR->end.getPrevSlot());
+ LIS->getInstructionFromIndex(ValLR->end.getPrevSlot());
if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent())
return false;
@@ -368,11 +492,11 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
// of its aliases is overlapping the live interval of the virtual register.
// If so, do not coalesce.
if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) {
- for (const unsigned *AS = tri_->getAliasSet(IntB.reg); *AS; ++AS)
- if (li_->hasInterval(*AS) && IntA.overlaps(li_->getInterval(*AS))) {
+ for (const unsigned *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS)
+ if (LIS->hasInterval(*AS) && IntA.overlaps(LIS->getInterval(*AS))) {
DEBUG({
dbgs() << "\t\tInterfere with alias ";
- li_->getInterval(*AS).print(dbgs(), tri_);
+ LIS->getInterval(*AS).print(dbgs(), TRI);
});
return false;
}
@@ -380,7 +504,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
DEBUG({
dbgs() << "Extending: ";
- IntB.print(dbgs(), tri_);
+ IntB.print(dbgs(), TRI);
});
SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start;
@@ -398,13 +522,13 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
// If the IntB live range is assigned to a physical register, and if that
// physreg has sub-registers, update their live intervals as well.
if (TargetRegisterInfo::isPhysicalRegister(IntB.reg)) {
- for (const unsigned *SR = tri_->getSubRegisters(IntB.reg); *SR; ++SR) {
- if (!li_->hasInterval(*SR))
+ for (const unsigned *SR = TRI->getSubRegisters(IntB.reg); *SR; ++SR) {
+ if (!LIS->hasInterval(*SR))
continue;
- LiveInterval &SRLI = li_->getInterval(*SR);
+ LiveInterval &SRLI = LIS->getInterval(*SR);
SRLI.addRange(LiveRange(FillerStart, FillerEnd,
SRLI.getNextValue(FillerStart, 0,
- li_->getVNInfoAllocator())));
+ LIS->getVNInfoAllocator())));
}
}
@@ -419,7 +543,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
}
DEBUG({
dbgs() << " result = ";
- IntB.print(dbgs(), tri_);
+ IntB.print(dbgs(), TRI);
dbgs() << "\n";
});
@@ -434,7 +558,7 @@ bool RegisterCoalescer::AdjustCopiesBackFrom(const CoalescerPair &CP,
// merge, find the last use and trim the live range. That will also add the
// isKill marker.
if (ALR->end == CopyIdx)
- li_->shrinkToUses(&IntA);
+ LIS->shrinkToUses(&IntA);
++numExtends;
return true;
@@ -498,15 +622,15 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
return false;
// Bail if there is no dst interval.
- if (!li_->hasInterval(CP.getDstReg()))
+ if (!LIS->hasInterval(CP.getDstReg()))
return false;
- SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI).getDefIndex();
+ SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getDefIndex();
LiveInterval &IntA =
- li_->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
LiveInterval &IntB =
- li_->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
// BValNo is a value number in B that is defined by a copy from A. 'B3' in
// the example above.
@@ -524,7 +648,7 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
// the optimization.
if (AValNo->isPHIDef() || AValNo->isUnused() || AValNo->hasPHIKill())
return false;
- MachineInstr *DefMI = li_->getInstructionFromIndex(AValNo->def);
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
if (!DefMI)
return false;
const MCInstrDesc &MCID = DefMI->getDesc();
@@ -538,7 +662,7 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
return false;
unsigned Op1, Op2, NewDstIdx;
- if (!tii_->findCommutedOpIndices(DefMI, Op1, Op2))
+ if (!TII->findCommutedOpIndices(DefMI, Op1, Op2))
return false;
if (Op1 == UseOpIdx)
NewDstIdx = Op2;
@@ -560,18 +684,18 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
// Abort if the aliases of IntB.reg have values that are not simply the
// clobbers from the superreg.
if (TargetRegisterInfo::isPhysicalRegister(IntB.reg))
- for (const unsigned *AS = tri_->getAliasSet(IntB.reg); *AS; ++AS)
- if (li_->hasInterval(*AS) &&
- HasOtherReachingDefs(IntA, li_->getInterval(*AS), AValNo, 0))
+ for (const unsigned *AS = TRI->getAliasSet(IntB.reg); *AS; ++AS)
+ if (LIS->hasInterval(*AS) &&
+ HasOtherReachingDefs(IntA, LIS->getInterval(*AS), AValNo, 0))
return false;
// If some of the uses of IntA.reg are already coalesced away, return false.
// It's not possible to determine whether it's safe to perform the coalescing.
- for (MachineRegisterInfo::use_nodbg_iterator UI =
- mri_->use_nodbg_begin(IntA.reg),
- UE = mri_->use_nodbg_end(); UI != UE; ++UI) {
+ for (MachineRegisterInfo::use_nodbg_iterator UI =
+ MRI->use_nodbg_begin(IntA.reg),
+ UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
MachineInstr *UseMI = &*UI;
- SlotIndex UseIdx = li_->getInstructionIndex(UseMI);
+ SlotIndex UseIdx = LIS->getInstructionIndex(UseMI);
LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
if (ULR == IntA.end())
continue;
@@ -585,15 +709,15 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
// At this point we have decided that it is legal to do this
// transformation. Start by commuting the instruction.
MachineBasicBlock *MBB = DefMI->getParent();
- MachineInstr *NewMI = tii_->commuteInstruction(DefMI);
+ MachineInstr *NewMI = TII->commuteInstruction(DefMI);
if (!NewMI)
return false;
if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
- !mri_->constrainRegClass(IntB.reg, mri_->getRegClass(IntA.reg)))
+ !MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
return false;
if (NewMI != DefMI) {
- li_->ReplaceMachineInstrInMaps(DefMI, NewMI);
+ LIS->ReplaceMachineInstrInMaps(DefMI, NewMI);
MBB->insert(DefMI, NewMI);
MBB->erase(DefMI);
}
@@ -610,8 +734,8 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
// = B
// Update uses of IntA of the specific Val# with IntB.
- for (MachineRegisterInfo::use_iterator UI = mri_->use_begin(IntA.reg),
- UE = mri_->use_end(); UI != UE;) {
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg),
+ UE = MRI->use_end(); UI != UE;) {
MachineOperand &UseMO = UI.getOperand();
MachineInstr *UseMI = &*UI;
++UI;
@@ -623,12 +747,12 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
UseMO.setReg(NewReg);
continue;
}
- SlotIndex UseIdx = li_->getInstructionIndex(UseMI).getUseIndex();
+ SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getUseIndex();
LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
if (ULR == IntA.end() || ULR->valno != AValNo)
continue;
if (TargetRegisterInfo::isPhysicalRegister(NewReg))
- UseMO.substPhysReg(NewReg, *tri_);
+ UseMO.substPhysReg(NewReg, *TRI);
else
UseMO.setReg(NewReg);
if (UseMI == CopyMI)
@@ -674,27 +798,24 @@ bool RegisterCoalescer::RemoveCopyByCommutingDef(const CoalescerPair &CP,
bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt,
bool preserveSrcInt,
unsigned DstReg,
- unsigned DstSubIdx,
MachineInstr *CopyMI) {
- SlotIndex CopyIdx = li_->getInstructionIndex(CopyMI).getUseIndex();
+ SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getUseIndex();
LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx);
assert(SrcLR != SrcInt.end() && "Live range not found!");
VNInfo *ValNo = SrcLR->valno;
- // If other defs can reach uses of this def, then it's not safe to perform
- // the optimization.
- if (ValNo->isPHIDef() || ValNo->isUnused() || ValNo->hasPHIKill())
+ if (ValNo->isPHIDef() || ValNo->isUnused())
return false;
- MachineInstr *DefMI = li_->getInstructionFromIndex(ValNo->def);
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(ValNo->def);
if (!DefMI)
return false;
assert(DefMI && "Defining instruction disappeared");
const MCInstrDesc &MCID = DefMI->getDesc();
if (!MCID.isAsCheapAsAMove())
return false;
- if (!tii_->isTriviallyReMaterializable(DefMI, AA))
+ if (!TII->isTriviallyReMaterializable(DefMI, AA))
return false;
bool SawStore = false;
- if (!DefMI->isSafeToMove(tii_, AA, SawStore))
+ if (!DefMI->isSafeToMove(TII, AA, SawStore))
return false;
if (MCID.getNumDefs() != 1)
return false;
@@ -702,36 +823,20 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt,
// Make sure the copy destination register class fits the instruction
// definition register class. The mismatch can happen as a result of earlier
// extract_subreg, insert_subreg, subreg_to_reg coalescing.
- const TargetRegisterClass *RC = tii_->getRegClass(MCID, 0, tri_);
+ const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI);
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
- if (mri_->getRegClass(DstReg) != RC)
+ if (MRI->getRegClass(DstReg) != RC)
return false;
} else if (!RC->contains(DstReg))
return false;
}
- // If destination register has a sub-register index on it, make sure it
- // matches the instruction register class.
- if (DstSubIdx) {
- const MCInstrDesc &MCID = DefMI->getDesc();
- if (MCID.getNumDefs() != 1)
- return false;
- const TargetRegisterClass *DstRC = mri_->getRegClass(DstReg);
- const TargetRegisterClass *DstSubRC =
- DstRC->getSubRegisterRegClass(DstSubIdx);
- const TargetRegisterClass *DefRC = tii_->getRegClass(MCID, 0, tri_);
- if (DefRC == DstRC)
- DstSubIdx = 0;
- else if (DefRC != DstSubRC)
- return false;
- }
-
RemoveCopyFlag(DstReg, CopyMI);
MachineBasicBlock *MBB = CopyMI->getParent();
MachineBasicBlock::iterator MII =
llvm::next(MachineBasicBlock::iterator(CopyMI));
- tii_->reMaterialize(*MBB, MII, DstReg, DstSubIdx, DefMI, *tri_);
+ TII->reMaterialize(*MBB, MII, DstReg, 0, DefMI, *TRI);
MachineInstr *NewMI = prior(MII);
// CopyMI may have implicit operands, transfer them over to the newly
@@ -746,7 +851,7 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt,
}
NewMI->copyImplicitOps(CopyMI);
- li_->ReplaceMachineInstrInMaps(CopyMI, NewMI);
+ LIS->ReplaceMachineInstrInMaps(CopyMI, NewMI);
CopyMI->eraseFromParent();
ReMatCopies.insert(CopyMI);
ReMatDefs.insert(DefMI);
@@ -755,8 +860,51 @@ bool RegisterCoalescer::ReMaterializeTrivialDef(LiveInterval &SrcInt,
// The source interval can become smaller because we removed a use.
if (preserveSrcInt)
- li_->shrinkToUses(&SrcInt);
+ LIS->shrinkToUses(&SrcInt);
+
+ return true;
+}
+
+/// eliminateUndefCopy - ProcessImplicitDefs may leave some copies of <undef>
+/// values; it only removes local variables. When we have a copy like:
+///
+/// %vreg1 = COPY %vreg2<undef>
+///
+/// We delete the copy and remove the corresponding value number from %vreg1.
+/// Any uses of that value number are marked as <undef>.
+bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI,
+ const CoalescerPair &CP) {
+ SlotIndex Idx = LIS->getInstructionIndex(CopyMI);
+ LiveInterval *SrcInt = &LIS->getInterval(CP.getSrcReg());
+ if (SrcInt->liveAt(Idx))
+ return false;
+ LiveInterval *DstInt = &LIS->getInterval(CP.getDstReg());
+ if (DstInt->liveAt(Idx))
+ return false;
+ // No intervals are live-in to CopyMI - it is undef.
+ if (CP.isFlipped())
+ DstInt = SrcInt;
+ SrcInt = 0;
+
+ VNInfo *DeadVNI = DstInt->getVNInfoAt(Idx.getDefIndex());
+ assert(DeadVNI && "No value defined in DstInt");
+ DstInt->removeValNo(DeadVNI);
+
+ // Find new undef uses.
+ for (MachineRegisterInfo::reg_nodbg_iterator
+ I = MRI->reg_nodbg_begin(DstInt->reg), E = MRI->reg_nodbg_end();
+ I != E; ++I) {
+ MachineOperand &MO = I.getOperand();
+ if (MO.isDef() || MO.isUndef())
+ continue;
+ MachineInstr *MI = MO.getParent();
+ SlotIndex Idx = LIS->getInstructionIndex(MI);
+ if (DstInt->liveAt(Idx))
+ continue;
+ MO.setIsUndef(true);
+ DEBUG(dbgs() << "\tnew undef: " << Idx << '\t' << *MI);
+ }
return true;
}
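
The guard at the top of eliminateUndefCopy reduces to a simple liveness test. A toy model of the "nothing is live-in, so the copy reads undef" decision, with intervals simplified to single half-open ranges (which a real LiveInterval is not):

    #include <cstdio>

    struct ToyInterval { unsigned Begin, End; };  // half-open [Begin, End)

    bool liveAt(const ToyInterval &I, unsigned Slot) {
      return Slot >= I.Begin && Slot < I.End;
    }

    // The copy reads an <undef> value exactly when neither interval is live
    // at the copy's slot.
    bool copyReadsUndef(const ToyInterval &Src, const ToyInterval &Dst,
                        unsigned CopySlot) {
      return !liveAt(Src, CopySlot) && !liveAt(Dst, CopySlot);
    }

    int main() {
      ToyInterval Src = {10, 20}, Dst = {30, 40};
      std::printf("%d\n", copyReadsUndef(Src, Dst, 25)); // 1: undef copy
    }
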
@@ -773,22 +921,20 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) {
unsigned SubIdx = CP.getSubIdx();
// Update LiveDebugVariables.
- ldv_->renameRegister(SrcReg, DstReg, SubIdx);
+ LDV->renameRegister(SrcReg, DstReg, SubIdx);
- for (MachineRegisterInfo::reg_iterator I = mri_->reg_begin(SrcReg);
+ for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg);
MachineInstr *UseMI = I.skipInstruction();) {
// A PhysReg copy that won't be coalesced can perhaps be rematerialized
// instead.
if (DstIsPhys) {
- if (UseMI->isCopy() &&
- !UseMI->getOperand(1).getSubReg() &&
- !UseMI->getOperand(0).getSubReg() &&
+ if (UseMI->isFullCopy() &&
UseMI->getOperand(1).getReg() == SrcReg &&
UseMI->getOperand(0).getReg() != SrcReg &&
UseMI->getOperand(0).getReg() != DstReg &&
!JoinedCopies.count(UseMI) &&
- ReMaterializeTrivialDef(li_->getInterval(SrcReg), false,
- UseMI->getOperand(0).getReg(), 0, UseMI))
+ ReMaterializeTrivialDef(LIS->getInterval(SrcReg), false,
+ UseMI->getOperand(0).getReg(), UseMI))
continue;
}
@@ -803,10 +949,18 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) {
Kills |= MO.isKill();
Deads |= MO.isDead();
+ // Make sure we don't create read-modify-write defs accidentally. We
+ // assume here that a SrcReg def cannot be joined into a live DstReg. If
+ // RegisterCoalescer starts tracking partially live registers, we will
+ // need to check the actual LiveInterval to determine if DstReg is live
+ // here.
+ if (SubIdx && !Reads)
+ MO.setIsUndef();
+
if (DstIsPhys)
- MO.substPhysReg(DstReg, *tri_);
+ MO.substPhysReg(DstReg, *TRI);
else
- MO.substVirtReg(DstReg, SubIdx, *tri_);
+ MO.substVirtReg(DstReg, SubIdx, *TRI);
}
// This instruction is a copy that will be removed.
@@ -817,19 +971,19 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) {
// If UseMI was a simple SrcReg def, make sure we didn't turn it into a
// read-modify-write of DstReg.
if (Deads)
- UseMI->addRegisterDead(DstReg, tri_);
+ UseMI->addRegisterDead(DstReg, TRI);
else if (!Reads && Writes)
- UseMI->addRegisterDefined(DstReg, tri_);
+ UseMI->addRegisterDefined(DstReg, TRI);
// Kill flags apply to the whole physical register.
if (DstIsPhys && Kills)
- UseMI->addRegisterKilled(DstReg, tri_);
+ UseMI->addRegisterKilled(DstReg, TRI);
}
DEBUG({
dbgs() << "\t\tupdated: ";
if (!UseMI->isDebugValue())
- dbgs() << li_->getInstructionIndex(UseMI) << "\t";
+ dbgs() << LIS->getInstructionIndex(UseMI) << "\t";
dbgs() << *UseMI;
});
}
@@ -838,18 +992,18 @@ RegisterCoalescer::UpdateRegDefsUses(const CoalescerPair &CP) {
/// removeIntervalIfEmpty - Check if the live interval of a physical register
/// is empty; if so, remove it and also remove the empty intervals of its
/// sub-registers. Return true if the live interval is removed.
-static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *li_,
- const TargetRegisterInfo *tri_) {
+static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *LIS,
+ const TargetRegisterInfo *TRI) {
if (li.empty()) {
if (TargetRegisterInfo::isPhysicalRegister(li.reg))
- for (const unsigned* SR = tri_->getSubRegisters(li.reg); *SR; ++SR) {
- if (!li_->hasInterval(*SR))
+ for (const unsigned* SR = TRI->getSubRegisters(li.reg); *SR; ++SR) {
+ if (!LIS->hasInterval(*SR))
continue;
- LiveInterval &sli = li_->getInterval(*SR);
+ LiveInterval &sli = LIS->getInterval(*SR);
if (sli.empty())
- li_->removeInterval(*SR);
+ LIS->removeInterval(*SR);
}
- li_->removeInterval(li.reg);
+ LIS->removeInterval(li.reg);
return true;
}
return false;
@@ -859,29 +1013,29 @@ static bool removeIntervalIfEmpty(LiveInterval &li, LiveIntervals *li_,
/// the val# it defines. If the live interval becomes empty, remove it as well.
bool RegisterCoalescer::RemoveDeadDef(LiveInterval &li,
MachineInstr *DefMI) {
- SlotIndex DefIdx = li_->getInstructionIndex(DefMI).getDefIndex();
+ SlotIndex DefIdx = LIS->getInstructionIndex(DefMI).getDefIndex();
LiveInterval::iterator MLR = li.FindLiveRangeContaining(DefIdx);
if (DefIdx != MLR->valno->def)
return false;
li.removeValNo(MLR->valno);
- return removeIntervalIfEmpty(li, li_, tri_);
+ return removeIntervalIfEmpty(li, LIS, TRI);
}
void RegisterCoalescer::RemoveCopyFlag(unsigned DstReg,
const MachineInstr *CopyMI) {
- SlotIndex DefIdx = li_->getInstructionIndex(CopyMI).getDefIndex();
- if (li_->hasInterval(DstReg)) {
- LiveInterval &LI = li_->getInterval(DstReg);
+ SlotIndex DefIdx = LIS->getInstructionIndex(CopyMI).getDefIndex();
+ if (LIS->hasInterval(DstReg)) {
+ LiveInterval &LI = LIS->getInterval(DstReg);
if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx))
if (LR->valno->def == DefIdx)
LR->valno->setCopy(0);
}
if (!TargetRegisterInfo::isPhysicalRegister(DstReg))
return;
- for (const unsigned* AS = tri_->getAliasSet(DstReg); *AS; ++AS) {
- if (!li_->hasInterval(*AS))
+ for (const unsigned* AS = TRI->getAliasSet(DstReg); *AS; ++AS) {
+ if (!LIS->hasInterval(*AS))
continue;
- LiveInterval &LI = li_->getInterval(*AS);
+ LiveInterval &LI = LIS->getInterval(*AS);
if (const LiveRange *LR = LI.getLiveRangeContaining(DefIdx))
if (LR->valno->def == DefIdx)
LR->valno->setCopy(0);
@@ -894,8 +1048,8 @@ void RegisterCoalescer::RemoveCopyFlag(unsigned DstReg,
/// are not spillable! If the destination interval uses are far away, think
/// twice about coalescing them!
bool RegisterCoalescer::shouldJoinPhys(CoalescerPair &CP) {
- bool Allocatable = li_->isAllocatable(CP.getDstReg());
- LiveInterval &JoinVInt = li_->getInterval(CP.getSrcReg());
+ bool Allocatable = LIS->isAllocatable(CP.getDstReg());
+ LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
/// Always join simple intervals that are defined by a single copy from a
/// reserved register. This doesn't increase register pressure, so it is
@@ -918,8 +1072,8 @@ bool RegisterCoalescer::shouldJoinPhys(CoalescerPair &CP) {
// Don't join with physregs that have a ridiculous number of live
// ranges. The data structure performance is really bad when that
// happens.
- if (li_->hasInterval(CP.getDstReg()) &&
- li_->getInterval(CP.getDstReg()).ranges.size() > 1000) {
+ if (LIS->hasInterval(CP.getDstReg()) &&
+ LIS->getInterval(CP.getDstReg()).ranges.size() > 1000) {
++numAborts;
DEBUG(dbgs()
<< "\tPhysical register live interval too complicated, abort!\n");
@@ -929,9 +1083,9 @@ bool RegisterCoalescer::shouldJoinPhys(CoalescerPair &CP) {
// FIXME: Why are we skipping this test for partial copies?
// CodeGen/X86/phys_subreg_coalesce-3.ll needs it.
if (!CP.isPartial()) {
- const TargetRegisterClass *RC = mri_->getRegClass(CP.getSrcReg());
+ const TargetRegisterClass *RC = MRI->getRegClass(CP.getSrcReg());
unsigned Threshold = RegClassInfo.getNumAllocatableRegs(RC) * 2;
- unsigned Length = li_->getApproximateInstructionCount(JoinVInt);
+ unsigned Length = LIS->getApproximateInstructionCount(JoinVInt);
if (Length > Threshold) {
++numAborts;
DEBUG(dbgs() << "\tMay tie down a physical register, abort!\n");
@@ -957,12 +1111,12 @@ RegisterCoalescer::isWinToJoinCrossClass(unsigned SrcReg,
// Early exit if the function is fairly small, coalesce aggressively if
// that's the case. For really special register classes with 3 or
// fewer registers, be a bit more careful.
- (li_->getFuncInstructionCount() / NewRCCount) < 8)
+ (LIS->getFuncInstructionCount() / NewRCCount) < 8)
return true;
- LiveInterval &SrcInt = li_->getInterval(SrcReg);
- LiveInterval &DstInt = li_->getInterval(DstReg);
- unsigned SrcSize = li_->getApproximateInstructionCount(SrcInt);
- unsigned DstSize = li_->getApproximateInstructionCount(DstInt);
+ LiveInterval &SrcInt = LIS->getInterval(SrcReg);
+ LiveInterval &DstInt = LIS->getInterval(DstReg);
+ unsigned SrcSize = LIS->getApproximateInstructionCount(SrcInt);
+ unsigned DstSize = LIS->getApproximateInstructionCount(DstInt);
// Coalesce aggressively if the intervals are small compared to the number of
// registers in the new class. The number 4 is fairly arbitrary, chosen to be
@@ -972,10 +1126,10 @@ RegisterCoalescer::isWinToJoinCrossClass(unsigned SrcReg,
return true;
// Estimate *register use density*. If it doubles or more, abort.
- unsigned SrcUses = std::distance(mri_->use_nodbg_begin(SrcReg),
- mri_->use_nodbg_end());
- unsigned DstUses = std::distance(mri_->use_nodbg_begin(DstReg),
- mri_->use_nodbg_end());
+ unsigned SrcUses = std::distance(MRI->use_nodbg_begin(SrcReg),
+ MRI->use_nodbg_end());
+ unsigned DstUses = std::distance(MRI->use_nodbg_begin(DstReg),
+ MRI->use_nodbg_end());
unsigned NewUses = SrcUses + DstUses;
unsigned NewSize = SrcSize + DstSize;
if (SrcRC != NewRC && SrcSize > ThresSize) {
@@ -1003,9 +1157,9 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
if (JoinedCopies.count(CopyMI) || ReMatCopies.count(CopyMI))
return false; // Already done.
- DEBUG(dbgs() << li_->getInstructionIndex(CopyMI) << '\t' << *CopyMI);
+ DEBUG(dbgs() << LIS->getInstructionIndex(CopyMI) << '\t' << *CopyMI);
- CoalescerPair CP(*tii_, *tri_);
+ CoalescerPair CP(*TII, *TRI);
if (!CP.setRegisters(CopyMI)) {
DEBUG(dbgs() << "\tNot coalescable.\n");
return false;
@@ -1018,8 +1172,15 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
return false; // Not coalescable.
}
- DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), tri_)
- << " with " << PrintReg(CP.getDstReg(), tri_, CP.getSubIdx())
+ // Eliminate undefs.
+ if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) {
+ markAsJoined(CopyMI);
+ DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n");
+ return false; // Not coalescable.
+ }
+
+ DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI)
+ << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSubIdx())
<< "\n");
// Enforce policies.
@@ -1028,8 +1189,8 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
// Before giving up coalescing, if the definition of the source is a
// trivial computation, try rematerializing it.
if (!CP.isFlipped() &&
- ReMaterializeTrivialDef(li_->getInterval(CP.getSrcReg()), true,
- CP.getDstReg(), 0, CopyMI))
+ ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true,
+ CP.getDstReg(), CopyMI))
return true;
return false;
}
@@ -1042,8 +1203,8 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
return false;
}
if (!isWinToJoinCrossClass(CP.getSrcReg(), CP.getDstReg(),
- mri_->getRegClass(CP.getSrcReg()),
- mri_->getRegClass(CP.getDstReg()),
+ MRI->getRegClass(CP.getSrcReg()),
+ MRI->getRegClass(CP.getDstReg()),
CP.getNewRC())) {
DEBUG(dbgs() << "\tAvoid coalescing to constrained register class.\n");
Again = true; // May be possible to coalesce later.
@@ -1052,8 +1213,8 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
}
// When possible, let DstReg be the larger interval.
- if (!CP.getSubIdx() && li_->getInterval(CP.getSrcReg()).ranges.size() >
- li_->getInterval(CP.getDstReg()).ranges.size())
+ if (!CP.getSubIdx() && LIS->getInterval(CP.getSrcReg()).ranges.size() >
+ LIS->getInterval(CP.getDstReg()).ranges.size())
CP.flip();
}
@@ -1067,8 +1228,8 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
// If the definition of the source is a trivial computation, try
// rematerializing it.
if (!CP.isFlipped() &&
- ReMaterializeTrivialDef(li_->getInterval(CP.getSrcReg()), true,
- CP.getDstReg(), 0, CopyMI))
+ ReMaterializeTrivialDef(LIS->getInterval(CP.getSrcReg()), true,
+ CP.getDstReg(), CopyMI))
return true;
// If we can eliminate the copy without merging the live ranges, do so now.
@@ -1091,7 +1252,7 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
// other. Make sure the resulting register is set to the right register class.
if (CP.isCrossClass()) {
++numCrossRCs;
- mri_->setRegClass(CP.getDstReg(), CP.getNewRC());
+ MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
}
// Remember to delete the copy instruction.
@@ -1105,10 +1266,10 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
SmallVector<MachineBasicBlock*, 16> BlockSeq;
// JoinIntervals invalidates the VNInfos in SrcInt, but we only need the
// ranges for this, and they are preserved.
- LiveInterval &SrcInt = li_->getInterval(CP.getSrcReg());
+ LiveInterval &SrcInt = LIS->getInterval(CP.getSrcReg());
for (LiveInterval::const_iterator I = SrcInt.begin(), E = SrcInt.end();
I != E; ++I ) {
- li_->findLiveInMBBs(I->start, I->end, BlockSeq);
+ LIS->findLiveInMBBs(I->start, I->end, BlockSeq);
for (unsigned idx = 0, size = BlockSeq.size(); idx != size; ++idx) {
MachineBasicBlock &block = *BlockSeq[idx];
if (!block.isLiveIn(CP.getDstReg()))
@@ -1120,15 +1281,15 @@ bool RegisterCoalescer::JoinCopy(MachineInstr *CopyMI, bool &Again) {
// SrcReg is guaranteed to be the register whose live interval is
// being merged.
- li_->removeInterval(CP.getSrcReg());
+ LIS->removeInterval(CP.getSrcReg());
// Update regalloc hint.
- tri_->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *mf_);
+ TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
DEBUG({
- LiveInterval &DstInt = li_->getInterval(CP.getDstReg());
+ LiveInterval &DstInt = LIS->getInterval(CP.getDstReg());
dbgs() << "\tJoined. Result = ";
- DstInt.print(dbgs(), tri_);
+ DstInt.print(dbgs(), TRI);
dbgs() << "\n";
});
@@ -1197,6 +1358,7 @@ static unsigned ComputeUltimateVN(VNInfo *VNI,
// which allows us to coalesce A and B.
// VNI is the definition of B. LR is the life range of A that includes
// the slot just before B. If we return true, we add "B = X" to DupCopies.
+// This implies that A dominates B.
static bool RegistersDefinedFromSameValue(LiveIntervals &li,
const TargetRegisterInfo &tri,
CoalescerPair &CP,
@@ -1248,7 +1410,9 @@ static bool RegistersDefinedFromSameValue(LiveIntervals &li,
// If the copies use two different value numbers of X, we cannot merge
// A and B.
LiveInterval &SrcInt = li.getInterval(Src);
- if (SrcInt.getVNInfoAt(Other->def) != SrcInt.getVNInfoAt(VNI->def))
+ // getVNInfoBefore returns NULL for undef copies. In this case, the
+ // optimization is still safe.
+ if (SrcInt.getVNInfoBefore(Other->def) != SrcInt.getVNInfoBefore(VNI->def))
return false;
DupCopies.push_back(MI);
@@ -1259,18 +1423,18 @@ static bool RegistersDefinedFromSameValue(LiveIntervals &li,
/// JoinIntervals - Attempt to join these two intervals. On failure, this
/// returns false.
bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
- LiveInterval &RHS = li_->getInterval(CP.getSrcReg());
- DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), tri_); dbgs() << "\n"; });
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ DEBUG({ dbgs() << "\t\tRHS = "; RHS.print(dbgs(), TRI); dbgs() << "\n"; });
// If a live interval is a physical register, check for interference with any
// aliases. The interference check implemented here is a bit more conservative
// than the full interference check below. We allow overlapping live ranges
// only when one is a copy of the other.
if (CP.isPhys()) {
- for (const unsigned *AS = tri_->getAliasSet(CP.getDstReg()); *AS; ++AS){
- if (!li_->hasInterval(*AS))
+ for (const unsigned *AS = TRI->getAliasSet(CP.getDstReg()); *AS; ++AS){
+ if (!LIS->hasInterval(*AS))
continue;
- const LiveInterval &LHS = li_->getInterval(*AS);
+ const LiveInterval &LHS = LIS->getInterval(*AS);
LiveInterval::const_iterator LI = LHS.begin();
for (LiveInterval::const_iterator RI = RHS.begin(), RE = RHS.end();
RI != RE; ++RI) {
@@ -1278,10 +1442,10 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
// Does LHS have an overlapping live range starting before RI?
if ((LI != LHS.begin() && LI[-1].end > RI->start) &&
(RI->start != RI->valno->def ||
- !CP.isCoalescable(li_->getInstructionFromIndex(RI->start)))) {
+ !CP.isCoalescable(LIS->getInstructionFromIndex(RI->start)))) {
DEBUG({
dbgs() << "\t\tInterference from alias: ";
- LHS.print(dbgs(), tri_);
+ LHS.print(dbgs(), TRI);
dbgs() << "\n\t\tOverlap at " << RI->start << " and no copy.\n";
});
return false;
@@ -1290,10 +1454,10 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
// Check that LHS ranges beginning in this range are copies.
for (; LI != LHS.end() && LI->start < RI->end; ++LI) {
if (LI->start != LI->valno->def ||
- !CP.isCoalescable(li_->getInstructionFromIndex(LI->start))) {
+ !CP.isCoalescable(LIS->getInstructionFromIndex(LI->start))) {
DEBUG({
dbgs() << "\t\tInterference from alias: ";
- LHS.print(dbgs(), tri_);
+ LHS.print(dbgs(), TRI);
dbgs() << "\n\t\tDef at " << LI->start << " is not a copy.\n";
});
return false;
@@ -1313,8 +1477,8 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
SmallVector<MachineInstr*, 8> DupCopies;
- LiveInterval &LHS = li_->getOrCreateInterval(CP.getDstReg());
- DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), tri_); dbgs() << "\n"; });
+ LiveInterval &LHS = LIS->getOrCreateInterval(CP.getDstReg());
+ DEBUG({ dbgs() << "\t\tLHS = "; LHS.print(dbgs(), TRI); dbgs() << "\n"; });
// Loop over the value numbers of the LHS, seeing if any are defined from
// the RHS.
@@ -1337,7 +1501,7 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
// from the RHS interval, we can use its value #.
MachineInstr *MI = VNI->getCopy();
if (!CP.isCoalescable(MI) &&
- !RegistersDefinedFromSameValue(*li_, *tri_, CP, VNI, lr, DupCopies))
+ !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies))
continue;
LHSValsDefinedFromRHS[VNI] = lr->valno;
@@ -1364,7 +1528,7 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
// from the LHS interval, we can use its value #.
MachineInstr *MI = VNI->getCopy();
if (!CP.isCoalescable(MI) &&
- !RegistersDefinedFromSameValue(*li_, *tri_, CP, VNI, lr, DupCopies))
+ !RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, lr, DupCopies))
continue;
RHSValsDefinedFromLHS[VNI] = lr->valno;
@@ -1486,7 +1650,7 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
// and mark the X as coalesced to keep the illusion.
unsigned Src = MI->getOperand(1).getReg();
SourceRegisters.push_back(Src);
- MI->getOperand(0).substVirtReg(Src, 0, *tri_);
+ MI->getOperand(0).substVirtReg(Src, 0, *TRI);
markAsJoined(MI);
}
@@ -1495,13 +1659,13 @@ bool RegisterCoalescer::JoinIntervals(CoalescerPair &CP) {
// that B = X is gone.
for (SmallVector<unsigned, 8>::iterator I = SourceRegisters.begin(),
E = SourceRegisters.end(); I != E; ++I) {
- li_->shrinkToUses(&li_->getInterval(*I));
+ LIS->shrinkToUses(&LIS->getInterval(*I));
}
// If we get here, we know that we can coalesce the live ranges. Ask the
// intervals to coalesce themselves now.
LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo,
- mri_);
+ MRI);
return true;
}
@@ -1552,7 +1716,7 @@ void RegisterCoalescer::CopyCoalesceInMBB(MachineBasicBlock *MBB,
bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
- if (li_->hasInterval(SrcReg) && li_->getInterval(SrcReg).empty())
+ if (LIS->hasInterval(SrcReg) && LIS->getInterval(SrcReg).empty())
ImpDefCopies.push_back(Inst);
else if (SrcIsPhys || DstIsPhys)
PhysCopies.push_back(Inst);
@@ -1590,9 +1754,9 @@ void RegisterCoalescer::joinIntervals() {
DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
std::vector<MachineInstr*> TryAgainList;
- if (loopInfo->empty()) {
+ if (Loops->empty()) {
// If there are no loops in the function, join intervals in function order.
- for (MachineFunction::iterator I = mf_->begin(), E = mf_->end();
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end();
I != E; ++I)
CopyCoalesceInMBB(I, TryAgainList);
} else {
@@ -1603,9 +1767,9 @@ void RegisterCoalescer::joinIntervals() {
// Join intervals in the function prolog first. We want to join physical
// registers with virtual registers before the intervals get too long.
std::vector<std::pair<unsigned, MachineBasicBlock*> > MBBs;
- for (MachineFunction::iterator I = mf_->begin(), E = mf_->end();I != E;++I){
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end();I != E;++I){
MachineBasicBlock *MBB = I;
- MBBs.push_back(std::make_pair(loopInfo->getLoopDepth(MBB), I));
+ MBBs.push_back(std::make_pair(Loops->getLoopDepth(MBB), I));
}
// Sort by loop depth.
@@ -1644,22 +1808,22 @@ void RegisterCoalescer::releaseMemory() {
}
bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
- mf_ = &fn;
- mri_ = &fn.getRegInfo();
- tm_ = &fn.getTarget();
- tri_ = tm_->getRegisterInfo();
- tii_ = tm_->getInstrInfo();
- li_ = &getAnalysis<LiveIntervals>();
- ldv_ = &getAnalysis<LiveDebugVariables>();
+ MF = &fn;
+ MRI = &fn.getRegInfo();
+ TM = &fn.getTarget();
+ TRI = TM->getRegisterInfo();
+ TII = TM->getInstrInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ LDV = &getAnalysis<LiveDebugVariables>();
AA = &getAnalysis<AliasAnalysis>();
- loopInfo = &getAnalysis<MachineLoopInfo>();
+ Loops = &getAnalysis<MachineLoopInfo>();
DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
<< "********** Function: "
- << ((Value*)mf_->getFunction())->getName() << '\n');
+ << ((Value*)MF->getFunction())->getName() << '\n');
if (VerifyCoalescing)
- mf_->verify(this, "Before register coalescing");
+ MF->verify(this, "Before register coalescing");
RegClassInfo.runOnMachineFunction(fn);
@@ -1668,9 +1832,9 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
joinIntervals();
DEBUG({
dbgs() << "********** INTERVALS POST JOINING **********\n";
- for (LiveIntervals::iterator I = li_->begin(), E = li_->end();
+ for (LiveIntervals::iterator I = LIS->begin(), E = LIS->end();
I != E; ++I){
- I->second->print(dbgs(), tri_);
+ I->second->print(dbgs(), TRI);
dbgs() << "\n";
}
});
@@ -1678,8 +1842,8 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
// Perform a final pass over the instructions and compute spill weights
// and remove identity moves.
- SmallVector<unsigned, 4> DeadDefs;
- for (MachineFunction::iterator mbbi = mf_->begin(), mbbe = mf_->end();
+ SmallVector<unsigned, 4> DeadDefs, InflateRegs;
+ for (MachineFunction::iterator mbbi = MF->begin(), mbbe = MF->end();
mbbi != mbbe; ++mbbi) {
MachineBasicBlock* mbb = mbbi;
for (MachineBasicBlock::iterator mii = mbb->begin(), mie = mbb->end();
@@ -1690,6 +1854,16 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
bool DoDelete = true;
assert(MI->isCopyLike() && "Unrecognized copy instruction");
unsigned SrcReg = MI->getOperand(MI->isSubregToReg() ? 2 : 1).getReg();
+ unsigned DstReg = MI->getOperand(0).getReg();
+
+ // Collect candidates for register class inflation.
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ RegClassInfo.isProperSubClass(MRI->getRegClass(SrcReg)))
+ InflateRegs.push_back(SrcReg);
+ if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
+ RegClassInfo.isProperSubClass(MRI->getRegClass(DstReg)))
+ InflateRegs.push_back(DstReg);
+
if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
MI->getNumOperands() > 2)
// Do not delete extract_subreg, insert_subreg of physical
@@ -1701,8 +1875,8 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
if (MI->allDefsAreDead()) {
if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
- li_->hasInterval(SrcReg))
- li_->shrinkToUses(&li_->getInterval(SrcReg));
+ LIS->hasInterval(SrcReg))
+ LIS->shrinkToUses(&LIS->getInterval(SrcReg));
DoDelete = true;
}
if (!DoDelete) {
@@ -1711,10 +1885,10 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
MI->RemoveOperand(3);
MI->RemoveOperand(1);
}
- MI->setDesc(tii_->get(TargetOpcode::KILL));
+ MI->setDesc(TII->get(TargetOpcode::KILL));
mii = llvm::next(mii);
} else {
- li_->RemoveMachineInstrFromMaps(MI);
+ LIS->RemoveMachineInstrFromMaps(MI);
mii = mbbi->erase(mii);
++numPeep;
}
@@ -1731,12 +1905,16 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
unsigned Reg = MO.getReg();
if (!Reg)
continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
DeadDefs.push_back(Reg);
+ // Remat may also enable register class inflation.
+ if (RegClassInfo.isProperSubClass(MRI->getRegClass(Reg)))
+ InflateRegs.push_back(Reg);
+ }
if (MO.isDead())
continue;
if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !mri_->use_nodbg_empty(Reg)) {
+ !MRI->use_nodbg_empty(Reg)) {
isDead = false;
break;
}
@@ -1745,9 +1923,9 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
while (!DeadDefs.empty()) {
unsigned DeadDef = DeadDefs.back();
DeadDefs.pop_back();
- RemoveDeadDef(li_->getInterval(DeadDef), MI);
+ RemoveDeadDef(LIS->getInterval(DeadDef), MI);
}
- li_->RemoveMachineInstrFromMaps(mii);
+ LIS->RemoveMachineInstrFromMaps(mii);
mii = mbbi->erase(mii);
continue;
} else
@@ -1757,14 +1935,14 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
++mii;
// Check for now unnecessary kill flags.
- if (li_->isNotInMIMap(MI)) continue;
- SlotIndex DefIdx = li_->getInstructionIndex(MI).getDefIndex();
+ if (LIS->isNotInMIMap(MI)) continue;
+ SlotIndex DefIdx = LIS->getInstructionIndex(MI).getDefIndex();
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isKill()) continue;
unsigned reg = MO.getReg();
- if (!reg || !li_->hasInterval(reg)) continue;
- if (!li_->getInterval(reg).killedAt(DefIdx)) {
+ if (!reg || !LIS->hasInterval(reg)) continue;
+ if (!LIS->getInterval(reg).killedAt(DefIdx)) {
MO.setIsKill(false);
continue;
}
@@ -1772,26 +1950,40 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
// remain alive.
if (!TargetRegisterInfo::isPhysicalRegister(reg))
continue;
- for (const unsigned *SR = tri_->getSubRegisters(reg);
+ for (const unsigned *SR = TRI->getSubRegisters(reg);
unsigned S = *SR; ++SR)
- if (li_->hasInterval(S) && li_->getInterval(S).liveAt(DefIdx))
- MI->addRegisterDefined(S, tri_);
+ if (LIS->hasInterval(S) && LIS->getInterval(S).liveAt(DefIdx))
+ MI->addRegisterDefined(S, TRI);
}
}
}
+ // After deleting a lot of copies, register classes may be less constrained.
+ // Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 ->
+ // DPR inflation.
+ array_pod_sort(InflateRegs.begin(), InflateRegs.end());
+ InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
+ InflateRegs.end());
+ DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size() << " regs.\n");
+ for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
+ unsigned Reg = InflateRegs[i];
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ if (MRI->recomputeRegClass(Reg, *TM)) {
+ DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
+ << MRI->getRegClass(Reg)->getName() << '\n');
+ ++NumInflated;
+ }
+ }
+
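
The sort-then-unique sequence applied to InflateRegs above is the standard in-place dedup idiom; spelled out with the standard library (array_pod_sort is essentially a qsort-style wrapper over the same operation):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<unsigned> InflateRegs;
      unsigned Vals[] = {7, 3, 7, 5, 3};
      InflateRegs.assign(Vals, Vals + 5);
      std::sort(InflateRegs.begin(), InflateRegs.end());
      // std::unique compacts duplicates to the tail; erase trims them off.
      InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
                        InflateRegs.end());
      for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i)
        std::printf("%u ", InflateRegs[i]); // 3 5 7
    }
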
DEBUG(dump());
- DEBUG(ldv_->dump());
+ DEBUG(LDV->dump());
if (VerifyCoalescing)
- mf_->verify(this, "After register coalescing");
+ MF->verify(this, "After register coalescing");
return true;
}
/// print - Implement the dump method.
void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
- li_->print(O, m);
-}
-
-RegisterCoalescer *llvm::createRegisterCoalescer() {
- return new RegisterCoalescer();
+ LIS->print(O, m);
}
diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h
index 4131d91..472c483 100644
--- a/lib/CodeGen/RegisterCoalescer.h
+++ b/lib/CodeGen/RegisterCoalescer.h
@@ -12,198 +12,60 @@
//
//===----------------------------------------------------------------------===//
-#include "RegisterClassInfo.h"
-#include "llvm/Support/IncludeFile.h"
-#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/ADT/SmallPtrSet.h"
-
#ifndef LLVM_CODEGEN_REGISTER_COALESCER_H
#define LLVM_CODEGEN_REGISTER_COALESCER_H
namespace llvm {
- class MachineFunction;
- class RegallocQuery;
- class AnalysisUsage;
class MachineInstr;
class TargetRegisterInfo;
class TargetRegisterClass;
class TargetInstrInfo;
- class LiveDebugVariables;
- class VirtRegMap;
- class MachineLoopInfo;
-
- class CoalescerPair;
-
- /// An abstract interface for register coalescers. Coalescers must
- /// implement this interface to be part of the coalescer analysis
- /// group.
- class RegisterCoalescer : public MachineFunctionPass {
- MachineFunction* mf_;
- MachineRegisterInfo* mri_;
- const TargetMachine* tm_;
- const TargetRegisterInfo* tri_;
- const TargetInstrInfo* tii_;
- LiveIntervals *li_;
- LiveDebugVariables *ldv_;
- const MachineLoopInfo* loopInfo;
- AliasAnalysis *AA;
- RegisterClassInfo RegClassInfo;
-
- /// JoinedCopies - Keep track of copies eliminated due to coalescing.
- ///
- SmallPtrSet<MachineInstr*, 32> JoinedCopies;
-
- /// ReMatCopies - Keep track of copies eliminated due to remat.
- ///
- SmallPtrSet<MachineInstr*, 32> ReMatCopies;
-
- /// ReMatDefs - Keep track of definition instructions which have
- /// been remat'ed.
- SmallPtrSet<MachineInstr*, 8> ReMatDefs;
-
- /// joinIntervals - join compatible live intervals
- void joinIntervals();
-
- /// CopyCoalesceInMBB - Coalesce copies in the specified MBB, putting
- /// copies that cannot yet be coalesced into the "TryAgain" list.
- void CopyCoalesceInMBB(MachineBasicBlock *MBB,
- std::vector<MachineInstr*> &TryAgain);
-
- /// JoinCopy - Attempt to join intervals corresponding to SrcReg/DstReg,
- /// which are the src/dst of the copy instruction CopyMI. This returns true
- /// if the copy was successfully coalesced away. If it is not currently
- /// possible to coalesce this interval, but it may be possible if other
- /// things get coalesced, then it returns true by reference in 'Again'.
- bool JoinCopy(MachineInstr *TheCopy, bool &Again);
-
- /// JoinIntervals - Attempt to join these two intervals. On failure, this
- /// returns false. The output "SrcInt" will not have been modified, so we can
- /// use this information below to update aliases.
- bool JoinIntervals(CoalescerPair &CP);
-
- /// AdjustCopiesBackFrom - We found a non-trivially-coalescable copy. If
- /// the source value number is defined by a copy from the destination reg
- /// see if we can merge these two destination reg valno# into a single
- /// value number, eliminating a copy.
- bool AdjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
-
- /// HasOtherReachingDefs - Return true if there are definitions of IntB
- /// other than BValNo val# that can reach uses of AValno val# of IntA.
- bool HasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
- VNInfo *AValNo, VNInfo *BValNo);
-
- /// RemoveCopyByCommutingDef - We found a non-trivially-coalescable copy.
- /// If the source value number is defined by a commutable instruction and
- /// its other operand is coalesced to the copy dest register, see if we
- /// can transform the copy into a noop by commuting the definition.
- bool RemoveCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
-
- /// ReMaterializeTrivialDef - If the source of a copy is defined by a trivial
- /// computation, replace the copy by rematerialize the definition.
- /// If PreserveSrcInt is true, make sure SrcInt is valid after the call.
- bool ReMaterializeTrivialDef(LiveInterval &SrcInt, bool PreserveSrcInt,
- unsigned DstReg, unsigned DstSubIdx,
- MachineInstr *CopyMI);
-
- /// shouldJoinPhys - Return true if a physreg copy should be joined.
- bool shouldJoinPhys(CoalescerPair &CP);
-
- /// isWinToJoinCrossClass - Return true if it's profitable to coalesce
- /// two virtual registers from different register classes.
- bool isWinToJoinCrossClass(unsigned SrcReg,
- unsigned DstReg,
- const TargetRegisterClass *SrcRC,
- const TargetRegisterClass *DstRC,
- const TargetRegisterClass *NewRC);
-
- /// UpdateRegDefsUses - Replace all defs and uses of SrcReg to DstReg and
- /// update the subregister number if it is not zero. If DstReg is a
- /// physical register and the existing subregister number of the def / use
- /// being updated is not zero, make sure to set it to the correct physical
- /// subregister.
- void UpdateRegDefsUses(const CoalescerPair &CP);
-
- /// RemoveDeadDef - If a def of a live interval is now determined dead,
- /// remove the val# it defines. If the live interval becomes empty, remove
- /// it as well.
- bool RemoveDeadDef(LiveInterval &li, MachineInstr *DefMI);
-
- /// RemoveCopyFlag - If DstReg is no longer defined by CopyMI, clear the
- /// VNInfo copy flag for DstReg and all aliases.
- void RemoveCopyFlag(unsigned DstReg, const MachineInstr *CopyMI);
-
- /// markAsJoined - Remember that CopyMI has already been joined.
- void markAsJoined(MachineInstr *CopyMI);
-
- public:
- static char ID; // Class identification, replacement for typeinfo
- RegisterCoalescer() : MachineFunctionPass(ID) {
- initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
- }
-
- /// Register allocators must call this from their own
- /// getAnalysisUsage to cover the case where the coalescer is not
- /// a Pass in the proper sense and isn't managed by PassManager.
- /// PassManager needs to know which analyses to make available and
- /// which to invalidate when running the register allocator or any
- /// pass that might call coalescing. The long-term solution is to
- /// allow hierarchies of PassManagers.
- virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-
- virtual void releaseMemory();
-
- /// runOnMachineFunction - pass entry point
- virtual bool runOnMachineFunction(MachineFunction&);
-
- /// print - Implement the dump method.
- virtual void print(raw_ostream &O, const Module* = 0) const;
- };
/// CoalescerPair - A helper class for register coalescers. When deciding if
/// two registers can be coalesced, CoalescerPair can determine if a copy
/// instruction would become an identity copy after coalescing.
class CoalescerPair {
- const TargetInstrInfo &tii_;
- const TargetRegisterInfo &tri_;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
- /// dstReg_ - The register that will be left after coalescing. It can be a
+ /// DstReg - The register that will be left after coalescing. It can be a
/// virtual or physical register.
- unsigned dstReg_;
+ unsigned DstReg;
- /// srcReg_ - the virtual register that will be coalesced into dstReg.
- unsigned srcReg_;
+ /// SrcReg - The virtual register that will be coalesced into DstReg.
+ unsigned SrcReg;
- /// subReg_ - The subregister index of srcReg in dstReg_. It is possible the
- /// coalesce srcReg_ into a subreg of the larger dstReg_ when dstReg_ is a
+ /// SubIdx - The subregister index of SrcReg in DstReg. It is possible to
+ /// coalesce SrcReg into a subreg of the larger DstReg when DstReg is a
/// virtual register.
- unsigned subIdx_;
+ unsigned SubIdx;
- /// partial_ - True when the original copy was a partial subregister copy.
- bool partial_;
+ /// Partial - True when the original copy was a partial subregister copy.
+ bool Partial;
- /// crossClass_ - True when both regs are virtual, and newRC is constrained.
- bool crossClass_;
+ /// CrossClass - True when both regs are virtual, and NewRC is constrained.
+ bool CrossClass;
- /// flipped_ - True when DstReg and SrcReg are reversed from the oriignal copy
- /// instruction.
- bool flipped_;
+ /// Flipped - True when DstReg and SrcReg are reversed from the original
+ /// copy instruction.
+ bool Flipped;
- /// newRC_ - The register class of the coalesced register, or NULL if dstReg_
+ /// NewRC - The register class of the coalesced register, or NULL if DstReg
/// is a physreg.
- const TargetRegisterClass *newRC_;
+ const TargetRegisterClass *NewRC;
public:
CoalescerPair(const TargetInstrInfo &tii, const TargetRegisterInfo &tri)
- : tii_(tii), tri_(tri), dstReg_(0), srcReg_(0), subIdx_(0),
- partial_(false), crossClass_(false), flipped_(false), newRC_(0) {}
+ : TII(tii), TRI(tri), DstReg(0), SrcReg(0), SubIdx(0),
+ Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
/// setRegisters - set registers to match the copy instruction MI. Return
/// false if MI is not a coalescable copy instruction.
bool setRegisters(const MachineInstr*);
- /// flip - Swap srcReg_ and dstReg_. Return false if swapping is impossible
- /// because dstReg_ is a physical register, or subIdx_ is set.
+ /// flip - Swap SrcReg and DstReg. Return false if swapping is impossible
+ /// because DstReg is a physical register, or SubIdx is set.
bool flip();
/// isCoalescable - Return true if MI is a copy instruction that will become
@@ -211,32 +73,33 @@ namespace llvm {
bool isCoalescable(const MachineInstr*) const;
/// isPhys - Return true if DstReg is a physical register.
- bool isPhys() const { return !newRC_; }
+ bool isPhys() const { return !NewRC; }
- /// isPartial - Return true if the original copy instruction did not copy the
- /// full register, but was a subreg operation.
- bool isPartial() const { return partial_; }
+ /// isPartial - Return true if the original copy instruction did not copy
+ /// the full register, but was a subreg operation.
+ bool isPartial() const { return Partial; }
- /// isCrossClass - Return true if DstReg is virtual and NewRC is a smaller register class than DstReg's.
- bool isCrossClass() const { return crossClass_; }
+ /// isCrossClass - Return true if DstReg is virtual and NewRC is a smaller
+ /// register class than DstReg's.
+ bool isCrossClass() const { return CrossClass; }
/// isFlipped - Return true when getSrcReg is the register being defined by
/// the original copy instruction.
- bool isFlipped() const { return flipped_; }
+ bool isFlipped() const { return Flipped; }
/// getDstReg - Return the register (virtual or physical) that will remain
/// after coalescing.
- unsigned getDstReg() const { return dstReg_; }
+ unsigned getDstReg() const { return DstReg; }
/// getSrcReg - Return the virtual register that will be coalesced away.
- unsigned getSrcReg() const { return srcReg_; }
+ unsigned getSrcReg() const { return SrcReg; }
/// getSubIdx - Return the subregister index in DstReg that SrcReg will be
/// coalesced into, or 0.
- unsigned getSubIdx() const { return subIdx_; }
+ unsigned getSubIdx() const { return SubIdx; }
/// getNewRC - Return the register class of the coalesced register.
- const TargetRegisterClass *getNewRC() const { return newRC_; }
+ const TargetRegisterClass *getNewRC() const { return NewRC; }
};
} // End llvm namespace
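A sketch of how a coalescer is expected to drive this helper, based only on the interface above. The surrounding pass plumbing (CopyMI, TII, TRI) is assumed to be in scope, and the control flow is illustrative rather than the pass's actual logic:

  // Minimal sketch: decide whether CopyMI can be joined and which
  // register survives. Assumes LLVM headers and pass context.
  CoalescerPair CP(*TII, *TRI);
  if (!CP.setRegisters(CopyMI))
    return false;                     // not a coalescable copy
  if (CP.isPhys()) {
    // DstReg is physical; getNewRC() is null and joining is more
    // restricted.
  } else if (CP.isCrossClass()) {
    // Joining constrains the result to the smaller class getNewRC().
  }
  // On success, every use of CP.getSrcReg() is rewritten to
  // CP.getDstReg(), at sub-register index CP.getSubIdx() when nonzero,
  // and CP.isCoalescable(MI) can recognize copies that become identity
  // copies afterwards.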
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 9e9a145..ca02aa1 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -206,6 +206,7 @@ void RegScavenger::forward() {
break;
}
assert(SubUsed && "Using an undefined register!");
+ (void)SubUsed;
}
assert((!EarlyClobberRegs.test(Reg) || MI->isRegTiedToDefOperand(i)) &&
"Using an early clobbered register!");
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 21375b2..1e9b5c8 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -26,7 +26,7 @@
using namespace llvm;
#ifndef NDEBUG
-cl::opt<bool> StressSchedOpt(
+static cl::opt<bool> StressSchedOpt(
"stress-sched", cl::Hidden, cl::init(false),
cl::desc("Stress test instruction scheduling"));
#endif
@@ -140,6 +140,7 @@ void SUnit::removePred(const SDep &D) {
break;
}
assert(FoundSucc && "Mismatching preds / succs lists!");
+ (void)FoundSucc;
Preds.erase(I);
// Update the bookkeeping.
if (P.getKind() == SDep::Data) {
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 446adfc..34b8ab0 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -36,7 +36,7 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
const MachineDominatorTree &mdt)
: ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
InstrItins(mf.getTarget().getInstrItineraryData()),
- Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()),
+ Defs(TRI->getNumRegs()), Uses(TRI->getNumRegs()),
LoopRegs(MLI, MDT), FirstDbgValue(0) {
DbgValues.clear();
}
@@ -134,6 +134,7 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
}
void ScheduleDAGInstrs::StartBlock(MachineBasicBlock *BB) {
+ LoopRegs.Deps.clear();
if (MachineLoop *ML = MLI.getLoopFor(BB))
if (BB == ML->getLoopLatch()) {
MachineBasicBlock *Header = ML->getHeader();
diff --git a/lib/CodeGen/ScheduleDAGInstrs.h b/lib/CodeGen/ScheduleDAGInstrs.h
index 8a4ea85..666bdf5 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.h
+++ b/lib/CodeGen/ScheduleDAGInstrs.h
@@ -48,7 +48,8 @@ namespace llvm {
/// VisitLoop - Clear out any previous state and analyze the given loop.
///
void VisitLoop(const MachineLoop *Loop) {
- Deps.clear();
+ assert(Deps.empty() && "stale loop dependencies");
+
MachineBasicBlock *Header = Loop->getHeader();
SmallSet<unsigned, 8> LoopLiveIns;
for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(),
@@ -109,7 +110,7 @@ namespace llvm {
/// initialized and destructed for each block.
std::vector<std::vector<SUnit *> > Defs;
std::vector<std::vector<SUnit *> > Uses;
-
+
/// PendingLoads - Remember where unknown loads are after the most recent
/// unknown store, as we iterate. As with Defs and Uses, this is here
/// to minimize construction/destruction.
@@ -127,7 +128,7 @@ namespace llvm {
protected:
/// DbgValues - Remember instruction that precedes DBG_VALUE.
- typedef std::vector<std::pair<MachineInstr *, MachineInstr *> >
+ typedef std::vector<std::pair<MachineInstr *, MachineInstr *> >
DbgValueVector;
DbgValueVector DbgValues;
MachineInstr *FirstDbgValue;
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 0e005d3..b80c01e 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -213,7 +213,6 @@ void ScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
freeUnits = freeUnit & (freeUnit - 1);
} while (freeUnits);
- assert(freeUnit && "No function unit available!");
if (IS->getReservationKind() == InstrStage::Required)
RequiredScoreboard[cycle + i] |= freeUnit;
else
diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt
index 15932c0..2282f0e 100644
--- a/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -21,3 +21,13 @@ add_llvm_library(LLVMSelectionDAG
TargetLowering.cpp
TargetSelectionDAGInfo.cpp
)
+
+add_llvm_library_dependencies(LLVMSelectionDAG
+ LLVMAnalysis
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ LLVMTransformUtils
+ )
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4f0d2ca..7b87868 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -216,6 +216,7 @@ namespace {
SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
SDValue visitBUILD_VECTOR(SDNode *N);
SDValue visitCONCAT_VECTORS(SDNode *N);
+ SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
SDValue visitVECTOR_SHUFFLE(SDNode *N);
SDValue visitMEMBARRIER(SDNode *N);
@@ -1105,6 +1106,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
+ case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
case ISD::MEMBARRIER: return visitMEMBARRIER(N);
}
@@ -1526,12 +1528,6 @@ SDValue DAGCombiner::visitADDE(SDNode *N) {
ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
- // If both operands are null we know that carry out will always be false.
- if (N0C && N0C->isNullValue() && N0 == N1)
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), DAG.getNode(ISD::CARRY_FALSE,
- N->getDebugLoc(),
- MVT::Glue));
-
// canonicalize constant to RHS
if (N0C && !N1C)
return DAG.getNode(ISD::ADDE, N->getDebugLoc(), N->getVTList(),
@@ -3763,7 +3759,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (VT.isInteger() &&
(VT0 == MVT::i1 ||
(VT0.isInteger() &&
- TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent)) &&
+ TLI.getBooleanContents(false) == TargetLowering::ZeroOrOneBooleanContent)) &&
N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
SDValue XORNode;
if (VT == VT0)
@@ -4118,7 +4114,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == N0VT.getSizeInBits())
- return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+ return DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
// If the desired elements are smaller or larger than the source
@@ -4132,7 +4128,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
N0VT.getVectorNumElements());
SDValue VsetCC =
- DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+ DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
@@ -4348,7 +4344,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
- DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+ DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get()),
DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
@@ -4364,7 +4360,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
N0VT.getVectorNumElements());
SDValue VsetCC =
- DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+ DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
@@ -4532,7 +4528,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// we know that the element size of the sext'd result matches the
// element size of the compare operands.
if (VT.getSizeInBits() == N0VT.getSizeInBits())
- return DAG.getVSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+ return DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
// If the desired elements are smaller or larger than the source
@@ -4546,7 +4542,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
N0VT.getVectorNumElements());
SDValue VsetCC =
- DAG.getVSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+ DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
@@ -6479,7 +6475,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
- const Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
+ Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
if (NewAlign < TLI.getTargetData()->getABITypeAlignment(NewVTTy))
return SDValue();
@@ -6542,7 +6538,7 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
unsigned LDAlign = LD->getAlignment();
unsigned STAlign = ST->getAlignment();
- const Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
+ Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
unsigned ABIAlign = TLI.getTargetData()->getABITypeAlignment(IntVTTy);
if (LDAlign < ABIAlign || STAlign < ABIAlign)
return SDValue();
@@ -6776,6 +6772,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue InVec = N->getOperand(0);
SDValue InVal = N->getOperand(1);
SDValue EltNo = N->getOperand(2);
+ DebugLoc dl = N->getDebugLoc();
// If the inserted element is an UNDEF, just use the input vector.
if (InVal.getOpcode() == ISD::UNDEF)
@@ -6787,32 +6784,40 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
return SDValue();
- // If the invec is a BUILD_VECTOR and if EltNo is a constant, build a new
- // vector with the inserted element.
- if (InVec.getOpcode() == ISD::BUILD_VECTOR && isa<ConstantSDNode>(EltNo)) {
- unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- SmallVector<SDValue, 8> Ops(InVec.getNode()->op_begin(),
- InVec.getNode()->op_end());
- if (Elt < Ops.size())
- Ops[Elt] = InVal;
- return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
- VT, &Ops[0], Ops.size());
- }
- // If the invec is an UNDEF and if EltNo is a constant, create a new
- // BUILD_VECTOR with undef elements and the inserted element.
- if (InVec.getOpcode() == ISD::UNDEF &&
- isa<ConstantSDNode>(EltNo)) {
- EVT EltVT = VT.getVectorElementType();
+ // Check that we know which element is being inserted
+ if (!isa<ConstantSDNode>(EltNo))
+ return SDValue();
+ unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+ // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+ // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
+ // vector elements.
+ SmallVector<SDValue, 8> Ops;
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+ Ops.append(InVec.getNode()->op_begin(),
+ InVec.getNode()->op_end());
+ } else if (InVec.getOpcode() == ISD::UNDEF) {
unsigned NElts = VT.getVectorNumElements();
- SmallVector<SDValue, 8> Ops(NElts, DAG.getUNDEF(EltVT));
+ Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+ } else {
+ return SDValue();
+ }
- unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- if (Elt < Ops.size())
- Ops[Elt] = InVal;
- return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
- VT, &Ops[0], Ops.size());
+ // Insert the element
+ if (Elt < Ops.size()) {
+ // All the operands of BUILD_VECTOR must have the same type;
+ // we enforce that here.
+ EVT OpVT = Ops[0].getValueType();
+ if (InVal.getValueType() != OpVT)
+ InVal = OpVT.bitsGT(InVal.getValueType()) ?
+ DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
+ DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
+ Ops[Elt] = InVal;
}
- return SDValue();
+
+ // Return the new vector
+ return DAG.getNode(ISD::BUILD_VECTOR, dl,
+ VT, &Ops[0], Ops.size());
}
SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
@@ -6896,7 +6901,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// If Idx was -1 above, Elt is going to be -1, so just return undef.
if (Elt == -1)
- return DAG.getUNDEF(LN0->getBasePtr().getValueType());
+ return DAG.getUNDEF(LVT);
unsigned Align = LN0->getAlignment();
if (NewLoad) {
@@ -7028,6 +7033,36 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
+ EVT NVT = N->getValueType(0);
+ SDValue V = N->getOperand(0);
+
+ if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ // Handle only the simple case where the vector being inserted and the
+ // vector being extracted have the same type, each half the size of the
+ // larger vector.
+ EVT BigVT = V->getOperand(0).getValueType();
+ EVT SmallVT = V->getOperand(1).getValueType();
+ if (NVT != SmallVT || NVT.getSizeInBits()*2 != BigVT.getSizeInBits())
+ return SDValue();
+
+ // Combine:
+ //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
+ // Into:
+ //    indices are equal => V2
+ //    otherwise => (extract_subvec V1, ExtIdx)
+ //
+ SDValue ExtIdx = N->getOperand(1);
+ SDValue InsIdx = V->getOperand(2);
+
+ if (InsIdx == ExtIdx)
+ return V->getOperand(1);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, N->getDebugLoc(), NVT,
+ V->getOperand(0), ExtIdx);
+ }
+
+ return SDValue();
+}
+
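The identity this combine relies on can be checked on plain arrays. Below is a small self-contained model (names are illustrative, not the DAG API); the half-size restriction above is what guarantees the two subvectors are either identical or disjoint:

  #include <cassert>
  #include <vector>

  typedef std::vector<int> Vec;

  // Model of insert_subvec: overwrite Big[Idx..Idx+Small.size()) with Small.
  static Vec insertSub(Vec Big, const Vec &Small, unsigned Idx) {
    for (unsigned i = 0; i != Small.size(); ++i)
      Big[Idx + i] = Small[i];
    return Big;
  }

  // Model of extract_subvec: read Len elements starting at Idx.
  static Vec extractSub(const Vec &V, unsigned Idx, unsigned Len) {
    return Vec(V.begin() + Idx, V.begin() + Idx + Len);
  }

  int main() {
    Vec V1(8, 0), V2(4, 9);
    // Same index: the extract returns the inserted subvector V2 ...
    assert(extractSub(insertSub(V1, V2, 4), 4, 4) == V2);
    // ... different index: the halves are disjoint, so the insert is
    // irrelevant and the extract reads straight from V1.
    assert(extractSub(insertSub(V1, V2, 4), 0, 4) == extractSub(V1, 0, 4));
    return 0;
  }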
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -7447,7 +7482,7 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
const_cast<ConstantFP*>(FV->getConstantFPValue()),
const_cast<ConstantFP*>(TV->getConstantFPValue())
};
- const Type *FPTy = Elts[0]->getType();
+ Type *FPTy = Elts[0]->getType();
const TargetData &TD = *TLI.getTargetData();
// Create a ConstantArray of the two constants.
@@ -7465,10 +7500,13 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
SDValue Cond = DAG.getSetCC(DL,
TLI.getSetCCResultType(N0.getValueType()),
N0, N1, CC);
+ AddToWorkList(Cond.getNode());
SDValue CstOffset = DAG.getNode(ISD::SELECT, DL, Zero.getValueType(),
Cond, One, Zero);
+ AddToWorkList(CstOffset.getNode());
CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
CstOffset);
+ AddToWorkList(CPIdx.getNode());
return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(), false,
false, Alignment);
@@ -7553,7 +7591,8 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
// fold select C, 16, 0 -> shl C, 4
if (N2C && N3C && N3C->isNullValue() && N2C->getAPIntValue().isPowerOf2() &&
- TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent) {
+ TLI.getBooleanContents(N0.getValueType().isVector()) ==
+ TargetLowering::ZeroOrOneBooleanContent) {
// If the caller doesn't want us to simplify this into a zext of a compare,
// don't do it.
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 54a7d43..e8f8c73 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -66,17 +66,22 @@ using namespace llvm;
void FastISel::startNewBlock() {
LocalValueMap.clear();
- // Start out as null, meaining no local-value instructions have
- // been emitted.
- LastLocalValue = 0;
+ EmitStartPt = 0;
- // Advance the last local value past any EH_LABEL instructions.
+ // Advance the emit start point past any EH_LABEL instructions.
MachineBasicBlock::iterator
I = FuncInfo.MBB->begin(), E = FuncInfo.MBB->end();
while (I != E && I->getOpcode() == TargetOpcode::EH_LABEL) {
- LastLocalValue = I;
+ EmitStartPt = I;
++I;
}
+ LastLocalValue = EmitStartPt;
+}
+
+void FastISel::flushLocalValueMap() {
+ LocalValueMap.clear();
+ LastLocalValue = EmitStartPt;
+ recomputeInsertPt();
}
bool FastISel::hasTrivialKill(const Value *V) const {
@@ -183,7 +188,7 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
(void) Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
APFloat::rmTowardZero, &isExact);
if (isExact) {
- APInt IntVal(IntBitWidth, 2, x);
+ APInt IntVal(IntBitWidth, x);
unsigned IntegerReg =
getRegForValue(ConstantInt::get(V->getContext(), IntVal));
@@ -422,12 +427,12 @@ bool FastISel::SelectGetElementPtr(const User *I) {
bool NIsKill = hasTrivialKill(I->getOperand(0));
- const Type *Ty = I->getOperand(0)->getType();
+ Type *Ty = I->getOperand(0)->getType();
MVT VT = TLI.getPointerTy();
for (GetElementPtrInst::const_op_iterator OI = I->op_begin()+1,
E = I->op_end(); OI != E; ++OI) {
const Value *Idx = *OI;
- if (const StructType *StTy = dyn_cast<StructType>(Ty)) {
+ if (StructType *StTy = dyn_cast<StructType>(Ty)) {
unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
if (Field) {
// N = N + Offset
@@ -489,7 +494,7 @@ bool FastISel::SelectCall(const User *I) {
const CallInst *Call = cast<CallInst>(I);
// Handle simple inline asms.
- if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getArgOperand(0))) {
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getCalledValue())) {
// Don't attempt to handle constraints.
if (!IA->getConstraintString().empty())
return false;
@@ -526,13 +531,10 @@ bool FastISel::SelectCall(const User *I) {
unsigned Reg = 0;
unsigned Offset = 0;
if (const Argument *Arg = dyn_cast<Argument>(Address)) {
- if (Arg->hasByValAttr()) {
- // Byval arguments' frame index is recorded during argument lowering.
- // Use this info directly.
- Offset = FuncInfo.getByValArgumentFrameIndex(Arg);
- if (Offset)
- Reg = TRI.getFrameRegister(*FuncInfo.MF);
- }
+ // Some arguments' frame index is recorded during argument lowering.
+ Offset = FuncInfo.getArgumentFrameIndex(Arg);
+ if (Offset)
+ Reg = TRI.getFrameRegister(*FuncInfo.MF);
}
if (!Reg)
Reg = getRegForValue(Address);
@@ -645,6 +647,16 @@ bool FastISel::SelectCall(const User *I) {
}
}
+ // Usually it does not make sense to initialize a value, make an unrelated
+ // function call, and then use the value, because the value tends to be
+ // spilled across the call. So flush the local value map before lowering
+ // the call: values materialized from here on are emitted after the call
+ // instead of being hoisted to the start of the block. Intrinsics are
+ // exempt since they tend to be inlined.
+ if (!isa<IntrinsicInst>(F))
+ flushLocalValueMap();
+
// An arbitrary call. Bail.
return false;
}
@@ -839,7 +851,7 @@ FastISel::SelectExtractValue(const User *U) {
return false;
const Value *Op0 = EVI->getOperand(0);
- const Type *AggTy = Op0->getType();
+ Type *AggTy = Op0->getType();
// Get the base result register.
unsigned ResultReg;
@@ -1074,7 +1086,7 @@ unsigned FastISel::FastEmit_ri_(MVT VT, unsigned Opcode,
if (MaterialReg == 0) {
// This is a bit ugly/slow, but failing here means falling out of
// fast-isel, which would be very slow.
- const IntegerType *ITy = IntegerType::get(FuncInfo.Fn->getContext(),
+ IntegerType *ITy = IntegerType::get(FuncInfo.Fn->getContext(),
VT.getSizeInBits());
MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm));
}
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index d518b5d..b052740 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -78,7 +78,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
if (const AllocaInst *AI = dyn_cast<AllocaInst>(I))
if (const ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) {
- const Type *Ty = AI->getAllocatedType();
+ Type *Ty = AI->getAllocatedType();
uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
unsigned Align =
std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
@@ -216,7 +216,7 @@ unsigned FunctionLoweringInfo::CreateReg(EVT VT) {
/// In the case that the given value has struct or array type, this function
/// will assign registers for each member or element.
///
-unsigned FunctionLoweringInfo::CreateRegs(const Type *Ty) {
+unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(TLI, Ty, ValueVTs);
@@ -260,7 +260,7 @@ FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) {
/// ComputePHILiveOutRegInfo - Compute LiveOutInfo for a PHI's destination
/// register based on the LiveOutInfo of its operands.
void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
- const Type *Ty = PN->getType();
+ Type *Ty = PN->getType();
if (!Ty->isIntegerTy() || Ty->isVectorTy())
return;
@@ -351,20 +351,18 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
}
}
-/// setByValArgumentFrameIndex - Record frame index for the byval
+/// setArgumentFrameIndex - Record frame index for the
/// argument. This overrides previous frame index entry for this argument,
/// if any.
-void FunctionLoweringInfo::setByValArgumentFrameIndex(const Argument *A,
+void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
int FI) {
- assert (A->hasByValAttr() && "Argument does not have byval attribute!");
ByValArgFrameIndexMap[A] = FI;
}
-/// getByValArgumentFrameIndex - Get frame index for the byval argument.
+/// getArgumentFrameIndex - Get frame index for an argument.
/// If the argument does not have any assigned frame index then 0 is
/// returned.
-int FunctionLoweringInfo::getByValArgumentFrameIndex(const Argument *A) {
- assert (A->hasByValAttr() && "Argument does not have byval attribute!");
+int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
DenseMap<const Argument *, int>::iterator I =
ByValArgFrameIndexMap.find(A);
if (I != ByValArgFrameIndexMap.end())
@@ -454,3 +452,34 @@ void llvm::CopyCatchInfo(const BasicBlock *SuccBB, const BasicBlock *LPad,
break;
}
}
+
+/// AddLandingPadInfo - Extract the exception handling information from the
+/// landingpad instruction and add them to the specified machine module info.
+void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI,
+ MachineBasicBlock *MBB) {
+ MMI.addPersonality(MBB,
+ cast<Function>(I.getPersonalityFn()->stripPointerCasts()));
+
+ if (I.isCleanup())
+ MMI.addCleanup(MBB);
+
+ // FIXME: New EH - Add the clauses in reverse order. This isn't 100% correct,
+ // but we need to do it this way because of how the DWARF EH emitter
+ // processes the clauses.
+ for (unsigned i = I.getNumClauses(); i != 0; --i) {
+ Value *Val = I.getClause(i - 1);
+ if (I.isCatch(i - 1)) {
+ MMI.addCatchTypeInfo(MBB,
+ dyn_cast<GlobalVariable>(Val->stripPointerCasts()));
+ } else {
+ // Add filters in a list.
+ Constant *CVal = cast<Constant>(Val);
+ SmallVector<const GlobalVariable*, 4> FilterList;
+ for (User::op_iterator
+ II = CVal->op_begin(), IE = CVal->op_end(); II != IE; ++II)
+ FilterList.push_back(cast<GlobalVariable>((*II)->stripPointerCasts()));
+
+ MMI.addFilterTypeInfo(MBB, FilterList);
+ }
+ }
+}
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index f0f4743..2ff66f8 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -30,6 +30,12 @@
#include "llvm/Support/MathExtras.h"
using namespace llvm;
+/// MinRCSize - Smallest register class we allow when constraining virtual
+/// registers. If satisfying all register class constraints would require
+/// using a smaller register class, emit a COPY to a new virtual register
+/// instead.
+const unsigned MinRCSize = 4;
+
/// CountResults - The results of target nodes have register or immediate
/// operands first, then an optional chain, and optional glue operands (which do
/// not go into the resulting MachineInstr).
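The rule MinRCSize encodes: constraining a virtual register to a smaller class is acceptable only while the class keeps at least MinRCSize allocatable registers; otherwise a COPY to a fresh virtual register is cheaper than starving the register allocator. A standalone model of that decision (class names and register counts are illustrative, not the LLVM API):

  #include <cstdio>

  struct RegClass {
    const char *Name;
    unsigned NumRegs; // number of allocatable registers in the class
  };

  // Model of MRI->constrainRegClass(VReg, Req, MinSize): succeed only if
  // the constrained class keeps at least MinSize registers. (The real
  // code computes the common subclass of the current and required class;
  // here Req is assumed to already be that subclass.)
  static const RegClass *constrain(const RegClass *Req, unsigned MinSize) {
    return Req->NumRegs >= MinSize ? Req : 0;
  }

  int main() {
    const unsigned MinRCSize = 4;
    RegClass GR32_ABCD = { "GR32_ABCD", 4 };
    RegClass GR32_TINY = { "GR32_TINY", 2 }; // hypothetical tiny class

    if (const RegClass *RC = constrain(&GR32_ABCD, MinRCSize))
      std::printf("constrained to %s\n", RC->Name);    // taken: 4 >= 4
    if (!constrain(&GR32_TINY, MinRCSize))
      std::printf("too small: emit a COPY instead\n"); // taken: 2 < 4
    return 0;
  }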
@@ -87,7 +93,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
UI != E; ++UI) {
SDNode *User = *UI;
bool Match = true;
- if (User->getOpcode() == ISD::CopyToReg &&
+ if (User->getOpcode() == ISD::CopyToReg &&
User->getOperand(2).getNode() == Node &&
User->getOperand(2).getResNo() == ResNo) {
unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
@@ -113,7 +119,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
if (!UseRC)
UseRC = RC;
else if (RC) {
- const TargetRegisterClass *ComRC = getCommonSubClass(UseRC, RC);
+ const TargetRegisterClass *ComRC =
+ TRI->getCommonSubClass(UseRC, RC);
// If multiple uses expect disjoint register classes, we emit
// copies in AddRegisterOperand.
if (ComRC)
@@ -139,7 +146,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
} else {
DstRC = TLI->getRegClassFor(VT);
}
-
+
// If all uses are reading from the src physical register and copying the
// register is either impossible or very expensive, then don't create a copy.
if (MatchReg && SrcRC->getCopyCost() < 0) {
@@ -167,7 +174,7 @@ unsigned InstrEmitter::getDstOfOnlyCopyToRegUse(SDNode *Node,
return 0;
SDNode *User = *Node->use_begin();
- if (User->getOpcode() == ISD::CopyToReg &&
+ if (User->getOpcode() == ISD::CopyToReg &&
User->getOperand(2).getNode() == Node &&
User->getOperand(2).getResNo() == ResNo) {
unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
@@ -202,7 +209,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, MachineInstr *MI,
for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
UI != E; ++UI) {
SDNode *User = *UI;
- if (User->getOpcode() == ISD::CopyToReg &&
+ if (User->getOpcode() == ISD::CopyToReg &&
User->getOperand(2).getNode() == Node &&
User->getOperand(2).getResNo() == i) {
unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
@@ -280,15 +287,16 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
MCID.OpInfo[IIOpNum].isOptionalDef();
// If the instruction requires a register in a different class, create
- // a new virtual register and copy the value into it.
+ // a new virtual register and copy the value into it, but first attempt to
+ // shrink VReg's register class within reason. For example, if VReg == GR32
+ // and II requires a GR32_NOSP, just constrain VReg to GR32_NOSP.
if (II) {
- const TargetRegisterClass *SrcRC = MRI->getRegClass(VReg);
const TargetRegisterClass *DstRC = 0;
if (IIOpNum < II->getNumOperands())
DstRC = TII->getRegClass(*II, IIOpNum, TRI);
assert((DstRC || (MCID.isVariadic() && IIOpNum >= MCID.getNumOperands())) &&
"Don't have operand info for this instruction!");
- if (DstRC && !SrcRC->hasSuperClassEq(DstRC)) {
+ if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) {
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
@@ -326,7 +334,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
/// AddOperand - Add the specified operand to the specified machine instr. II
/// specifies the instruction information for the node, and IIOpNum is the
-/// operand number (in the II) that we are adding. IIOpNum and II are used for
+/// operand number (in the II) that we are adding. IIOpNum and II are used for
/// assertions only.
void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
unsigned IIOpNum,
@@ -356,7 +364,7 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op)) {
int Offset = CP->getOffset();
unsigned Align = CP->getAlignment();
- const Type *Type = CP->getType();
+ Type *Type = CP->getType();
// MachineConstantPool wants an explicit alignment.
if (Align == 0) {
Align = TM->getTargetData()->getPrefTypeAlignment(Type);
@@ -365,7 +373,7 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
Align = TM->getTargetData()->getTypeAllocSize(Type);
}
}
-
+
unsigned Idx;
MachineConstantPool *MCP = MF->getConstantPool();
if (CP->isMachineConstantPoolEntry())
@@ -389,35 +397,44 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
}
}
-/// getSuperRegisterRegClass - Returns the register class of a superreg A whose
-/// "SubIdx"'th sub-register class is the specified register class and whose
-/// type matches the specified type.
-static const TargetRegisterClass*
-getSuperRegisterRegClass(const TargetRegisterClass *TRC,
- unsigned SubIdx, EVT VT) {
- // Pick the register class of the superegister for this type
- for (TargetRegisterInfo::regclass_iterator I = TRC->superregclasses_begin(),
- E = TRC->superregclasses_end(); I != E; ++I)
- if ((*I)->hasType(VT) && (*I)->getSubRegisterRegClass(SubIdx) == TRC)
- return *I;
- assert(false && "Couldn't find the register class");
- return 0;
+unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
+ EVT VT, DebugLoc DL) {
+ const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
+ const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
+
+ // RC is a sub-class of VRC that supports SubIdx. Try to constrain VReg
+ // within reason.
+ if (RC && RC != VRC)
+ RC = MRI->constrainRegClass(VReg, RC, MinRCSize);
+
+ // VReg has been adjusted. It can be used with SubIdx operands now.
+ if (RC)
+ return VReg;
+
+ // VReg couldn't be reasonably constrained. Emit a COPY to a new virtual
+ // register instead.
+ RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT), SubIdx);
+ assert(RC && "No legal register class for VT supports that SubIdx");
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(VReg);
+ return NewReg;
}
/// EmitSubregNode - Generate machine code for subreg nodes.
///
-void InstrEmitter::EmitSubregNode(SDNode *Node,
+void InstrEmitter::EmitSubregNode(SDNode *Node,
DenseMap<SDValue, unsigned> &VRBaseMap,
bool IsClone, bool IsCloned) {
unsigned VRBase = 0;
unsigned Opc = Node->getMachineOpcode();
-
+
// If the node is only used by a CopyToReg and the dest reg is a vreg, use
// the CopyToReg'd destination register instead of creating a new vreg.
for (SDNode::use_iterator UI = Node->use_begin(), E = Node->use_end();
UI != E; ++UI) {
SDNode *User = *UI;
- if (User->getOpcode() == ISD::CopyToReg &&
+ if (User->getOpcode() == ISD::CopyToReg &&
User->getOperand(2).getNode() == Node) {
unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
@@ -426,12 +443,14 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
}
}
}
-
+
if (Opc == TargetOpcode::EXTRACT_SUBREG) {
- // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub
+ // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub. There are no
+ // constraints on the %dst register, COPY can target all legal register
+ // classes.
unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const TargetRegisterClass *TRC = TLI->getRegClassFor(Node->getValueType(0));
- // Figure out the register class to create for the destreg.
unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
MachineInstr *DefMI = MRI->getVRegDef(VReg);
unsigned SrcReg, DstReg, DefSubIdx;
@@ -443,62 +462,57 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
// r1026 = extract_subreg r1025, 4
// to a copy
// r1026 = copy r1024
- const TargetRegisterClass *TRC = MRI->getRegClass(SrcReg);
VRBase = MRI->createVirtualRegister(TRC);
BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
} else {
- const TargetRegisterClass *TRC = MRI->getRegClass(VReg);
- const TargetRegisterClass *SRC = TRC->getSubRegisterRegClass(SubIdx);
- assert(SRC && "Invalid subregister index in EXTRACT_SUBREG");
-
- // Figure out the register class to create for the destreg.
- // Note that if we're going to directly use an existing register,
- // it must be precisely the required class, and not a subclass
- // thereof.
- if (VRBase == 0 || SRC != MRI->getRegClass(VRBase)) {
- // Create the reg
- assert(SRC && "Couldn't find source register class");
- VRBase = MRI->createVirtualRegister(SRC);
- }
+ // VReg may not support a SubIdx sub-register, and we may need to
+ // constrain its register class or issue a COPY to a compatible register
+ // class.
+ VReg = ConstrainForSubReg(VReg, SubIdx,
+ Node->getOperand(0).getValueType(),
+ Node->getDebugLoc());
- // Create the extract_subreg machine instruction.
- MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(),
- TII->get(TargetOpcode::COPY), VRBase);
+ // Create the destreg if it is missing.
+ if (VRBase == 0)
+ VRBase = MRI->createVirtualRegister(TRC);
- // Add source, and subreg index
- AddOperand(MI, Node->getOperand(0), 0, 0, VRBaseMap, /*IsDebug=*/false,
- IsClone, IsCloned);
- assert(TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg())&&
- "Cannot yet extract from physregs");
- MI->getOperand(1).setSubReg(SubIdx);
- MBB->insert(InsertPos, MI);
+ // Create the extract_subreg machine instruction.
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), VRBase).addReg(VReg, 0, SubIdx);
}
} else if (Opc == TargetOpcode::INSERT_SUBREG ||
Opc == TargetOpcode::SUBREG_TO_REG) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
SDValue N2 = Node->getOperand(2);
- unsigned SubReg = getVR(N1, VRBaseMap);
unsigned SubIdx = cast<ConstantSDNode>(N2)->getZExtValue();
- const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
- const TargetRegisterClass *SRC =
- getSuperRegisterRegClass(TRC, SubIdx, Node->getValueType(0));
-
- // Figure out the register class to create for the destreg.
- // Note that if we're going to directly use an existing register,
- // it must be precisely the required class, and not a subclass
- // thereof.
- if (VRBase == 0 || SRC != MRI->getRegClass(VRBase)) {
- // Create the reg
- assert(SRC && "Couldn't find source register class");
+
+ // Figure out the register class to create for the destreg. It should be
+ // the largest legal register class supporting SubIdx sub-registers.
+ // RegisterCoalescer will constrain it further if it decides to eliminate
+ // the INSERT_SUBREG instruction.
+ //
+ // %dst = INSERT_SUBREG %src, %sub, SubIdx
+ //
+ // is lowered by TwoAddressInstructionPass to:
+ //
+ // %dst = COPY %src
+ // %dst:SubIdx = COPY %sub
+ //
+ // There is no constraint on the %src register class.
+ //
+ const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getValueType(0));
+ SRC = TRI->getSubClassWithSubReg(SRC, SubIdx);
+ assert(SRC && "No register class supports VT and SubIdx for INSERT_SUBREG");
+
+ if (VRBase == 0 || !SRC->hasSubClassEq(MRI->getRegClass(VRBase)))
VRBase = MRI->createVirtualRegister(SRC);
- }
// Create the insert_subreg or subreg_to_reg machine instruction.
MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(), TII->get(Opc));
MI->addOperand(MachineOperand::CreateReg(VRBase, true));
-
+
// If creating a subreg_to_reg, then the first input operand
// is an implicit value immediate, otherwise it's a register
if (Opc == TargetOpcode::SUBREG_TO_REG) {
@@ -514,7 +528,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
MBB->insert(InsertPos, MI);
} else
llvm_unreachable("Node is not insert_subreg, extract_subreg, or subreg_to_reg");
-
+
SDValue Op(Node, 0);
bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
(void)isNew; // Silence compiler warning.
@@ -643,9 +657,9 @@ void InstrEmitter::
EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
DenseMap<SDValue, unsigned> &VRBaseMap) {
unsigned Opc = Node->getMachineOpcode();
-
+
// Handle subreg insert/extract specially
- if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
Opc == TargetOpcode::INSERT_SUBREG ||
Opc == TargetOpcode::SUBREG_TO_REG) {
EmitSubregNode(Node, VRBaseMap, IsClone, IsCloned);
@@ -667,7 +681,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
if (Opc == TargetOpcode::IMPLICIT_DEF)
// We want a unique VR for each IMPLICIT_DEF use.
return;
-
+
const MCInstrDesc &II = TII->get(Opc);
unsigned NumResults = CountResults(Node);
unsigned NodeOperands = CountOperands(Node);
@@ -712,12 +726,12 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
// Then mark unused registers as dead.
MI->setPhysRegsDeadExcept(UsedRegs, *TRI);
}
-
+
// Add result register values for things that are defined by this
// instruction.
if (NumResults)
CreateVirtualRegisters(Node, MI, II, IsClone, IsCloned, VRBaseMap);
-
+
// Emit all of the actual operands of this instruction, adding them to the
// instruction as appropriate.
bool HasOptPRefs = II.getNumDefs() > NumResults;
@@ -751,7 +765,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
MI->addRegisterDead(Reg, TRI);
}
}
-
+
// If the instruction has implicit defs and the node doesn't, mark the
// implicit def as dead. If the node has any glue outputs, we don't do this
// because we don't know what implicit defs are being used by glued nodes.
@@ -761,6 +775,12 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
i != e; ++i)
MI->addRegisterDead(IDList[i-II.getNumDefs()], TRI);
}
+
+ // Run post-isel target hook to adjust this instruction if needed.
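+ // In asserts builds the hook is invoked unconditionally; presumably the
+ // base-class implementation asserts when hasPostISelHook is set but the
+ // target never overrode AdjustInstrPostInstrSelection.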
+#ifdef NDEBUG
+ if (II.hasPostISelHook())
+#endif
+ TLI->AdjustInstrPostInstrSelection(MI, Node);
}
/// EmitSpecialNode - Generate machine code for a target-independent node and
@@ -788,7 +808,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
SrcReg = R->getReg();
else
SrcReg = getVR(SrcVal, VRBaseMap);
-
+
unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
break;
@@ -808,12 +828,12 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
TII->get(TargetOpcode::EH_LABEL)).addSym(S);
break;
}
-
+
case ISD::INLINEASM: {
unsigned NumOps = Node->getNumOperands();
if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
--NumOps; // Ignore the glue operand.
-
+
// Create the inline asm machine instruction.
MachineInstr *MI = BuildMI(*MF, Node->getDebugLoc(),
TII->get(TargetOpcode::INLINEASM));
@@ -822,7 +842,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
SDValue AsmStrV = Node->getOperand(InlineAsm::Op_AsmString);
const char *AsmStr = cast<ExternalSymbolSDNode>(AsmStrV)->getSymbol();
MI->addOperand(MachineOperand::CreateES(AsmStr));
-
+
// Add the HasSideEffect and isAlignStack bits.
int64_t ExtraInfo =
cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_ExtraInfo))->
@@ -834,10 +854,10 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
unsigned Flags =
cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
-
+
MI->addOperand(MachineOperand::CreateImm(Flags));
++i; // Skip the ID value.
-
+
switch (InlineAsm::getKind(Flags)) {
default: llvm_unreachable("Bad flags!");
case InlineAsm::Kind_RegDef:
@@ -873,13 +893,13 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
break;
}
}
-
+
// Get the mdnode from the asm if it exists and add it to the instruction.
SDValue MDV = Node->getOperand(InlineAsm::Op_MDNode);
const MDNode *MD = cast<MDNodeSDNode>(MDV)->getMD();
if (MD)
MI->addOperand(MachineOperand::CreateMetadata(MD));
-
+
MBB->insert(InsertPos, MI);
break;
}
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h
index 19fc044..c081f38 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -77,6 +77,12 @@ class InstrEmitter {
DenseMap<SDValue, unsigned> &VRBaseMap,
bool IsDebug, bool IsClone, bool IsCloned);
+ /// ConstrainForSubReg - Try to constrain VReg to a register class that
+ /// supports SubIdx sub-registers. Emit a copy if that isn't possible.
+ /// Return the virtual register to use.
+ unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
+ EVT VT, DebugLoc DL);
+
/// EmitSubregNode - Generate machine code for subreg nodes.
///
void EmitSubregNode(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d06e2bd..63255ae 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -53,10 +53,15 @@ class SelectionDAGLegalize {
// Libcall insertion helpers.
- /// LastCALLSEQ - This keeps track of the CALLSEQ_END node that has been
+ /// LastCALLSEQ_END - This keeps track of the CALLSEQ_END node that has been
/// legalized. We use this to ensure that calls are properly serialized
/// against each other, including inserted libcalls.
- SmallVector<SDValue, 8> LastCALLSEQ;
+ SDValue LastCALLSEQ_END;
+
+ /// IsLegalizingCall - This member is used *only* for purposes of providing
+ /// helpful assertions that a libcall isn't created while another call is
+ /// being legalized (which could lead to non-serialized call sequences).
+ bool IsLegalizingCall;
/// LegalizedNodes - For nodes that are of legal width, and that have more
/// than one use, this map indicates what regularized operand to use. This
@@ -149,15 +154,6 @@ private:
void ExpandNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void PromoteNode(SDNode *Node, SmallVectorImpl<SDValue> &Results);
-
- SDValue getLastCALLSEQ() { return LastCALLSEQ.back(); }
- void setLastCALLSEQ(const SDValue s) { LastCALLSEQ.back() = s; }
- void pushLastCALLSEQ(SDValue s) {
- LastCALLSEQ.push_back(s);
- }
- void popLastCALLSEQ() {
- LastCALLSEQ.pop_back();
- }
};
}
@@ -199,7 +195,8 @@ SelectionDAGLegalize::SelectionDAGLegalize(SelectionDAG &dag)
}
void SelectionDAGLegalize::LegalizeDAG() {
- pushLastCALLSEQ(DAG.getEntryNode());
+ LastCALLSEQ_END = DAG.getEntryNode();
+ IsLegalizingCall = false;
// The legalize process is inherently a bottom-up recursive process (users
// legalize their uses before themselves). Given infinite stack space, we
@@ -227,15 +224,14 @@ void SelectionDAGLegalize::LegalizeDAG() {
/// FindCallEndFromCallStart - Given a chained node that is part of a call
/// sequence, find the CALLSEQ_END node that terminates the call sequence.
static SDNode *FindCallEndFromCallStart(SDNode *Node, int depth = 0) {
- int next_depth = depth;
+ // Nested CALLSEQ_START/END constructs aren't yet legal,
+ // but we can DTRT and handle them correctly here.
if (Node->getOpcode() == ISD::CALLSEQ_START)
- next_depth = depth + 1;
- if (Node->getOpcode() == ISD::CALLSEQ_END) {
- assert(depth > 0 && "negative depth!");
- if (depth == 1)
+ depth++;
+ else if (Node->getOpcode() == ISD::CALLSEQ_END) {
+ depth--;
+ if (depth == 0)
return Node;
- else
- next_depth = depth - 1;
}
if (Node->use_empty())
return 0; // No CallSeqEnd
@@ -266,7 +262,7 @@ static SDNode *FindCallEndFromCallStart(SDNode *Node, int depth = 0) {
SDNode *User = *UI;
for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i)
if (User->getOperand(i) == TheChain)
- if (SDNode *Result = FindCallEndFromCallStart(User, next_depth))
+ if (SDNode *Result = FindCallEndFromCallStart(User, depth))
return Result;
}
return 0;
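The depth bookkeeping above is ordinary bracket matching. A self-contained sketch over a linear token stream (the real code walks chain uses through a DAG, so it recurses instead of looping):

  #include <cassert>
  #include <vector>

  enum Tok { CALLSEQ_START, CALLSEQ_END, OTHER };

  // Return the index of the END matching the START at 'From', or -1.
  static int findMatchingEnd(const std::vector<Tok> &Seq, unsigned From) {
    assert(Seq[From] == CALLSEQ_START && "must start at a CALLSEQ_START");
    int Depth = 0;
    for (unsigned i = From; i != Seq.size(); ++i) {
      if (Seq[i] == CALLSEQ_START)
        ++Depth;
      else if (Seq[i] == CALLSEQ_END && --Depth == 0)
        return (int)i;
    }
    return -1; // no matching CALLSEQ_END
  }

  int main() {
    Tok S[] = { CALLSEQ_START, OTHER, CALLSEQ_START, CALLSEQ_END,
                OTHER, CALLSEQ_END };
    std::vector<Tok> Seq(S, S + 6);
    assert(findMatchingEnd(Seq, 0) == 5); // skips the nested pair
    assert(findMatchingEnd(Seq, 2) == 3);
    return 0;
  }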
@@ -287,7 +283,6 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
case ISD::CALLSEQ_START:
if (!nested)
return Node;
- Node = Node->getOperand(0).getNode();
nested--;
break;
case ISD::CALLSEQ_END:
@@ -295,7 +290,7 @@ static SDNode *FindCallStartFromCallEnd(SDNode *Node) {
break;
}
}
- return (Node->getOpcode() == ISD::CALLSEQ_START) ? Node : 0;
+ return 0;
}
/// LegalizeAllNodesNotLeadingTo - Recursively walk the uses of N, looking to
@@ -365,7 +360,7 @@ static SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP,
// smaller type.
TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) &&
TLI.ShouldShrinkFPConstant(OrigVT)) {
- const Type *SType = SVT.getTypeForEVT(*DAG.getContext());
+ Type *SType = SVT.getTypeForEVT(*DAG.getContext());
LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
VT = SVT;
Extend = true;
@@ -819,6 +814,11 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
break;
}
+ case ISD::ATOMIC_STORE: {
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(2).getValueType());
+ break;
+ }
case ISD::SELECT_CC:
case ISD::SETCC:
case ISD::BR_CC: {
@@ -872,7 +872,8 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
if (Action == TargetLowering::Legal)
Action = TargetLowering::Expand;
break;
- case ISD::TRAMPOLINE:
+ case ISD::INIT_TRAMPOLINE:
+ case ISD::ADJUST_TRAMPOLINE:
case ISD::FRAMEADDR:
case ISD::RETURNADDR:
// These operations lie about being legal: when they claim to be legal,
@@ -912,12 +913,11 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
case ISD::BR_JT:
case ISD::BR_CC:
case ISD::BRCOND:
- assert(LastCALLSEQ.size() == 1 && "branch inside CALLSEQ_BEGIN/END?");
- // Branches tweak the chain to include LastCALLSEQ
+ // Branches tweak the chain to include LastCALLSEQ_END
Ops[0] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ops[0],
- getLastCALLSEQ());
+ LastCALLSEQ_END);
Ops[0] = LegalizeOp(Ops[0]);
- setLastCALLSEQ(DAG.getEntryNode());
+ LastCALLSEQ_END = DAG.getEntryNode();
break;
case ISD::SHL:
case ISD::SRL:
@@ -989,6 +989,31 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
#endif
assert(0 && "Do not know how to legalize this operator!");
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: {
+ // Scalarize vector SRA/SRL/SHL.
+ EVT VT = Node->getValueType(0);
+ assert(VT.isVector() && "Unable to legalize non-vector shift");
+ assert(TLI.isTypeLegal(VT.getScalarType()) && "Element type must be legal");
+ unsigned NumElem = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 8> Scalars;
+ for (unsigned Idx = 0; Idx < NumElem; Idx++) {
+ SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ VT.getScalarType(),
+ Node->getOperand(0), DAG.getIntPtrConstant(Idx));
+ SDValue Sh = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ VT.getScalarType(),
+ Node->getOperand(1), DAG.getIntPtrConstant(Idx));
+ Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
+ VT.getScalarType(), Ex, Sh));
+ }
+ Result = DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0),
+ &Scalars[0], Scalars.size());
+ break;
+ }
+
case ISD::BUILD_VECTOR:
switch (TLI.getOperationAction(ISD::BUILD_VECTOR, Node->getValueType(0))) {
default: assert(0 && "This action is not supported yet!");
@@ -1006,7 +1031,6 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
break;
case ISD::CALLSEQ_START: {
SDNode *CallEnd = FindCallEndFromCallStart(Node);
- assert(CallEnd && "didn't find CALLSEQ_END!");
// Recursively Legalize all of the inputs of the call end that do not lead
// to this call start. This ensures that any libcalls that need be inserted
@@ -1023,9 +1047,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
// Merge in the last call to ensure that this call starts after the last
// call ended.
- if (getLastCALLSEQ().getOpcode() != ISD::EntryToken) {
+ if (LastCALLSEQ_END.getOpcode() != ISD::EntryToken) {
Tmp1 = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- Tmp1, getLastCALLSEQ());
+ Tmp1, LastCALLSEQ_END);
Tmp1 = LegalizeOp(Tmp1);
}
@@ -1046,29 +1070,25 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
// sequence have been legalized, legalize the call itself. During this
// process, no libcalls can/will be inserted, guaranteeing that no calls
// can overlap.
+ assert(!IsLegalizingCall && "Inconsistent sequentialization of calls!");
// Note that we are selecting this call!
- setLastCALLSEQ(SDValue(CallEnd, 0));
+ LastCALLSEQ_END = SDValue(CallEnd, 0);
+ IsLegalizingCall = true;
// Legalize the call, starting from the CALLSEQ_END.
- LegalizeOp(getLastCALLSEQ());
+ LegalizeOp(LastCALLSEQ_END);
+ assert(!IsLegalizingCall && "CALLSEQ_END should have cleared this!");
return Result;
}
case ISD::CALLSEQ_END:
- {
- SDNode *myCALLSEQ_BEGIN = FindCallStartFromCallEnd(Node);
-
- // If the CALLSEQ_START node hasn't been legalized first, legalize it.
- // This will cause this node to be legalized as well as handling libcalls
- // right.
- if (getLastCALLSEQ().getNode() != Node) {
- LegalizeOp(SDValue(myCALLSEQ_BEGIN, 0));
- DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
- assert(I != LegalizedNodes.end() &&
- "Legalizing the call start should have legalized this node!");
- return I->second;
- }
-
- pushLastCALLSEQ(SDValue(myCALLSEQ_BEGIN, 0));
+ // If the CALLSEQ_START node hasn't been legalized first, legalize it. This
+ // will cause this node to be legalized as well as handling libcalls right.
+ if (LastCALLSEQ_END.getNode() != Node) {
+ LegalizeOp(SDValue(FindCallStartFromCallEnd(Node), 0));
+ DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+ assert(I != LegalizedNodes.end() &&
+ "Legalizing the call start should have legalized this node!");
+ return I->second;
}
// Otherwise, the call start has been legalized and everything is going
@@ -1096,8 +1116,9 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
Result.getResNo());
}
}
+ assert(IsLegalizingCall && "Call sequence imbalance between start/end?");
// This finishes up call legalization.
- popLastCALLSEQ();
+ IsLegalizingCall = false;
// If the CALLSEQ_END node has a flag, remember that we legalized it.
AddLegalizedOperand(SDValue(Node, 0), Result.getValue(0));
@@ -1124,7 +1145,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
// If this is an unaligned load and the target doesn't support it,
// expand it.
if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
- const Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty);
if (LD->getAlignment() < ABIAlignment){
Result = ExpandUnalignedLoad(cast<LoadSDNode>(Result.getNode()),
@@ -1311,7 +1332,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
// If this is an unaligned load and the target doesn't support it,
// expand it.
if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
- const Type *Ty =
+ Type *Ty =
LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment =
TLI.getTargetData()->getABITypeAlignment(Ty);
@@ -1491,7 +1512,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
// If this is an unaligned store and the target doesn't support it,
// expand it.
if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
- const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
if (ST->getAlignment() < ABIAlignment)
Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()),
@@ -1596,7 +1617,7 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
// If this is an unaligned store and the target doesn't support it,
// expand it.
if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
- const Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
if (ST->getAlignment() < ABIAlignment)
Result = ExpandUnalignedStore(cast<StoreSDNode>(Result.getNode()),
@@ -1611,82 +1632,101 @@ SDValue SelectionDAGLegalize::LegalizeOp(SDValue Op) {
EVT WideScalarVT = Tmp3.getValueType().getScalarType();
EVT NarrowScalarVT = StVT.getScalarType();
- // The Store type is illegal, must scalarize the vector store.
- SmallVector<SDValue, 8> Stores;
- bool ScalarLegal = TLI.isTypeLegal(WideScalarVT);
- if (!TLI.isTypeLegal(StVT) && StVT.isVector() && ScalarLegal) {
+ if (StVT.isVector()) {
unsigned NumElem = StVT.getVectorNumElements();
+ // The type of the data we want to save
+ EVT RegVT = Tmp3.getValueType();
+ EVT RegSclVT = RegVT.getScalarType();
+ // The type of data as saved in memory.
+ EVT MemSclVT = StVT.getScalarType();
+
+ bool RegScalarLegal = TLI.isTypeLegal(RegSclVT);
+ bool MemScalarLegal = TLI.isTypeLegal(MemSclVT);
+
+ // We need to expand this store. If the register element type
+ // is legal then we can scalarize the vector and use
+ // truncating stores.
+ if (RegScalarLegal) {
+ // Cast floats into integers
+ unsigned ScalarSize = MemSclVT.getSizeInBits();
+ EVT EltVT = EVT::getIntegerVT(*DAG.getContext(), ScalarSize);
+
+ // Round odd types to the next pow of two.
+ if (!isPowerOf2_32(ScalarSize))
+ ScalarSize = NextPowerOf2(ScalarSize);
+
+        // Stride between stores, in bytes.
+ unsigned Stride = ScalarSize/8;
+ // Extract each of the elements from the original vector
+ // and save them into memory individually.
+ SmallVector<SDValue, 8> Stores;
+ for (unsigned Idx = 0; Idx < NumElem; Idx++) {
+ SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ RegSclVT, Tmp3, DAG.getIntPtrConstant(Idx));
+
+ Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getIntPtrConstant(Stride));
+
+      // This scalar TruncStore may be illegal, but we legalize it
+ // later.
+ SDValue Store = DAG.getTruncStore(Tmp1, dl, Ex, Tmp2,
+ ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
+ isVolatile, isNonTemporal, Alignment);
- unsigned ScalarSize = StVT.getScalarType().getSizeInBits();
- // Round odd types to the next pow of two.
- if (!isPowerOf2_32(ScalarSize))
- ScalarSize = NextPowerOf2(ScalarSize);
- // Types smaller than 8 bits are promoted to 8 bits.
- ScalarSize = std::max<unsigned>(ScalarSize, 8);
- // Store stride
- unsigned Stride = ScalarSize/8;
- assert(isPowerOf2_32(Stride) && "Stride must be a power of two");
-
- for (unsigned Idx=0; Idx<NumElem; Idx++) {
- SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- WideScalarVT, Tmp3, DAG.getIntPtrConstant(Idx));
-
-
- EVT NVT = EVT::getIntegerVT(*DAG.getContext(), ScalarSize);
+ Stores.push_back(Store);
+ }
- Ex = DAG.getNode(ISD::TRUNCATE, dl, NVT, Ex);
- Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
- DAG.getIntPtrConstant(Stride));
- SDValue Store = DAG.getStore(Tmp1, dl, Ex, Tmp2,
- ST->getPointerInfo().getWithOffset(Idx*Stride),
- isVolatile, isNonTemporal, Alignment);
- Stores.push_back(Store);
+ Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &Stores[0], Stores.size());
+ break;
}
- Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &Stores[0], Stores.size());
- break;
- }
- // The Store type is illegal, must scalarize the vector store.
- // However, the scalar type is illegal. Must bitcast the result
- // and store it in smaller parts.
- if (!TLI.isTypeLegal(StVT) && StVT.isVector()) {
- unsigned WideNumElem = StVT.getVectorNumElements();
- unsigned Stride = NarrowScalarVT.getSizeInBits()/8;
-
- unsigned SizeRatio =
- (WideScalarVT.getSizeInBits() / NarrowScalarVT.getSizeInBits());
-
- EVT CastValueVT = EVT::getVectorVT(*DAG.getContext(), NarrowScalarVT,
- SizeRatio*WideNumElem);
-
- // Cast the wide elem vector to wider vec with smaller elem type.
- // Example <2 x i64> -> <4 x i32>
- Tmp3 = DAG.getNode(ISD::BITCAST, dl, CastValueVT, Tmp3);
-
- for (unsigned Idx=0; Idx<WideNumElem*SizeRatio; Idx++) {
- // Extract elment i
- SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
- NarrowScalarVT, Tmp3, DAG.getIntPtrConstant(Idx));
- // bump pointer.
- Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
- DAG.getIntPtrConstant(Stride));
-
- // Store if, this element is:
- // - First element on big endian, or
- // - Last element on little endian
- if (( TLI.isBigEndian() && (Idx%SizeRatio == 0)) ||
- ((!TLI.isBigEndian() && (Idx%SizeRatio == SizeRatio-1)))) {
- SDValue Store = DAG.getStore(Tmp1, dl, Ex, Tmp2,
- ST->getPointerInfo().getWithOffset(Idx*Stride),
- isVolatile, isNonTemporal, Alignment);
- Stores.push_back(Store);
+      // The scalar register type is illegal.
+      // For example, saving <2 x i64> -> <2 x i32> on x86.
+      // Here we bitcast the value into a vector of smaller parts and
+      // save it using smaller scalars.
+ if (!RegScalarLegal && MemScalarLegal) {
+      // Stride between stores, in bytes.
+ unsigned Stride = MemSclVT.getSizeInBits()/8;
+
+ unsigned SizeRatio =
+ (RegSclVT.getSizeInBits() / MemSclVT.getSizeInBits());
+
+ EVT CastValueVT = EVT::getVectorVT(*DAG.getContext(),
+ MemSclVT,
+ SizeRatio * NumElem);
+
+ // Cast the wide elem vector to wider vec with smaller elem type.
+ // Example <2 x i64> -> <4 x i32>
+ Tmp3 = DAG.getNode(ISD::BITCAST, dl, CastValueVT, Tmp3);
+
+ SmallVector<SDValue, 8> Stores;
+ for (unsigned Idx=0; Idx < NumElem * SizeRatio; Idx++) {
+ // Extract the Ith element.
+ SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ NarrowScalarVT, Tmp3, DAG.getIntPtrConstant(Idx));
+ // Bump pointer.
+ Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getIntPtrConstant(Stride));
+
+          // Store this element only if it is:
+ // - First element on big endian, or
+ // - Last element on little endian
+ if (( TLI.isBigEndian() && (Idx % SizeRatio == 0)) ||
+ ((!TLI.isBigEndian() && (Idx % SizeRatio == SizeRatio-1)))) {
+ SDValue Store = DAG.getStore(Tmp1, dl, Ex, Tmp2,
+ ST->getPointerInfo().getWithOffset(Idx*Stride),
+ isVolatile, isNonTemporal, Alignment);
+ Stores.push_back(Store);
+ }
}
+ Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &Stores[0], Stores.size());
+ break;
}
- Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
- &Stores[0], Stores.size());
- break;
- }
+
+ assert(false && "Unable to legalize the vector trunc store!");
+  } // is vector
// TRUNCSTORE:i16 i32 -> STORE i16
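
Illustrative sketch, not part of the patch: in the scalarized path above, each lane is stored at a fixed byte stride derived from the memory element size, rounded up to a power of two; a 24-bit element, for instance, ends up with a 4-byte stride. A hypothetical helper showing just that computation:

    // Assumed standalone equivalent of the ScalarSize/Stride logic above.
    unsigned storeStrideBytes(unsigned ScalarSizeBits) {
      // Round odd sizes up to the next power of two, as NextPowerOf2 does.
      if ((ScalarSizeBits & (ScalarSizeBits - 1)) != 0) {
        unsigned P = 1;
        while (P < ScalarSizeBits)
          P <<= 1;
        ScalarSizeBits = P;
      }
      return ScalarSizeBits / 8; // e.g. 24 bits -> 32 bits -> 4-byte stride
    }
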
@@ -1999,7 +2039,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
unsigned SrcSize = SrcOp.getValueType().getSizeInBits();
unsigned SlotSize = SlotVT.getSizeInBits();
unsigned DestSize = DestVT.getSizeInBits();
- const Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
+ Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
unsigned DestAlign = TLI.getTargetData()->getPrefTypeAlignment(DestType);
// Emit a store to the stack slot. Use a truncstore if the input value is
@@ -2106,7 +2146,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
}
} else {
assert(Node->getOperand(i).getOpcode() == ISD::UNDEF);
- const Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+ Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
CV.push_back(UndefValue::get(OpNTy));
}
}
@@ -2150,6 +2190,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
// and leave the Hi part unset.
SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
bool isSigned) {
+ assert(!IsLegalizingCall && "Cannot overlap legalization of calls!");
// The input chain to this libcall is the entry node of the function.
// Legalizing the call will automatically add the previous call to the
// dependence.
@@ -2159,7 +2200,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
EVT ArgVT = Node->getOperand(i).getValueType();
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
Entry.isZExt = !isSigned;
@@ -2169,7 +2210,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
TLI.getPointerTy());
// Splice the libcall in wherever FindInputOutputChains tells us to.
- const Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+ Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
// isTailCall may be true since the callee does not reference caller stack
// frame. Check if it's in the right position.
@@ -2185,7 +2226,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
return DAG.getRoot();
// Legalize the call sequence, starting with the chain. This will advance
- // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that
+ // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
// was added by LowerCallTo (guaranteeing proper serialization of calls).
LegalizeOp(CallInfo.second);
return CallInfo.first;
@@ -2210,7 +2251,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
TLI.getPointerTy());
- const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
std::pair<SDValue,SDValue> CallInfo =
TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
false, 0, TLI.getLibcallCallingConv(LC), false,
@@ -2231,13 +2272,14 @@ std::pair<SDValue, SDValue>
SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
SDNode *Node,
bool isSigned) {
+ assert(!IsLegalizingCall && "Cannot overlap legalization of calls!");
SDValue InChain = Node->getOperand(0);
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
EVT ArgVT = Node->getOperand(i).getValueType();
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i);
Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
@@ -2248,7 +2290,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
TLI.getPointerTy());
// Splice the libcall in wherever FindInputOutputChains tells us to.
- const Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+ Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
std::pair<SDValue, SDValue> CallInfo =
TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
@@ -2256,7 +2298,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
Callee, Args, DAG, Node->getDebugLoc());
// Legalize the call sequence, starting with the chain. This will advance
- // the LastCALLSEQ to the legalized version of the CALLSEQ_END node that
+ // the LastCALLSEQ_END to the legalized version of the CALLSEQ_END node that
// was added by LowerCallTo (guaranteeing proper serialization of calls).
LegalizeOp(CallInfo.second);
return CallInfo;
@@ -2360,13 +2402,13 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
SDValue InChain = DAG.getEntryNode();
EVT RetVT = Node->getValueType(0);
- const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
EVT ArgVT = Node->getOperand(i).getValueType();
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i); Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
Entry.isZExt = !isSigned;
@@ -2397,7 +2439,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
LegalizeOp(CallInfo.second);
// Remainder is loaded back from the stack frame.
- SDValue Rem = DAG.getLoad(RetVT, dl, getLastCALLSEQ(), FIPtr,
+ SDValue Rem = DAG.getLoad(RetVT, dl, LastCALLSEQ_END, FIPtr,
MachinePointerInfo(), false, false, 0);
Results.push_back(CallInfo.first);
Results.push_back(Rem);
@@ -2955,8 +2997,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
Results.push_back(DAG.getConstant(0, MVT::i32));
Results.push_back(Node->getOperand(0));
break;
+ case ISD::ATOMIC_FENCE:
case ISD::MEMBARRIER: {
// If the target didn't lower this, lower it to '__sync_synchronize()' call
+ // FIXME: handle "fence singlethread" more efficiently.
TargetLowering::ArgListTy Args;
std::pair<SDValue, SDValue> CallResult =
TLI.LowerCallTo(Node->getOperand(0), Type::getVoidTy(*DAG.getContext()),
@@ -2969,6 +3013,32 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
Results.push_back(CallResult.second);
break;
}
+ case ISD::ATOMIC_LOAD: {
+ // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP.
+ SDValue Zero = DAG.getConstant(0, Node->getValueType(0));
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(0));
+ Results.push_back(Swap.getValue(1));
+ break;
+ }
+ case ISD::ATOMIC_STORE: {
+ // There is no libcall for atomic store; fake it with ATOMIC_SWAP.
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(1));
+ break;
+ }
// By default, atomic intrinsics are marked Legal and lowered. Targets
// which don't support them directly, however, may want libcalls, in which
// case they mark them Expand, and we get here.
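
Illustrative sketch, not part of the patch: both expansions above rest on standard atomic identities — a load is a compare-and-swap of zero with zero (the store half is a no-op either way, and the old value comes back), and a store is a swap whose result is discarded. The same identities in std::atomic terms, assuming matching memory orderings:

    #include <atomic>
    long atomicLoadViaCAS(std::atomic<long> &A) {
      long Expected = 0;
      A.compare_exchange_strong(Expected, 0); // writes 0 only if already 0
      return Expected; // on failure Expected is updated to the observed value
    }
    void atomicStoreViaSwap(std::atomic<long> &A, long V) {
      (void)A.exchange(V); // the old value is simply dropped
    }
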
@@ -3727,8 +3797,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node,
LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp2.getValueType()),
Tmp2, Tmp3, Tmp4, dl);
- assert(LastCALLSEQ.size() == 1 && "branch inside CALLSEQ_BEGIN/END?");
- setLastCALLSEQ(DAG.getEntryNode());
+ LastCALLSEQ_END = DAG.getEntryNode();
assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index e6835d8..7c1cc69 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -55,6 +55,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
#endif
llvm_unreachable("Do not know how to soften the result of this operator!");
+  case ISD::MERGE_VALUES: R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break;
case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break;
case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break;
case ISD::ConstantFP:
@@ -107,6 +108,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) {
return BitConvertToInteger(N->getOperand(0));
}
+SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return BitConvertToInteger(Op);
+}
+
SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
// Convert the inputs to integers, and build a new pair out of them.
return DAG.getNode(ISD::BUILD_PAIR, N->getDebugLoc(),
@@ -827,11 +834,11 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
#endif
llvm_unreachable("Do not know how to expand the result of this operator!");
- case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+ case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break;
case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
@@ -879,10 +886,10 @@ void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
assert(NVT.getSizeInBits() == integerPartWidth &&
"Do not know how to expand this float constant!");
APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
- Lo = DAG.getConstantFP(APFloat(APInt(integerPartWidth, 1,
- &C.getRawData()[1])), NVT);
- Hi = DAG.getConstantFP(APFloat(APInt(integerPartWidth, 1,
- &C.getRawData()[0])), NVT);
+ Lo = DAG.getConstantFP(APFloat(APInt(integerPartWidth, C.getRawData()[1])),
+ NVT);
+ Hi = DAG.getConstantFP(APFloat(APInt(integerPartWidth, C.getRawData()[0])),
+ NVT);
}
void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
@@ -1201,7 +1208,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
static const uint64_t TwoE32[] = { 0x41f0000000000000LL, 0 };
static const uint64_t TwoE64[] = { 0x43f0000000000000LL, 0 };
static const uint64_t TwoE128[] = { 0x47f0000000000000LL, 0 };
- const uint64_t *Parts = 0;
+ ArrayRef<uint64_t> Parts;
switch (SrcVT.getSimpleVT().SimpleTy) {
default:
@@ -1218,7 +1225,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
}
Lo = DAG.getNode(ISD::FADD, dl, VT, Hi,
- DAG.getConstantFP(APFloat(APInt(128, 2, Parts)),
+ DAG.getConstantFP(APFloat(APInt(128, Parts)),
MVT::ppcf128));
Lo = DAG.getNode(ISD::SELECT_CC, dl, VT, Src, DAG.getConstant(0, SrcVT),
Lo, Hi, DAG.getCondCode(ISD::SETLT));
@@ -1291,8 +1298,7 @@ void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
GetExpandedFloat(NewLHS, LHSLo, LHSHi);
GetExpandedFloat(NewRHS, RHSLo, RHSHi);
- EVT VT = NewLHS.getValueType();
- assert(VT == MVT::ppcf128 && "Unsupported setcc type!");
+ assert(NewLHS.getValueType() == MVT::ppcf128 && "Unsupported setcc type!");
// FIXME: This generated code sucks. We want to generate
// FCMPU crN, hi1, hi2
@@ -1373,7 +1379,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
"Logic only correct for ppcf128!");
const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
- APFloat APF = APFloat(APInt(128, 2, TwoE31));
+ APFloat APF = APFloat(APInt(128, TwoE31));
SDValue Tmp = DAG.getConstantFP(APF, MVT::ppcf128);
// X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
// FIXME: generated code sucks.
@@ -1445,6 +1451,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
ST->getValue().getValueType());
assert(NVT.isByteSized() && "Expanded type not byte sized!");
assert(ST->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+ (void)NVT;
SDValue Lo, Hi;
GetExpandedOp(ST->getValue(), Lo, Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e7c77dd..a5c4c2d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -48,6 +48,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
N->dump(&DAG); dbgs() << "\n";
#endif
llvm_unreachable("Do not know how to promote this operator!");
+  case ISD::MERGE_VALUES: Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break;
case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break;
case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break;
case ISD::BITCAST: Res = PromoteIntRes_BITCAST(N); break;
@@ -63,6 +64,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break;
+ case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break;
case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break;
case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break;
case ISD::SHL: Res = PromoteIntRes_SHL(N); break;
@@ -84,6 +86,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_BUILD_VECTOR(N); break;
case ISD::SCALAR_TO_VECTOR:
Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS:
+ Res = PromoteIntRes_CONCAT_VECTORS(N); break;
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
@@ -114,6 +118,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::SMULO:
case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break;
+ case ISD::ATOMIC_LOAD:
+ Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
+
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_AND:
@@ -136,6 +143,12 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
SetPromotedInteger(SDValue(N, ResNo), Res);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetPromotedInteger(Op);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
// Sign-extend the new bits, and continue the assertion.
SDValue Op = SExtPromotedInteger(N->getOperand(0));
@@ -150,12 +163,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) {
Op.getValueType(), Op, N->getOperand(1));
}
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
+ EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+ N->getMemoryVT(), ResVT,
+ N->getChain(), N->getBasePtr(),
+ N->getMemOperand(), N->getOrdering(),
+ N->getSynchScope());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
SDValue Op2 = GetPromotedInteger(N->getOperand(2));
SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
N->getMemoryVT(),
N->getChain(), N->getBasePtr(),
- Op2, N->getMemOperand());
+ Op2, N->getMemOperand(), N->getOrdering(),
+ N->getSynchScope());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -167,7 +194,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic2(AtomicSDNode *N) {
SDValue Op3 = GetPromotedInteger(N->getOperand(3));
SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
N->getMemoryVT(), N->getChain(), N->getBasePtr(),
- Op2, Op3, N->getMemOperand());
+ Op2, Op3, N->getMemOperand(), N->getOrdering(),
+ N->getSynchScope());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -457,6 +485,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) {
LHS.getValueType(), N->getOperand(0),LHS,RHS);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
+ SDValue Mask = GetPromotedInteger(N->getOperand(0));
+ SDValue LHS = GetPromotedInteger(N->getOperand(1));
+ SDValue RHS = GetPromotedInteger(N->getOperand(2));
+ return DAG.getNode(ISD::VSELECT, N->getDebugLoc(),
+ LHS.getValueType(), Mask, LHS, RHS);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
SDValue LHS = GetPromotedInteger(N->getOperand(2));
SDValue RHS = GetPromotedInteger(N->getOperand(3));
@@ -467,16 +503,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
EVT SVT = TLI.getSetCCResultType(N->getOperand(0).getValueType());
- assert(isTypeLegal(SVT) && "Illegal SetCC type!");
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+ // Only use the result of getSetCCResultType if it is legal,
+ // otherwise just use the promoted result type (NVT).
+ if (!TLI.isTypeLegal(SVT))
+ SVT = NVT;
+
DebugLoc dl = N->getDebugLoc();
+ assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() &&
+ "Vector compare must return a vector result!");
// Get the SETCC result using the canonical SETCC type.
- SDValue SetCC = DAG.getNode(ISD::SETCC, dl, SVT, N->getOperand(0),
+ SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, N->getOperand(0),
N->getOperand(1), N->getOperand(2));
- // Convert to the expected type.
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
assert(NVT.bitsLE(SVT) && "Integer type overpromoted?");
+ // Convert to the expected type.
return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC);
}
@@ -707,6 +751,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
llvm_unreachable("Do not know how to promote this operator's operand!");
case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break;
+ case ISD::ATOMIC_STORE:
+ Res = PromoteIntOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
+ break;
case ISD::BITCAST: Res = PromoteIntOp_BITCAST(N); break;
case ISD::BR_CC: Res = PromoteIntOp_BR_CC(N, OpNo); break;
case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break;
@@ -721,6 +768,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::MEMBARRIER: Res = PromoteIntOp_MEMBARRIER(N); break;
case ISD::SCALAR_TO_VECTOR:
Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break;
+ case ISD::VSELECT:
case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break;
case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break;
@@ -791,6 +839,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0), Op);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ return DAG.getAtomic(N->getOpcode(), N->getDebugLoc(), N->getMemoryVT(),
+ N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(),
+ N->getOrdering(), N->getSynchScope());
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
// This should only occur in unusual situations like bitcasting to an
// x86_fp80, so just turn it into a store+load
@@ -913,14 +968,17 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
- assert(OpNo == 0 && "Only know how to promote condition");
+ assert(OpNo == 0 && "Only know how to promote the condition!");
+ SDValue Cond = N->getOperand(0);
+ EVT OpTy = N->getOperand(1).getValueType();
// Promote all the way up to the canonical SetCC type.
- EVT SVT = TLI.getSetCCResultType(N->getOperand(1).getValueType());
- SDValue Cond = PromoteTargetBoolean(N->getOperand(0), SVT);
+ EVT SVT = TLI.getSetCCResultType(N->getOpcode() == ISD::SELECT ?
+ OpTy.getScalarType() : OpTy);
+ Cond = PromoteTargetBoolean(Cond, SVT);
- return SDValue(DAG.UpdateNodeOperands(N, Cond,
- N->getOperand(1), N->getOperand(2)), 0);
+ return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1),
+ N->getOperand(2)), 0);
}
SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) {
@@ -1024,7 +1082,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
#endif
llvm_unreachable("Do not know how to expand the result of this operator!");
- case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
@@ -1055,6 +1113,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break;
case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break;
case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD: ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break;
case ISD::ATOMIC_LOAD_ADD:
case ISD::ATOMIC_LOAD_SUB:
@@ -1546,6 +1605,12 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
}
+void DAGTypeLegalizer::ExpandIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Res = DisintegrateMERGE_VALUES(N, ResNo);
+ SplitInteger(Res, Lo, Hi);
+}
+
void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
SDValue &Lo, SDValue &Hi) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
@@ -2176,9 +2241,9 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
SDValue &Lo, SDValue &Hi) {
EVT VT = N->getValueType(0);
- const Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+ Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
EVT PtrVT = TLI.getPointerTy();
- const Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
+ Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
DebugLoc dl = N->getDebugLoc();
// A divide for UMULO should be faster than a function call.
@@ -2222,7 +2287,7 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
EVT ArgVT = N->getOperand(i).getValueType();
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = N->getOperand(i);
Entry.Ty = ArgTy;
Entry.isSExt = true;
@@ -2321,6 +2386,20 @@ void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
}
}
+void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(N)->getMemoryVT();
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+ N->getOperand(0),
+ N->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(N)->getMemOperand(),
+ cast<AtomicSDNode>(N)->getOrdering(),
+ cast<AtomicSDNode>(N)->getSynchScope());
+ ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
+ ReplaceValueWith(SDValue(N, 1), Swap.getValue(1));
+}
//===----------------------------------------------------------------------===//
// Integer Operand Expansion
@@ -2365,6 +2444,8 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::ROTR: Res = ExpandIntOp_Shift(N); break;
case ISD::RETURNADDR:
case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break;
+
+ case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -2742,6 +2823,19 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
return MakeLibCall(LC, DstVT, &Op, 1, true, dl);
}
+SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(N)->getMemoryVT(),
+ N->getOperand(0),
+ N->getOperand(1), N->getOperand(2),
+ cast<AtomicSDNode>(N)->getMemOperand(),
+ cast<AtomicSDNode>(N)->getOrdering(),
+ cast<AtomicSDNode>(N)->getSynchScope());
+ return Swap.getValue(1);
+}
+
+
SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue InOp0 = N->getOperand(0);
EVT InVT = InOp0.getValueType();
@@ -2775,7 +2869,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
-
ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
EVT VT = N->getValueType(0);
DebugLoc dl = N->getDebugLoc();
@@ -2830,6 +2923,46 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
+ DebugLoc dl = N->getDebugLoc();
+
+  SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ assert(Op0.getValueType() == Op1.getValueType() &&
+ "Invalid input vector types");
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+ EVT OutElemTy = NOutVT.getVectorElementType();
+
+ unsigned NumElem0 = Op0.getValueType().getVectorNumElements();
+ unsigned NumElem1 = Op1.getValueType().getVectorNumElements();
+ unsigned NumOutElem = NOutVT.getVectorNumElements();
+ assert(NumElem0 + NumElem1 == NumOutElem &&
+ "Invalid number of incoming elements");
+
+ // Take the elements from the first vector.
+ SmallVector<SDValue, 8> Ops(NumOutElem);
+ for (unsigned i = 0; i < NumElem0; ++i) {
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ Op0.getValueType().getScalarType(), Op0,
+ DAG.getIntPtrConstant(i));
+ Ops[i] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext);
+ }
+
+ // Take the elements from the second vector
+ for (unsigned i = 0; i < NumElem1; ++i) {
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ Op1.getValueType().getScalarType(), Op1,
+ DAG.getIntPtrConstant(i));
+ Ops[i + NumElem0] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext);
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size());
+}
+
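
A worked example with hypothetical types, not part of the patch: promoting concat_vectors of two <2 x i8> operands to <4 x i16> extracts all four i8 lanes, ANY_EXTENDs each to i16, and rebuilds the result with BUILD_VECTOR, exactly the two loops above. On plain arrays (zero-extension standing in for ANY_EXTEND, whose high bits are unspecified):

    #include <array>
    #include <cstdint>
    std::array<uint16_t, 4> concatWiden(const std::array<uint8_t, 2> &A,
                                        const std::array<uint8_t, 2> &B) {
      std::array<uint16_t, 4> Out{};
      for (unsigned i = 0; i < 2; ++i)
        Out[i] = A[i];     // lanes taken from the first vector, widened
      for (unsigned i = 0; i < 2; ++i)
        Out[i + 2] = B[i]; // lanes taken from the second vector, widened
      return Out;
    }
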
SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
EVT OutVT = N->getValueType(0);
EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
@@ -2838,14 +2971,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
EVT NOutVTElem = NOutVT.getVectorElementType();
DebugLoc dl = N->getDebugLoc();
-
- SDValue ConvertedVector = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
- N->getOperand(0));
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl,
NOutVTElem, N->getOperand(1));
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,NOutVT,
- ConvertedVector, ConvElem, N->getOperand(2));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NOutVT,
+ V0, ConvElem, N->getOperand(2));
}
SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
@@ -2855,20 +2986,23 @@ SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
V0->getValueType(0).getScalarType(), V0, V1);
- return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext);
-
+ // EXTRACT_VECTOR_ELT can return types which are wider than the incoming
+  // element types. If this is the case then we need to extend the outgoing
+ // value and not truncate it.
+ return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0));
}
SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
-
DebugLoc dl = N->getDebugLoc();
+ unsigned NumElems = N->getNumOperands();
EVT RetSclrTy = N->getValueType(0).getVectorElementType();
SmallVector<SDValue, 8> NewOps;
+ NewOps.reserve(NumElems);
// For each incoming vector
- for (unsigned VecIdx = 0, E = N->getNumOperands(); VecIdx!= E; ++VecIdx) {
+ for (unsigned VecIdx = 0; VecIdx != NumElems; ++VecIdx) {
SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx));
EVT SclrTy = Incoming->getValueType(0).getVectorElementType();
unsigned NumElem = Incoming->getValueType(0).getVectorNumElements();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index ba658b0..a4bb577 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -946,6 +946,13 @@ bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
return true;
}
+SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ if (i != ResNo)
+ ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+ return SDValue(N, ResNo);
+}
+
/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
/// which is split into two not necessarily identical pieces.
void DAGTypeLegalizer::GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT) {
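
A short note, not part of the patch: result i of a MERGE_VALUES node is by definition its operand i, so the helper forwards every result except ResNo to the matching operand and returns the one value the caller still has to legalize. For instance:

    // (t0, t1) = MERGE_VALUES a, b   // t0 == a, t1 == b
    // DisintegrateMERGE_VALUES(N, /*ResNo=*/1) replaces all uses of t0
    // with a, then returns t1 for further legalization.
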
@@ -1046,7 +1053,7 @@ SDValue DAGTypeLegalizer::MakeLibCall(RTLIB::Libcall LC, EVT RetVT,
SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
TLI.getPointerTy());
- const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
std::pair<SDValue,SDValue> CallInfo =
TLI.LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
false, 0, TLI.getLibcallCallingConv(LC), false,
@@ -1067,7 +1074,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
TargetLowering::ArgListEntry Entry;
for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
EVT ArgVT = Node->getOperand(i).getValueType();
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Node->getOperand(i);
Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
@@ -1078,7 +1085,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
TLI.getPointerTy());
// Splice the libcall in wherever FindInputOutputChains tells us to.
- const Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+ Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
std::pair<SDValue, SDValue> CallInfo =
TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
@@ -1093,24 +1100,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
/// type i1, the bits of which conform to getBooleanContents.
SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT VT) {
DebugLoc dl = Bool.getDebugLoc();
- ISD::NodeType ExtendCode;
- switch (TLI.getBooleanContents()) {
- default:
- assert(false && "Unknown BooleanContent!");
- case TargetLowering::UndefinedBooleanContent:
- // Extend to VT by adding rubbish bits.
- ExtendCode = ISD::ANY_EXTEND;
- break;
- case TargetLowering::ZeroOrOneBooleanContent:
- // Extend to VT by adding zero bits.
- ExtendCode = ISD::ZERO_EXTEND;
- break;
- case TargetLowering::ZeroOrNegativeOneBooleanContent: {
- // Extend to VT by copying the sign bit.
- ExtendCode = ISD::SIGN_EXTEND;
- break;
- }
- }
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(VT.isVector()));
return DAG.getNode(ExtendCode, dl, VT, Bool);
}
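
Illustrative sketch, assumed from the switch removed above: the mapping TargetLowering::getExtendForContent is expected to compute, with the boolean contents now queried per vector/scalar via getBooleanContents(VT.isVector()):

    // Hypothetical standalone equivalent; the real helper lives in
    // TargetLowering.
    static ISD::NodeType extendForContent(TargetLowering::BooleanContent BC) {
      switch (BC) {
      case TargetLowering::UndefinedBooleanContent:
        return ISD::ANY_EXTEND;  // extend with rubbish bits
      case TargetLowering::ZeroOrOneBooleanContent:
        return ISD::ZERO_EXTEND; // extend with zero bits
      default:                   // ZeroOrNegativeOneBooleanContent
        return ISD::SIGN_EXTEND; // copy the sign bit
      }
    }
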
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 952797d..abacdac 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -148,15 +148,22 @@ private:
SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
bool CustomWidenLowerNode(SDNode *N, EVT VT);
+
+ /// DisintegrateMERGE_VALUES - Replace each result of the given MERGE_VALUES
+ /// node with the corresponding input operand, except for the result 'ResNo',
+ /// which is returned.
+ SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo);
+
SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index);
SDValue JoinIntegers(SDValue Lo, SDValue Hi);
SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned);
SDValue MakeLibCall(RTLIB::Libcall LC, EVT RetVT,
const SDValue *Ops, unsigned NumOps, bool isSigned,
DebugLoc dl);
- std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
- SDNode *Node, bool isSigned);
- std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
+
+ std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
+ SDNode *Node, bool isSigned);
+ std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
SDValue PromoteTargetBoolean(SDValue Bool, EVT VT);
void ReplaceValueWith(SDValue From, SDValue To);
@@ -206,8 +213,10 @@ private:
// Integer Result Promotion.
void PromoteIntegerResult(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_AssertSext(SDNode *N);
SDValue PromoteIntRes_AssertZext(SDNode *N);
+ SDValue PromoteIntRes_Atomic0(AtomicSDNode *N);
SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
SDValue PromoteIntRes_Atomic2(AtomicSDNode *N);
SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
@@ -215,6 +224,7 @@ private:
SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N);
SDValue PromoteIntRes_BITCAST(SDNode *N);
SDValue PromoteIntRes_BSWAP(SDNode *N);
SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
@@ -232,6 +242,7 @@ private:
SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_SDIV(SDNode *N);
SDValue PromoteIntRes_SELECT(SDNode *N);
+ SDValue PromoteIntRes_VSELECT(SDNode *N);
SDValue PromoteIntRes_SELECT_CC(SDNode *N);
SDValue PromoteIntRes_SETCC(SDNode *N);
SDValue PromoteIntRes_SHL(SDNode *N);
@@ -249,6 +260,7 @@ private:
// Integer Operand Promotion.
bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N);
SDValue PromoteIntOp_BITCAST(SDNode *N);
SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo);
@@ -264,6 +276,7 @@ private:
SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_VSETCC(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_Shift(SDNode *N);
SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N);
SDValue PromoteIntOp_SINT_TO_FP(SDNode *N);
@@ -289,6 +302,8 @@ private:
// Integer Result Expansion.
void ExpandIntegerResult(SDNode *N, unsigned ResNo);
+ void ExpandIntRes_MERGE_VALUES (SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
void ExpandIntRes_ANY_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_AssertSext (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -320,6 +335,8 @@ private:
void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
+
void ExpandShiftByConstant(SDNode *N, unsigned Amt,
SDValue &Lo, SDValue &Hi);
bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -339,6 +356,7 @@ private:
SDValue ExpandIntOp_TRUNCATE(SDNode *N);
SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
SDValue ExpandIntOp_RETURNADDR(SDNode *N);
+ SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N);
void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
ISD::CondCode &CCCode, DebugLoc dl);
@@ -362,6 +380,7 @@ private:
// Result Float to Integer Conversion.
void SoftenFloatResult(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_BITCAST(SDNode *N);
SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
SDValue SoftenFloatRes_ConstantFP(ConstantFPSDNode *N);
@@ -488,6 +507,7 @@ private:
// Vector Result Scalarization: <1 x ty> -> ty.
void ScalarizeVectorResult(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
SDValue ScalarizeVecRes_BinOp(SDNode *N);
SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
SDValue ScalarizeVecRes_InregOp(SDNode *N);
@@ -559,6 +579,7 @@ private:
SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue SplitVecOp_VSETCC(SDNode *N);
SDValue SplitVecOp_FP_ROUND(SDNode *N);
//===--------------------------------------------------------------------===//
@@ -581,6 +602,7 @@ private:
// Widen Vector Result Promotion.
void WidenVectorResult(SDNode *N, unsigned ResNo);
+ SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo);
SDValue WidenVecRes_BITCAST(SDNode* N);
SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
@@ -677,7 +699,8 @@ private:
void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
// Generic Result Splitting.
- void SplitRes_MERGE_VALUES(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -699,6 +722,8 @@ private:
}
// Generic Result Expansion.
+ void ExpandRes_MERGE_VALUES (SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
void ExpandRes_BITCAST (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 85ea6b6..8e7e498 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -31,6 +31,11 @@ using namespace llvm;
// These routines assume that the Lo/Hi part is stored first in memory on
// little/big-endian machines, followed by the Hi/Lo part. This means that
// they cannot be used as is on vectors, for which Lo is always stored first.
+void DAGTypeLegalizer::ExpandRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ GetExpandedOp(Op, Lo, Hi);
+}
void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
EVT OutVT = N->getValueType(0);
@@ -426,37 +431,34 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
// bytes; for integers and floats it is Lo first if and only if the machine is
// little-endian).
-void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N,
+void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
SDValue &Lo, SDValue &Hi) {
- // A MERGE_VALUES node can produce any number of values. We know that the
- // first illegal one needs to be expanded into Lo/Hi.
- unsigned i;
-
- // The string of legal results gets turned into input operands, which have
- // the same type.
- for (i = 0; isTypeLegal(N->getValueType(i)); ++i)
- ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
-
- // The first illegal result must be the one that needs to be expanded.
- GetSplitOp(N->getOperand(i), Lo, Hi);
-
- // Legalize the rest of the results into the input operands whether they are
- // legal or not.
- unsigned e = N->getNumValues();
- for (++i; i != e; ++i)
- ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ GetSplitOp(Op, Lo, Hi);
}
void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
SDValue &Hi) {
- SDValue LL, LH, RL, RH;
+ SDValue LL, LH, RL, RH, CL, CH;
DebugLoc dl = N->getDebugLoc();
GetSplitOp(N->getOperand(1), LL, LH);
GetSplitOp(N->getOperand(2), RL, RH);
SDValue Cond = N->getOperand(0);
- Lo = DAG.getNode(ISD::SELECT, dl, LL.getValueType(), Cond, LL, RL);
- Hi = DAG.getNode(ISD::SELECT, dl, LH.getValueType(), Cond, LH, RH);
+ CL = CH = Cond;
+ if (Cond.getValueType().isVector()) {
+ assert(Cond.getValueType().getVectorElementType() == MVT::i1 &&
+ "Condition legalized before result?");
+ unsigned NumElements = Cond.getValueType().getVectorNumElements();
+ EVT VCondTy = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElements / 2);
+ CL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
+ DAG.getIntPtrConstant(0));
+ CH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
+ DAG.getIntPtrConstant(NumElements / 2));
+ }
+
+ Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
+ Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
}
void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
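
A brief note, not part of the patch: when the condition of a select is itself a vector of i1, it must be split in step with the data. For an <8 x i1> mask, the EXTRACT_SUBVECTOR calls above hand CL lanes 0..3 and CH lanes 4..7, so each half of the result is a select under its own half of the condition:

    // Lo = select CL, LL, RL   // lanes 0 .. NumElements/2 - 1
    // Hi = select CH, LH, RH   // lanes NumElements/2 .. NumElements - 1
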
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index ffff10c..f815b00 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -61,6 +61,9 @@ class VectorLegalizer {
// Implements expansion for UINT_TO_FLOAT; falls back to UnrollVectorOp if
// SINT_TO_FLOAT and SHR on vectors isn't legal.
SDValue ExpandUINT_TO_FLOAT(SDValue Op);
+ // Implement vselect in terms of XOR, AND, OR when blend is not supported
+ // by the target.
+ SDValue ExpandVSELECT(SDValue Op);
SDValue ExpandFNEG(SDValue Op);
// Implements vector promotion; this is essentially just bitcasting the
// operands to a different type and bitcasting the result back to the
@@ -157,8 +160,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::CTLZ:
case ISD::CTPOP:
case ISD::SELECT:
+ case ISD::VSELECT:
case ISD::SELECT_CC:
- case ISD::VSETCC:
+ case ISD::SETCC:
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND:
case ISD::TRUNCATE:
@@ -210,11 +214,13 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
// FALL THROUGH
}
case TargetLowering::Expand:
- if (Node->getOpcode() == ISD::UINT_TO_FP)
+ if (Node->getOpcode() == ISD::VSELECT)
+ Result = ExpandVSELECT(Op);
+ else if (Node->getOpcode() == ISD::UINT_TO_FP)
Result = ExpandUINT_TO_FLOAT(Op);
else if (Node->getOpcode() == ISD::FNEG)
Result = ExpandFNEG(Op);
- else if (Node->getOpcode() == ISD::VSETCC)
+ else if (Node->getOpcode() == ISD::SETCC)
Result = UnrollVSETCC(Op);
else
Result = DAG.UnrollVectorOp(Op.getNode());
@@ -256,9 +262,41 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
return DAG.getNode(ISD::BITCAST, dl, VT, Op);
}
-SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
+SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
+ // Implement VSELECT in terms of XOR, AND, OR
+ // on platforms which do not support blend natively.
+ EVT VT = Op.getOperand(0).getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue Mask = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+
+ // If we can't even use the basic vector operations of
+ // AND,OR,XOR, we will have to scalarize the op.
+ if (!TLI.isOperationLegalOrCustom(ISD::AND, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::XOR, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::OR, VT))
+ return DAG.UnrollVectorOp(Op.getNode());
+
+  assert(VT.getSizeInBits() == Op1.getValueType().getSizeInBits() &&
+         "Invalid mask size");
+ // Bitcast the operands to be the same type as the mask.
+ // This is needed when we select between FP types because
+ // the mask is a vector of integers.
+ Op1 = DAG.getNode(ISD::BITCAST, DL, VT, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, VT, Op2);
+
+ SDValue AllOnes = DAG.getConstant(
+ APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()), VT);
+ SDValue NotMask = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes);
+
+ Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask);
+ Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask);
+ return DAG.getNode(ISD::OR, DL, VT, Op1, Op2);
+}
+SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
EVT VT = Op.getOperand(0).getValueType();
DebugLoc DL = Op.getDebugLoc();
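
Illustrative sketch, not part of the patch: per lane, the expansion above computes (Op1 & Mask) | (Op2 & ~Mask), which behaves as a select only when every mask lane is all-ones or all-zeros — the reason ExpandVSELECT is fed a setcc-style mask. On a single lane:

    #include <cstdint>
    uint32_t laneSelect(uint32_t Mask, uint32_t A, uint32_t B) {
      return (A & Mask) | (B & ~Mask); // all-ones mask -> A, all-zeros -> B
    }
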
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index b5698f9..107a42b 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -44,8 +44,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
N->dump(&DAG);
dbgs() << "\n";
#endif
- llvm_unreachable("Do not know how to scalarize the result of this operator!");
+ report_fatal_error("Do not know how to scalarize the result of this "
+ "operator!\n");
+  case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo); break;
case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: R = N->getOperand(0); break;
case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break;
@@ -62,8 +64,6 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
- case ISD::VSETCC: R = ScalarizeVecRes_VSETCC(N); break;
-
case ISD::ANY_EXTEND:
case ISD::CTLZ:
case ISD::CTPOP:
@@ -129,6 +129,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
LHS.getValueType(), LHS, RHS);
}
+SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetScalarizedVector(Op);
+}
+
SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
EVT NewVT = N->getValueType(0).getVectorElementType();
return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
@@ -237,6 +243,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() ==
+ N->getOperand(0).getValueType().isVector() &&
+ "Scalar/Vector type mismatch");
+
+ if (N->getValueType(0).isVector()) return ScalarizeVecRes_VSETCC(N);
+
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
DebugLoc DL = N->getDebugLoc();
@@ -259,35 +271,23 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+
SDValue LHS = GetScalarizedVector(N->getOperand(0));
SDValue RHS = GetScalarizedVector(N->getOperand(1));
EVT NVT = N->getValueType(0).getVectorElementType();
- EVT SVT = TLI.getSetCCResultType(LHS.getValueType());
DebugLoc DL = N->getDebugLoc();
// Turn it into a scalar SETCC.
- SDValue Res = DAG.getNode(ISD::SETCC, DL, SVT, LHS, RHS, N->getOperand(2));
-
- // VSETCC always returns a sign-extended value, while SETCC may not. The
- // SETCC result type may not match the vector element type. Correct these.
- if (NVT.bitsLE(SVT)) {
- // The SETCC result type is bigger than the vector element type.
- // Ensure the SETCC result is sign-extended.
- if (TLI.getBooleanContents() !=
- TargetLowering::ZeroOrNegativeOneBooleanContent)
- Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SVT, Res,
- DAG.getValueType(MVT::i1));
- // Truncate to the final type.
- return DAG.getNode(ISD::TRUNCATE, DL, NVT, Res);
- }
-
- // The SETCC result type is smaller than the vector element type.
- // If the SetCC result is not sign-extended, chop it down to MVT::i1.
- if (TLI.getBooleanContents() !=
- TargetLowering::ZeroOrNegativeOneBooleanContent)
- Res = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Res);
- // Sign extend to the final type.
- return DAG.getNode(ISD::SIGN_EXTEND, DL, NVT, Res);
+ SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
+ N->getOperand(2));
+ // Vectors may have different boolean contents than scalars. Promote the
+ // value appropriately.
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(true));
+ return DAG.getNode(ExtendCode, DL, NVT, Res);
}
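
The scalarized compare produces an MVT::i1, which is then widened with whatever extension matches the target's vector boolean representation. A rough sketch of the mapping performed by getExtendForContent, written out as a standalone assumption rather than the actual LLVM declaration:

    // Assumed mapping: how an i1 boolean is restored to the element type.
    enum BooleanContent { UndefinedBooleanContent,
                          ZeroOrOneBooleanContent,
                          ZeroOrNegativeOneBooleanContent };
    enum ExtendKind { AnyExtend, ZeroExtend, SignExtend };

    static ExtendKind extendForContent(BooleanContent BC) {
      switch (BC) {
      case ZeroOrOneBooleanContent:         return ZeroExtend; // 0 or 1
      case ZeroOrNegativeOneBooleanContent: return SignExtend; // 0 or -1
      default:                              return AnyExtend;  // undefined
      }
    }
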
@@ -415,7 +415,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
#endif
llvm_unreachable("Do not know how to split the result of this operator!");
- case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, Lo, Hi); break;
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+ case ISD::VSELECT:
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
@@ -432,7 +433,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
case ISD::SETCC:
- case ISD::VSETCC:
SplitVecRes_SETCC(N, Lo, Hi);
break;
case ISD::VECTOR_SHUFFLE:
@@ -524,12 +524,11 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
// Handle some special cases efficiently.
switch (getTypeAction(InVT)) {
- default:
- assert(false && "Unknown type action!");
case TargetLowering::TypeLegal:
case TargetLowering::TypePromoteInteger:
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypeScalarizeVector:
+ case TargetLowering::TypeWidenVector:
break;
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
@@ -670,7 +669,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
- const Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
+ Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
unsigned Alignment =
TLI.getTargetData()->getPrefTypeAlignment(VecType);
Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT,
@@ -740,6 +739,10 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
}
void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+
EVT LoVT, HiVT;
DebugLoc DL = N->getDebugLoc();
GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
@@ -965,7 +968,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
dbgs() << "\n";
#endif
llvm_unreachable("Do not know how to split this operator's operand!");
-
+ case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
@@ -1163,6 +1166,26 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
&Elts[0], Elts.size());
}
+SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+ // The result has a legal vector type, but the input needs splitting.
+ SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
+ DebugLoc DL = N->getDebugLoc();
+ GetSplitVector(N->getOperand(0), Lo0, Hi0);
+ GetSplitVector(N->getOperand(1), Lo1, Hi1);
+ unsigned PartElements = Lo0.getValueType().getVectorNumElements();
+ EVT PartResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, PartElements);
+ EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 2*PartElements);
+
+ LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
+ HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
+ SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
+ return PromoteTargetBoolean(Con, N->getValueType(0));
+}
+
+
SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
@@ -1205,6 +1228,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
#endif
llvm_unreachable("Do not know how to widen the result of this operator!");
+ case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
@@ -1222,10 +1246,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VECTOR_SHUFFLE:
Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
break;
- case ISD::VSETCC:
- Res = WidenVecRes_VSETCC(N);
- break;
-
case ISD::ADD:
case ISD::AND:
case ISD::BSWAP:
@@ -1557,6 +1577,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
WidenVT, WidenLHS, DAG.getValueType(ExtVT));
}
+SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
+ SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetWidenedVector(WidenVec);
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
SDValue InOp = N->getOperand(0);
EVT InVT = InOp.getValueType();
@@ -1661,6 +1686,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
DebugLoc dl = N->getDebugLoc();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
unsigned NumOperands = N->getNumOperands();
bool InputWidened = false; // Indicates we need to widen the input.
@@ -1686,17 +1712,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
if (N->getOperand(i).getOpcode() != ISD::UNDEF)
break;
- if (i > NumOperands)
+ if (i == NumOperands)
// Everything but the first operand is an UNDEF so just return the
// widened first operand.
return GetWidenedVector(N->getOperand(0));
if (NumOperands == 2) {
// Replace concat of two operands with a shuffle.
- SmallVector<int, 16> MaskOps(WidenNumElts);
- for (unsigned i=0; i < WidenNumElts/2; ++i) {
+ SmallVector<int, 16> MaskOps(WidenNumElts, -1);
+ for (unsigned i = 0; i < NumInElts; ++i) {
MaskOps[i] = i;
- MaskOps[i+WidenNumElts/2] = i+WidenNumElts;
+ MaskOps[i + NumInElts] = i + WidenNumElts;
}
return DAG.getVectorShuffle(WidenVT, dl,
GetWidenedVector(N->getOperand(0)),
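
The two MaskOps changes above fix the shuffle mask for widened inputs: only the first NumInElts lanes of each widened operand hold real data, so the mask must index those lanes while any remaining result lanes stay undef (-1). A standalone illustration with hypothetical sizes:

    #include <cassert>
    #include <vector>

    int main() {
      // Two <2 x T> inputs widened to <4 x T>; the concat result is <4 x T>.
      unsigned NumInElts = 2, WidenNumElts = 4;
      std::vector<int> MaskOps(WidenNumElts, -1);   // -1 marks an undef lane
      for (unsigned i = 0; i < NumInElts; ++i) {
        MaskOps[i] = i;                             // low lanes of operand 0
        MaskOps[i + NumInElts] = i + WidenNumElts;  // low lanes of operand 1
      }
      // Indices >= WidenNumElts address the second shuffle operand.
      assert(MaskOps[0] == 0 && MaskOps[1] == 1);
      assert(MaskOps[2] == 4 && MaskOps[3] == 5);
      return 0;
    }
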
@@ -1708,7 +1734,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
// Fall back to use extracts and build vector.
EVT EltVT = WidenVT.getVectorElementType();
- unsigned NumInElts = InVT.getVectorNumElements();
SmallVector<SDValue, 16> Ops(WidenNumElts);
unsigned Idx = 0;
for (unsigned i=0; i < NumOperands; ++i) {
@@ -1916,6 +1941,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
}
SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() ==
+ N->getOperand(0).getValueType().isVector() &&
+ "Scalar/Vector type mismatch");
+ if (N->getValueType(0).isVector()) return WidenVecRes_VSETCC(N);
+
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
@@ -1954,6 +1984,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
}
SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operands must be vectors");
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -1970,7 +2003,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
assert(InOp1.getValueType() == WidenInVT &&
InOp2.getValueType() == WidenInVT &&
"Input not widened to expected type!");
- return DAG.getNode(ISD::VSETCC, N->getDebugLoc(),
+ (void)WidenInVT;
+ return DAG.getNode(ISD::SETCC, N->getDebugLoc(),
WidenVT, InOp1, InOp2, N->getOperand(2));
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 12b1838..e757def 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -2621,6 +2621,39 @@ bool RegReductionPQBase::canClobber(const SUnit *SU, const SUnit *Op) {
return false;
}
+/// canClobberReachingPhysRegUse - True if SU would clobber one of its
+/// successor's explicit physregs whose definition can reach DepSU.
+/// i.e. DepSU should not be scheduled above SU.
+static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU,
+ ScheduleDAGRRList *scheduleDAG,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ const unsigned *ImpDefs
+ = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs();
+ if (!ImpDefs)
+ return false;
+
+ for (SUnit::const_succ_iterator SI = SU->Succs.begin(), SE = SU->Succs.end();
+ SI != SE; ++SI) {
+ SUnit *SuccSU = SI->getSUnit();
+ for (SUnit::const_pred_iterator PI = SuccSU->Preds.begin(),
+ PE = SuccSU->Preds.end(); PI != PE; ++PI) {
+ if (!PI->isAssignedRegDep())
+ continue;
+
+ for (const unsigned *ImpDef = ImpDefs; *ImpDef; ++ImpDef) {
+ // Return true if SU clobbers this physical register use and the
+ // definition of the register reaches from DepSU. IsReachable queries a
+ // topological forward sort of the DAG (following the successors).
+ if (TRI->regsOverlap(*ImpDef, PI->getReg()) &&
+ scheduleDAG->IsReachable(DepSU, PI->getSUnit()))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
/// physical register defs.
static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
@@ -2837,7 +2870,8 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() {
SuccOpc == TargetOpcode::INSERT_SUBREG ||
SuccOpc == TargetOpcode::SUBREG_TO_REG)
continue;
- if ((!canClobber(SuccSU, DUSU) ||
+ if (!canClobberReachingPhysRegUse(SuccSU, SU, scheduleDAG, TII, TRI) &&
+ (!canClobber(SuccSU, DUSU) ||
(isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
(!SU->isCommutable && SuccSU->isCommutable)) &&
!scheduleDAG->IsReachable(SuccSU, SU)) {
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 35ea0bb..20bea8e 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -403,7 +403,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
ID.AddInteger(CP->getAlignment());
ID.AddInteger(CP->getOffset());
if (CP->isMachineConstantPoolEntry())
- CP->getMachineCPVal()->AddSelectionDAGCSEId(ID);
+ CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
else
ID.AddPointer(CP->getConstVal());
ID.AddInteger(CP->getTargetFlags());
@@ -432,7 +432,9 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
case ISD::ATOMIC_LOAD_MIN:
case ISD::ATOMIC_LOAD_MAX:
case ISD::ATOMIC_LOAD_UMIN:
- case ISD::ATOMIC_LOAD_UMAX: {
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: {
const AtomicSDNode *AT = cast<AtomicSDNode>(N);
ID.AddInteger(AT->getMemoryVT().getRawBits());
ID.AddInteger(AT->getRawSubclassData());
@@ -769,11 +771,14 @@ static void VerifyNodeCommon(SDNode *N) {
assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
"Wrong number of operands!");
EVT EltVT = N->getValueType(0).getVectorElementType();
- for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I)
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
assert((I->getValueType() == EltVT ||
(EltVT.isInteger() && I->getValueType().isInteger() &&
EltVT.bitsLE(I->getValueType()))) &&
"Wrong operand type!");
+ assert(I->getValueType() == N->getOperand(0).getValueType() &&
+ "Operands must all have the same type");
+ }
break;
}
}
@@ -821,7 +826,7 @@ static void VerifyMachineNode(SDNode *N) {
/// given type.
///
unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
- const Type *Ty = VT == MVT::iPTR ?
+ Type *Ty = VT == MVT::iPTR ?
PointerType::get(Type::getInt8Ty(*getContext()), 0) :
VT.getTypeForEVT(*getContext());
@@ -876,6 +881,12 @@ void SelectionDAG::clear() {
DbgInfo->clear();
}
+SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType()) ?
+ getNode(ISD::ANY_EXTEND, DL, VT, Op) :
+ getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
return VT.bitsGT(Op.getValueType()) ?
getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
@@ -925,13 +936,25 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
assert(VT.isInteger() && "Cannot create FP integer constant!");
EVT EltVT = VT.getScalarType();
- assert(Val.getBitWidth() == EltVT.getSizeInBits() &&
- "APInt size does not match type size!");
+ const ConstantInt *Elt = &Val;
+ // In some cases the vector type is legal but the element type is illegal and
+ // needs to be promoted, for example v8i8 on ARM. In this case, promote the
+ // inserted value (the type does not need to match the vector element type).
+ // Any extra bits introduced will be truncated away.
+ if (VT.isVector() && TLI.getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypePromoteInteger) {
+ EltVT = TLI.getTypeToTransformTo(*getContext(), EltVT);
+ APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits());
+ Elt = ConstantInt::get(*getContext(), NewVal);
+ }
+
+ assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
+ "APInt size does not match type size!");
unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(EltVT), 0, 0);
- ID.AddPointer(&Val);
+ ID.AddPointer(Elt);
void *IP = 0;
SDNode *N = NULL;
if ((N = CSEMap.FindNodeOrInsertPos(ID, IP)))
@@ -939,7 +962,7 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
return SDValue(N, 0);
if (!N) {
- N = new (NodeAllocator) ConstantSDNode(isT, &Val, EltVT);
+ N = new (NodeAllocator) ConstantSDNode(isT, Elt, EltVT);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
}
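
The promotion above allows a legal vector type to carry elements whose scalar type is illegal: the constant is zero-extended to the promoted element width, and consumers truncate the extra bits away again. A scalar round trip with assumed widths (i8 promoted to i16):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t Elt = 0xAB;                     // an i8 element, e.g. of v8i8
      uint16_t Promoted = Elt;                // zext to the promoted i16
      uint8_t Truncated = uint8_t(Promoted);  // later truncation
      assert(Truncated == Elt);               // no information is lost
      return 0;
    }
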
@@ -1131,7 +1154,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
ID.AddInteger(Alignment);
ID.AddInteger(Offset);
- C->AddSelectionDAGCSEId(ID);
+ C->addSelectionDAGCSEId(ID);
ID.AddInteger(TargetFlags);
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
@@ -1432,7 +1455,7 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
unsigned ByteSize = VT.getStoreSize();
- const Type *Ty = VT.getTypeForEVT(*getContext());
+ Type *Ty = VT.getTypeForEVT(*getContext());
unsigned StackAlign =
std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty), minAlign);
@@ -1445,8 +1468,8 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
unsigned Bytes = std::max(VT1.getStoreSizeInBits(),
VT2.getStoreSizeInBits())/8;
- const Type *Ty1 = VT1.getTypeForEVT(*getContext());
- const Type *Ty2 = VT2.getTypeForEVT(*getContext());
+ Type *Ty1 = VT1.getTypeForEVT(*getContext());
+ Type *Ty2 = VT2.getTypeForEVT(*getContext());
const TargetData *TD = TLI.getTargetData();
unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
TD->getPrefTypeAlignment(Ty2));
@@ -1718,8 +1741,8 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, const APInt &Mask,
// The boolean result conforms to getBooleanContents. Fall through.
case ISD::SETCC:
// If we know the result of a setcc has the top bits zero, use this info.
- if (TLI.getBooleanContents() == TargetLowering::ZeroOrOneBooleanContent &&
- BitWidth > 1)
+ if (TLI.getBooleanContents(Op.getValueType().isVector()) ==
+ TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1)
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
return;
case ISD::SHL:
@@ -2153,7 +2176,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
// The boolean result conforms to getBooleanContents. Fall through.
case ISD::SETCC:
// If setcc returns 0/-1, all bits are sign bits.
- if (TLI.getBooleanContents() ==
+ if (TLI.getBooleanContents(Op.getValueType().isVector()) ==
TargetLowering::ZeroOrNegativeOneBooleanContent)
return VTBits;
break;
@@ -2437,7 +2460,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
APFloat::rmTowardZero, &ignored);
if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual
break;
- APInt api(VT.getSizeInBits(), 2, x);
+ APInt api(VT.getSizeInBits(), x);
return getConstant(api, VT);
}
case ISD::BITCAST:
@@ -2777,6 +2800,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
"Vector element counts must match in FP_ROUND_INREG");
assert(EVT.bitsLE(VT) && "Not rounding down!");
+ (void)EVT;
if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
break;
}
@@ -2884,6 +2908,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
assert(!N1.getValueType().isVector() && !VT.isVector() &&
(N1.getValueType().isInteger() == VT.isInteger()) &&
+ N1.getValueType() != VT &&
"Wrong types for EXTRACT_ELEMENT!");
// EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
@@ -3425,7 +3450,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
return SDValue();
if (DstAlignCanChange) {
- const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+ Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
@@ -3514,7 +3539,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
return SDValue();
if (DstAlignCanChange) {
- const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+ Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
@@ -3589,7 +3614,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
return SDValue();
if (DstAlignCanChange) {
- const Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+ Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
@@ -3782,7 +3807,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
return Result;
// Emit a library call.
- const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*getContext());
+ Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst; Entry.Ty = IntPtrTy;
@@ -3815,7 +3840,9 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
SDValue Chain, SDValue Ptr, SDValue Cmp,
SDValue Swp, MachinePointerInfo PtrInfo,
- unsigned Alignment) {
+ unsigned Alignment,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
@@ -3823,18 +3850,23 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
unsigned Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
// For now, atomics are considered to be volatile always.
+ // FIXME: Volatile isn't really correct; we should keep track of atomic
+ // orderings in the memoperand.
Flags |= MachineMemOperand::MOVolatile;
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment);
- return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Cmp, Swp, MMO);
+ return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Cmp, Swp, MMO,
+ Ordering, SynchScope);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
SDValue Chain,
SDValue Ptr, SDValue Cmp,
- SDValue Swp, MachineMemOperand *MMO) {
+ SDValue Swp, MachineMemOperand *MMO,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
assert(Opcode == ISD::ATOMIC_CMP_SWAP && "Invalid Atomic Op");
assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
@@ -3851,7 +3883,8 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
return SDValue(E, 0);
}
SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
- Ptr, Cmp, Swp, MMO);
+ Ptr, Cmp, Swp, MMO, Ordering,
+ SynchScope);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -3861,27 +3894,39 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
SDValue Chain,
SDValue Ptr, SDValue Val,
const Value* PtrVal,
- unsigned Alignment) {
+ unsigned Alignment,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- unsigned Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ // A monotonic store does not load; a release store "loads" in the sense
+ // that other stores cannot be sunk past it.
+ // (An atomicrmw obviously both loads and stores.)
+ unsigned Flags = MachineMemOperand::MOStore;
+ if (Opcode != ISD::ATOMIC_STORE || Ordering > Monotonic)
+ Flags |= MachineMemOperand::MOLoad;
// For now, atomics are considered to be volatile always.
+ // FIXME: Volatile isn't really correct; we should keep track of atomic
+ // orderings in the memoperand.
Flags |= MachineMemOperand::MOVolatile;
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
MemVT.getStoreSize(), Alignment);
- return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO);
+ return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO,
+ Ordering, SynchScope);
}
SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
SDValue Chain,
SDValue Ptr, SDValue Val,
- MachineMemOperand *MMO) {
+ MachineMemOperand *MMO,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
Opcode == ISD::ATOMIC_LOAD_SUB ||
Opcode == ISD::ATOMIC_LOAD_AND ||
@@ -3892,12 +3937,14 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
Opcode == ISD::ATOMIC_LOAD_MAX ||
Opcode == ISD::ATOMIC_LOAD_UMIN ||
Opcode == ISD::ATOMIC_LOAD_UMAX ||
- Opcode == ISD::ATOMIC_SWAP) &&
+ Opcode == ISD::ATOMIC_SWAP ||
+ Opcode == ISD::ATOMIC_STORE) &&
"Invalid Atomic Op");
EVT VT = Val.getValueType();
- SDVTList VTs = getVTList(VT, MVT::Other);
+ SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
+ getVTList(VT, MVT::Other);
FoldingSetNodeID ID;
ID.AddInteger(MemVT.getRawBits());
SDValue Ops[] = {Chain, Ptr, Val};
@@ -3908,7 +3955,63 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
return SDValue(E, 0);
}
SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
- Ptr, Val, MMO);
+ Ptr, Val, MMO,
+ Ordering, SynchScope);
+ CSEMap.InsertNode(N, IP);
+ AllNodes.push_back(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+ EVT VT, SDValue Chain,
+ SDValue Ptr,
+ const Value* PtrVal,
+ unsigned Alignment,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getEVTAlignment(MemVT);
+
+ MachineFunction &MF = getMachineFunction();
+ // A monotonic load does not store; an acquire load "stores" in the sense
+ // that other loads cannot be hoisted past it.
+ unsigned Flags = MachineMemOperand::MOLoad;
+ if (Ordering > Monotonic)
+ Flags |= MachineMemOperand::MOStore;
+
+ // For now, atomics are considered to be volatile always.
+ // FIXME: Volatile isn't really correct; we should keep track of atomic
+ // orderings in the memoperand.
+ Flags |= MachineMemOperand::MOVolatile;
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
+ MemVT.getStoreSize(), Alignment);
+
+ return getAtomic(Opcode, dl, MemVT, VT, Chain, Ptr, MMO,
+ Ordering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+ EVT VT, SDValue Chain,
+ SDValue Ptr,
+ MachineMemOperand *MMO,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
+
+ SDVTList VTs = getVTList(VT, MVT::Other);
+ FoldingSetNodeID ID;
+ ID.AddInteger(MemVT.getRawBits());
+ SDValue Ops[] = {Chain, Ptr};
+ AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
+ void* IP = 0;
+ if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+ cast<AtomicSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
+ Ptr, MMO, Ordering, SynchScope);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -5769,6 +5872,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
#endif
case ISD::PREFETCH: return "Prefetch";
case ISD::MEMBARRIER: return "MemBarrier";
+ case ISD::ATOMIC_FENCE: return "AtomicFence";
case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap";
case ISD::ATOMIC_SWAP: return "AtomicSwap";
case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd";
@@ -5781,6 +5885,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::ATOMIC_LOAD_MAX: return "AtomicLoadMax";
case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin";
case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax";
+ case ISD::ATOMIC_LOAD: return "AtomicLoad";
+ case ISD::ATOMIC_STORE: return "AtomicStore";
case ISD::PCMARKER: return "PCMarker";
case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
case ISD::SRCVALUE: return "SrcValue";
@@ -5896,8 +6002,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FPOWI: return "fpowi";
case ISD::SETCC: return "setcc";
- case ISD::VSETCC: return "vsetcc";
case ISD::SELECT: return "select";
+ case ISD::VSELECT: return "vselect";
case ISD::SELECT_CC: return "select_cc";
case ISD::INSERT_VECTOR_ELT: return "insert_vector_elt";
case ISD::EXTRACT_VECTOR_ELT: return "extract_vector_elt";
@@ -5985,7 +6091,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::CTLZ: return "ctlz";
// Trampolines
- case ISD::TRAMPOLINE: return "trampoline";
+ case ISD::INIT_TRAMPOLINE: return "init_trampoline";
+ case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline";
case ISD::CONDCODE:
switch (cast<CondCodeSDNode>(this)->get()) {
@@ -6245,8 +6352,7 @@ void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
const SelectionDAG *G, unsigned depth,
- unsigned indent)
-{
+ unsigned indent) {
if (depth == 0)
return;
@@ -6340,6 +6446,10 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
&Operands[0], Operands.size()));
break;
+ case ISD::VSELECT:
+ Scalars.push_back(getNode(ISD::SELECT, dl, EltVT,
+ &Operands[0], Operands.size()));
+ break;
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -6427,6 +6537,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
Align = TD->getPreferredAlignment(GVar);
}
}
+ if (!Align)
+ Align = TLI.getTargetData()->getABITypeAlignment(GV->getType());
}
return MinAlign(Align, GVOffset);
}
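
With the ABI-alignment fallback added above, a global that lacks an explicit alignment still produces a bound, and the final MinAlign(Align, GVOffset) caps it by the largest power of two dividing the offset. A sketch of that computation, assuming MinAlign keeps the lowest set bit of either value:

    #include <cassert>

    static unsigned minAlignSketch(unsigned A, unsigned B) {
      unsigned C = A | B;
      return C & (~C + 1); // lowest set bit of either value
    }

    int main() {
      assert(minAlignSketch(16, 0) == 16); // no offset keeps base alignment
      assert(minAlignSketch(16, 4) == 4);  // +4 bytes caps it at 4
      assert(minAlignSketch(8, 6) == 2);   // +6 bytes leaves only 2
      return 0;
    }
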
@@ -6528,7 +6640,7 @@ unsigned GlobalAddressSDNode::getAddressSpace() const {
}
-const Type *ConstantPoolSDNode::getType() const {
+Type *ConstantPoolSDNode::getType() const {
if (isMachineConstantPoolEntry())
return Val.MachineCPVal->getType();
return Val.ConstVal->getType();
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 81b03ee..7ed46a6 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -578,7 +578,7 @@ namespace {
: ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
RegsForValue(LLVMContext &Context, const TargetLowering &tli,
- unsigned Reg, const Type *Ty) {
+ unsigned Reg, Type *Ty) {
ComputeValueVTs(tli, Ty, ValueVTs);
for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
@@ -788,6 +788,18 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
if (HasMatching)
Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
+ else if (!Regs.empty() &&
+ TargetRegisterInfo::isVirtualRegister(Regs.front())) {
+ // Put the register class of the virtual registers in the flag word. That
+ // way, later passes can recompute register class constraints for inline
+ // assembly as well as normal instructions.
+ // Don't do this for tied operands that can use the regclass information
+ // from the def.
+ const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
+ }
+
SDValue Res = DAG.getTargetConstant(Flag, MVT::i32);
Ops.push_back(Res);
@@ -805,6 +817,7 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa) {
AA = &aa;
GFI = gfi;
TD = DAG.getTarget().getTargetData();
+ LPadToCallSiteMap.clear();
}
/// clear - Clear out the current SelectionDAG and the associated
@@ -956,7 +969,7 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
}
}
-// getValue - Return an SDValue for the given Value.
+/// getValue - Return an SDValue for the given Value.
SDValue SelectionDAGBuilder::getValue(const Value *V) {
// If we already have an SDValue for this value, use it. It's important
// to do this first, so that we don't create a CopyFromReg if we already
@@ -971,7 +984,7 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) {
unsigned InReg = It->second;
RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType());
SDValue Chain = DAG.getEntryNode();
- N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain,NULL);
+ N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL);
resolveDanglingDebugInfo(V, N);
return N;
}
@@ -1069,7 +1082,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
return DAG.getBlockAddress(BA, VT);
- const VectorType *VecTy = cast<VectorType>(V->getType());
+ VectorType *VecTy = cast<VectorType>(V->getType());
unsigned NumElements = VecTy->getNumElements();
// Now that we know the number and type of the elements, get that number of
@@ -1277,15 +1290,17 @@ uint32_t SelectionDAGBuilder::getEdgeWeight(MachineBasicBlock *Src,
BranchProbabilityInfo *BPI = FuncInfo.BPI;
if (!BPI)
return 0;
- BasicBlock *SrcBB = const_cast<BasicBlock*>(Src->getBasicBlock());
- BasicBlock *DstBB = const_cast<BasicBlock*>(Dst->getBasicBlock());
+ const BasicBlock *SrcBB = Src->getBasicBlock();
+ const BasicBlock *DstBB = Dst->getBasicBlock();
return BPI->getEdgeWeight(SrcBB, DstBB);
}
-void SelectionDAGBuilder::addSuccessorWithWeight(MachineBasicBlock *Src,
- MachineBasicBlock *Dst) {
- uint32_t weight = getEdgeWeight(Src, Dst);
- Src->addSuccessor(Dst, weight);
+void SelectionDAGBuilder::
+addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst,
+ uint32_t Weight /* = 0 */) {
+ if (!Weight)
+ Weight = getEdgeWeight(Src, Dst);
+ Src->addSuccessor(Dst, Weight);
}
@@ -1558,8 +1573,8 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
}
// Update successor info
- addSuccessorWithWeight(SwitchBB, CB.TrueBB);
- addSuccessorWithWeight(SwitchBB, CB.FalseBB);
+ addSuccessorWithWeight(SwitchBB, CB.TrueBB, CB.TrueWeight);
+ addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight);
// Set NextBlock to be the MBB immediately after the current one, if any.
// This is used to avoid emitting unnecessary branches to the next block.
@@ -1677,7 +1692,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
UsePtrType = true;
else {
for (unsigned i = 0, e = B.Cases.size(); i != e; ++i)
- if ((uint64_t)((int64_t)B.Cases[i].Mask >> VT.getSizeInBits()) + 1 >= 2) {
+ if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) {
// Switch table case ranges are encoded into a series of masks.
// Just use pointer type, it's guaranteed to fit.
UsePtrType = true;
@@ -1808,6 +1823,49 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
void SelectionDAGBuilder::visitUnwind(const UnwindInst &I) {
}
+void SelectionDAGBuilder::visitResume(const ResumeInst &RI) {
+ llvm_unreachable("SelectionDAGBuilder shouldn't visit resume instructions!");
+}
+
+void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
+ assert(FuncInfo.MBB->isLandingPad() &&
+ "Call to landingpad not in landing pad!");
+
+ MachineBasicBlock *MBB = FuncInfo.MBB;
+ MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+ AddLandingPadInfo(LP, MMI, MBB);
+
+ SmallVector<EVT, 2> ValueVTs;
+ ComputeValueVTs(TLI, LP.getType(), ValueVTs);
+
+ // Insert the EXCEPTIONADDR instruction.
+ assert(FuncInfo.MBB->isLandingPad() &&
+ "Call to eh.exception not in landing pad!");
+ SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other);
+ SDValue Ops[2];
+ Ops[0] = DAG.getRoot();
+ SDValue Op1 = DAG.getNode(ISD::EXCEPTIONADDR, getCurDebugLoc(), VTs, Ops, 1);
+ SDValue Chain = Op1.getValue(1);
+
+ // Insert the EHSELECTION instruction.
+ VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other);
+ Ops[0] = Op1;
+ Ops[1] = Chain;
+ SDValue Op2 = DAG.getNode(ISD::EHSELECTION, getCurDebugLoc(), VTs, Ops, 2);
+ Chain = Op2.getValue(1);
+ Op2 = DAG.getSExtOrTrunc(Op2, getCurDebugLoc(), MVT::i32);
+
+ Ops[0] = Op1;
+ Ops[1] = Op2;
+ SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+ DAG.getVTList(&ValueVTs[0], ValueVTs.size()),
+ &Ops[0], 2);
+
+ std::pair<SDValue, SDValue> RetPair = std::make_pair(Res, Chain);
+ setValue(&LP, RetPair.first);
+ DAG.setRoot(RetPair.second);
+}
+
/// handleSmallSwitchRange - Emit a series of specific tests (suitable for
/// small case ranges).
bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
@@ -1866,8 +1924,8 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
ISD::SETEQ);
// Update successor info.
- SwitchBB->addSuccessor(Small.BB);
- SwitchBB->addSuccessor(Default);
+ addSuccessorWithWeight(SwitchBB, Small.BB);
+ addSuccessorWithWeight(SwitchBB, Default);
// Insert the true branch.
SDValue BrCond = DAG.getNode(ISD::BRCOND, DL, MVT::Other,
@@ -1923,7 +1981,11 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
CC = ISD::SETLE;
LHS = I->Low; MHS = SV; RHS = I->High;
}
- CaseBlock CB(CC, LHS, RHS, MHS, I->BB, FallThrough, CurBlock);
+
+ uint32_t ExtraWeight = I->ExtraWeight;
+ CaseBlock CB(CC, LHS, RHS, MHS, /* truebb */ I->BB, /* falsebb */ FallThrough,
+ /* me */ CurBlock,
+ /* trueweight */ ExtraWeight / 2, /* falseweight */ ExtraWeight / 2);
// If emitting the first comparison, just call visitSwitchCase to emit the
// code into the current block. Otherwise, push the CaseBlock onto the
@@ -1953,10 +2015,10 @@ static APInt ComputeRange(const APInt &First, const APInt &Last) {
}
/// handleJTSwitchCase - Emit jumptable for current switch case range
-bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR,
- CaseRecVector& WorkList,
- const Value* SV,
- MachineBasicBlock* Default,
+bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
+ CaseRecVector &WorkList,
+ const Value *SV,
+ MachineBasicBlock *Default,
MachineBasicBlock *SwitchBB) {
Case& FrontCase = *CR.Range.first;
Case& BackCase = *(CR.Range.second-1);
@@ -1965,8 +2027,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR,
const APInt &Last = cast<ConstantInt>(BackCase.High)->getValue();
APInt TSize(First.getBitWidth(), 0);
- for (CaseItr I = CR.Range.first, E = CR.Range.second;
- I!=E; ++I)
+ for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I)
TSize += I->size();
if (!areJTsAllowed(TLI) || TSize.ult(4))
@@ -2044,7 +2105,6 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec& CR,
visitJumpTableHeader(JT, JTH, SwitchBB);
JTCases.push_back(JumpTableBlock(JTH, JT));
-
return true;
}
@@ -2318,12 +2378,17 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
const SwitchInst& SI) {
size_t numCmps = 0;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
// Start with "simple" cases
for (size_t i = 1; i < SI.getNumSuccessors(); ++i) {
- MachineBasicBlock *SMBB = FuncInfo.MBBMap[SI.getSuccessor(i)];
+ BasicBlock *SuccBB = SI.getSuccessor(i);
+ MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
+
+ uint32_t ExtraWeight = BPI ? BPI->getEdgeWeight(SI.getParent(), SuccBB) : 0;
+
Cases.push_back(Case(SI.getSuccessorValue(i),
SI.getSuccessorValue(i),
- SMBB));
+ SMBB, ExtraWeight));
}
std::sort(Cases.begin(), Cases.end(), CaseCmp());
@@ -2343,6 +2408,16 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
I->High = J->High;
J = Cases.erase(J);
+
+ if (BranchProbabilityInfo *BPI = FuncInfo.BPI) {
+ uint32_t CurWeight = currentBB->getBasicBlock() ?
+ BPI->getEdgeWeight(SI.getParent(), currentBB->getBasicBlock()) : 16;
+ uint32_t NextWeight = nextBB->getBasicBlock() ?
+ BPI->getEdgeWeight(SI.getParent(), nextBB->getBasicBlock()) : 16;
+
+ BPI->setEdgeWeight(SI.getParent(), currentBB->getBasicBlock(),
+ CurWeight + NextWeight);
+ }
} else {
I = J++;
}
@@ -2379,7 +2454,7 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
// If there is only the default destination, branch to it if it is not the
// next basic block. Otherwise, just fall through.
- if (SI.getNumOperands() == 2) {
+ if (SI.getNumCases() == 1) {
// Update machine-CFG edges.
// If this is not a fall-through branch, emit the branch.
@@ -2399,12 +2474,12 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
size_t numCmps = Clusterify(Cases, SI);
DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
<< ". Total compares: " << numCmps << '\n');
- numCmps = 0;
+ (void)numCmps;
// Get the Value to be switched on and default basic blocks, which will be
// inserted into CaseBlock records, representing basic blocks in the binary
// search tree.
- const Value *SV = SI.getOperand(0);
+ const Value *SV = SI.getCondition();
// Push the initial CaseRec onto the worklist
CaseRecVector WorkList;
@@ -2458,7 +2533,7 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
void SelectionDAGBuilder::visitFSub(const User &I) {
// -0.0 - X --> fneg
- const Type *Ty = I.getType();
+ Type *Ty = I.getType();
if (isa<Constant>(I.getOperand(0)) &&
I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
SDValue Op2 = getValue(I.getOperand(1));
@@ -2562,10 +2637,12 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
SDValue Cond = getValue(I.getOperand(0));
SDValue TrueVal = getValue(I.getOperand(1));
SDValue FalseVal = getValue(I.getOperand(2));
+ ISD::NodeType OpCode = Cond.getValueType().isVector() ?
+ ISD::VSELECT : ISD::SELECT;
for (unsigned i = 0; i != NumValues; ++i)
- Values[i] = DAG.getNode(ISD::SELECT, getCurDebugLoc(),
- TrueVal.getNode()->getValueType(TrueVal.getResNo()+i),
+ Values[i] = DAG.getNode(OpCode, getCurDebugLoc(),
+ TrueVal.getNode()->getValueType(TrueVal.getResNo()+i),
Cond,
SDValue(TrueVal.getNode(),
TrueVal.getResNo() + i),
@@ -2778,7 +2855,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
// Analyze the access pattern of the vector to see if we can extract
// two subvectors and do the shuffle. The analysis is done by calculating
// the range of elements the mask access on both vectors.
- int MinRange[2] = { SrcNumElts+1, SrcNumElts+1};
+ int MinRange[2] = { static_cast<int>(SrcNumElts+1),
+ static_cast<int>(SrcNumElts+1)};
int MaxRange[2] = {-1, -1};
for (unsigned i = 0; i != MaskNumElts; ++i) {
@@ -2886,8 +2964,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
const Value *Op0 = I.getOperand(0);
const Value *Op1 = I.getOperand(1);
- const Type *AggTy = I.getType();
- const Type *ValTy = Op1->getType();
+ Type *AggTy = I.getType();
+ Type *ValTy = Op1->getType();
bool IntoUndef = isa<UndefValue>(Op0);
bool FromUndef = isa<UndefValue>(Op1);
@@ -2927,8 +3005,8 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
const Value *Op0 = I.getOperand(0);
- const Type *AggTy = Op0->getType();
- const Type *ValTy = I.getType();
+ Type *AggTy = Op0->getType();
+ Type *ValTy = I.getType();
bool OutOfUndef = isa<UndefValue>(Op0);
unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
@@ -2961,12 +3039,12 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
SDValue N = getValue(I.getOperand(0));
- const Type *Ty = I.getOperand(0)->getType();
+ Type *Ty = I.getOperand(0)->getType();
for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
OI != E; ++OI) {
const Value *Idx = *OI;
- if (const StructType *StTy = dyn_cast<StructType>(Ty)) {
+ if (StructType *StTy = dyn_cast<StructType>(Ty)) {
unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
if (Field) {
// N = N + Offset
@@ -3037,7 +3115,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
if (FuncInfo.StaticAllocaMap.count(&I))
return; // getValue will auto-populate this.
- const Type *Ty = I.getAllocatedType();
+ Type *Ty = I.getAllocatedType();
uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
unsigned Align =
std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
@@ -3084,10 +3162,13 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
}
void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
+ if (I.isAtomic())
+ return visitAtomicLoad(I);
+
const Value *SV = I.getOperand(0);
SDValue Ptr = getValue(SV);
- const Type *Ty = I.getType();
+ Type *Ty = I.getType();
bool isVolatile = I.isVolatile();
bool isNonTemporal = I.getMetadata("nontemporal") != 0;
@@ -3161,6 +3242,9 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
}
void SelectionDAGBuilder::visitStore(const StoreInst &I) {
+ if (I.isAtomic())
+ return visitAtomicStore(I);
+
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);
@@ -3211,6 +3295,179 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
DAG.setRoot(StoreNode);
}
+static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order,
+ SynchronizationScope Scope,
+ bool Before, DebugLoc dl,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // Fence, if necessary
+ if (Before) {
+ if (Order == AcquireRelease || Order == SequentiallyConsistent)
+ Order = Release;
+ else if (Order == Acquire || Order == Monotonic)
+ return Chain;
+ } else {
+ if (Order == AcquireRelease)
+ Order = Acquire;
+ else if (Order == Release || Order == Monotonic)
+ return Chain;
+ }
+ SDValue Ops[3];
+ Ops[0] = Chain;
+ Ops[1] = DAG.getConstant(Order, TLI.getPointerTy());
+ Ops[2] = DAG.getConstant(Scope, TLI.getPointerTy());
+ return DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops, 3);
+}
+
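+
On targets that set getInsertFencesForAtomic(), the callers below demote the operation itself to Monotonic and rely on InsertFenceForAtomic to bracket it. For a sequentially consistent store, the result corresponds roughly to this C++-level pattern (an illustration of the fence placement, not the exact target lowering):

    #include <atomic>

    std::atomic<int> X;

    void seqCstStoreViaFences(int V) {
      std::atomic_thread_fence(std::memory_order_release); // "before" fence
      X.store(V, std::memory_order_relaxed);               // monotonic store
      std::atomic_thread_fence(std::memory_order_seq_cst); // "after" fence
    }
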
+void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
+ DebugLoc dl = getCurDebugLoc();
+ AtomicOrdering Order = I.getOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ if (TLI.getInsertFencesForAtomic())
+ InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
+ DAG, TLI);
+
+ SDValue L =
+ DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
+ getValue(I.getCompareOperand()).getValueType().getSimpleVT(),
+ InChain,
+ getValue(I.getPointerOperand()),
+ getValue(I.getCompareOperand()),
+ getValue(I.getNewValOperand()),
+ MachinePointerInfo(I.getPointerOperand()), 0 /* Alignment */,
+ TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+ Scope);
+
+ SDValue OutChain = L.getValue(1);
+
+ if (TLI.getInsertFencesForAtomic())
+ OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+ DAG, TLI);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
+ DebugLoc dl = getCurDebugLoc();
+ ISD::NodeType NT;
+ switch (I.getOperation()) {
+ default: llvm_unreachable("Unknown atomicrmw operation"); return;
+ case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break;
+ case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break;
+ case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break;
+ case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break;
+ case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break;
+ case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break;
+ case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break;
+ case AtomicRMWInst::Max: NT = ISD::ATOMIC_LOAD_MAX; break;
+ case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break;
+ case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break;
+ case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
+ }
+ AtomicOrdering Order = I.getOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ if (TLI.getInsertFencesForAtomic())
+ InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
+ DAG, TLI);
+
+ SDValue L =
+ DAG.getAtomic(NT, dl,
+ getValue(I.getValOperand()).getValueType().getSimpleVT(),
+ InChain,
+ getValue(I.getPointerOperand()),
+ getValue(I.getValOperand()),
+ I.getPointerOperand(), 0 /* Alignment */,
+ TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+ Scope);
+
+ SDValue OutChain = L.getValue(1);
+
+ if (TLI.getInsertFencesForAtomic())
+ OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+ DAG, TLI);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitFence(const FenceInst &I) {
+ DebugLoc dl = getCurDebugLoc();
+ SDValue Ops[3];
+ Ops[0] = getRoot();
+ Ops[1] = DAG.getConstant(I.getOrdering(), TLI.getPointerTy());
+ Ops[2] = DAG.getConstant(I.getSynchScope(), TLI.getPointerTy());
+ DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops, 3));
+}
+
+void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
+ DebugLoc dl = getCurDebugLoc();
+ AtomicOrdering Order = I.getOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ EVT VT = EVT::getEVT(I.getType());
+
+ if (I.getAlignment() * 8 < VT.getSizeInBits())
+ report_fatal_error("Cannot generate unaligned atomic load");
+
+ SDValue L =
+ DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
+ getValue(I.getPointerOperand()),
+ I.getPointerOperand(), I.getAlignment(),
+ TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+ Scope);
+
+ SDValue OutChain = L.getValue(1);
+
+ if (TLI.getInsertFencesForAtomic())
+ OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+ DAG, TLI);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
+ DebugLoc dl = getCurDebugLoc();
+
+ AtomicOrdering Order = I.getOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ EVT VT = EVT::getEVT(I.getValueOperand()->getType());
+
+ if (I.getAlignment() * 8 < VT.getSizeInBits())
+ report_fatal_error("Cannot generate unaligned atomic store");
+
+ if (TLI.getInsertFencesForAtomic())
+ InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
+ DAG, TLI);
+
+ SDValue OutChain =
+ DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT,
+ InChain,
+ getValue(I.getPointerOperand()),
+ getValue(I.getValueOperand()),
+ I.getPointerOperand(), I.getAlignment(),
+ TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+ Scope);
+
+ if (TLI.getInsertFencesForAtomic())
+ OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
+ DAG, TLI);
+
+ DAG.setRoot(OutChain);
+}
+
/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
/// node.
void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
@@ -3290,7 +3547,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
if (!I.getType()->isVoidTy()) {
- if (const VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
+ if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
EVT VT = TLI.getValueType(PTy);
Result = DAG.getNode(ISD::BITCAST, getCurDebugLoc(), VT, Result);
}
@@ -3337,25 +3594,6 @@ getF32Constant(SelectionDAG &DAG, unsigned Flt) {
return DAG.getConstantFP(APFloat(APInt(32, Flt)), MVT::f32);
}
-/// Inlined utility function to implement binary input atomic intrinsics for
-/// visitIntrinsicCall: I is a call instruction
-/// Op is the associated NodeType for I
-const char *
-SelectionDAGBuilder::implVisitBinaryAtomic(const CallInst& I,
- ISD::NodeType Op) {
- SDValue Root = getRoot();
- SDValue L =
- DAG.getAtomic(Op, getCurDebugLoc(),
- getValue(I.getArgOperand(1)).getValueType().getSimpleVT(),
- Root,
- getValue(I.getArgOperand(0)),
- getValue(I.getArgOperand(1)),
- I.getArgOperand(0));
- setValue(&I, L);
- DAG.setRoot(L.getValue(1));
- return 0;
-}
-
// implVisitAluOverflow - Lower arithmetic overflow intrinsics.
const char *
SelectionDAGBuilder::implVisitAluOverflow(const CallInst &I, ISD::NodeType Op) {
@@ -4154,17 +4392,12 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
return false;
unsigned Reg = 0;
- if (Arg->hasByValAttr()) {
- // Byval arguments' frame index is recorded during argument lowering.
- // Use this info directly.
- Reg = TRI->getFrameRegister(MF);
- Offset = FuncInfo.getByValArgumentFrameIndex(Arg);
- // If byval argument ofset is not recorded then ignore this.
- if (!Offset)
- Reg = 0;
- }
+ // Some arguments' frame index is recorded during argument lowering.
+ Offset = FuncInfo.getArgumentFrameIndex(Arg);
+ if (Offset)
+ Reg = TRI->getFrameRegister(MF);
- if (N.getNode()) {
+ if (!Reg && N.getNode()) {
if (N.getOpcode() == ISD::CopyFromReg)
Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg();
else
@@ -4295,7 +4528,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
MDNode *Variable = DI.getVariable();
const Value *Address = DI.getAddress();
- if (!Address || !DIVariable(DI.getVariable()).Verify())
+ if (!Address || !DIVariable(Variable).Verify())
return 0;
// Build an entry in DbgOrdering. Debug info input nodes get an SDNodeOrder
@@ -4385,7 +4618,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
// debug info exists.
++SDNodeOrder;
SDDbgValue *SDV;
- if (isa<ConstantInt>(V) || isa<ConstantFP>(V)) {
+ if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V)) {
SDV = DAG.getDbgValue(Variable, V, Offset, dl, SDNodeOrder);
DAG.AddDbgValue(SDV, 0, false);
} else {
@@ -4514,9 +4747,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
MMI.setCurrentCallSite(CI->getZExtValue());
return 0;
}
+ case Intrinsic::eh_sjlj_functioncontext: {
+ // Get and store the index of the function context.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ AllocaInst *FnCtx =
+ cast<AllocaInst>(I.getArgOperand(0)->stripPointerCasts());
+ int FI = FuncInfo.StaticAllocaMap[FnCtx];
+ MFI->setFunctionContextIndex(FI);
+ return 0;
+ }
case Intrinsic::eh_sjlj_setjmp: {
- setValue(&I, DAG.getNode(ISD::EH_SJLJ_SETJMP, dl, MVT::i32, getRoot(),
- getValue(I.getArgOperand(0))));
+ SDValue Ops[2];
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getArgOperand(0));
+ SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, dl,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ Ops, 2);
+ setValue(&I, Op.getValue(0));
+ DAG.setRoot(Op.getValue(1));
return 0;
}
case Intrinsic::eh_sjlj_longjmp: {
@@ -4778,12 +5026,15 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
Ops[5] = DAG.getSrcValue(F);
- Res = DAG.getNode(ISD::TRAMPOLINE, dl,
- DAG.getVTList(TLI.getPointerTy(), MVT::Other),
- Ops, 6);
+ Res = DAG.getNode(ISD::INIT_TRAMPOLINE, dl, MVT::Other, Ops, 6);
- setValue(&I, Res);
- DAG.setRoot(Res.getValue(1));
+ DAG.setRoot(Res);
+ return 0;
+ }
+ case Intrinsic::adjust_trampoline: {
+ setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, dl,
+ TLI.getPointerTy(),
+ getValue(I.getArgOperand(0))));
return 0;
}
case Intrinsic::gcroot:
@@ -4857,51 +5108,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
rw==1)); /* write */
return 0;
}
- case Intrinsic::memory_barrier: {
- SDValue Ops[6];
- Ops[0] = getRoot();
- for (int x = 1; x < 6; ++x)
- Ops[x] = getValue(I.getArgOperand(x - 1));
-
- DAG.setRoot(DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, &Ops[0], 6));
- return 0;
- }
- case Intrinsic::atomic_cmp_swap: {
- SDValue Root = getRoot();
- SDValue L =
- DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, getCurDebugLoc(),
- getValue(I.getArgOperand(1)).getValueType().getSimpleVT(),
- Root,
- getValue(I.getArgOperand(0)),
- getValue(I.getArgOperand(1)),
- getValue(I.getArgOperand(2)),
- MachinePointerInfo(I.getArgOperand(0)));
- setValue(&I, L);
- DAG.setRoot(L.getValue(1));
- return 0;
- }
- case Intrinsic::atomic_load_add:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_ADD);
- case Intrinsic::atomic_load_sub:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_SUB);
- case Intrinsic::atomic_load_or:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_OR);
- case Intrinsic::atomic_load_xor:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_XOR);
- case Intrinsic::atomic_load_and:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_AND);
- case Intrinsic::atomic_load_nand:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_NAND);
- case Intrinsic::atomic_load_max:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MAX);
- case Intrinsic::atomic_load_min:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_MIN);
- case Intrinsic::atomic_load_umin:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMIN);
- case Intrinsic::atomic_load_umax:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_LOAD_UMAX);
- case Intrinsic::atomic_swap:
- return implVisitBinaryAtomic(I, ISD::ATOMIC_SWAP);
case Intrinsic::invariant_start:
case Intrinsic::lifetime_start:
@@ -4918,9 +5124,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
bool isTailCall,
MachineBasicBlock *LandingPad) {
- const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
- const Type *RetTy = FTy->getReturnType();
+ PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ Type *RetTy = FTy->getReturnType();
MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
MCSymbol *BeginLabel = 0;
@@ -4949,7 +5155,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
FTy->getReturnType());
MachineFunction &MF = DAG.getMachineFunction();
DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
- const Type *StackSlotPtrType = PointerType::getUnqual(FTy->getReturnType());
+ Type *StackSlotPtrType = PointerType::getUnqual(FTy->getReturnType());
DemoteStackSlot = DAG.getFrameIndex(DemoteStackIdx, TLI.getPointerTy());
Entry.Node = DemoteStackSlot;
@@ -4997,6 +5203,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
unsigned CallSiteIndex = MMI.getCurrentCallSite();
if (CallSiteIndex) {
MMI.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
+ LPadToCallSiteMap[LandingPad].push_back(CallSiteIndex);
+
// Now that the call site is handled, stop tracking it.
MMI.setCurrentCallSite(0);
}
@@ -5037,7 +5245,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// The instruction result is the result of loading from the
// hidden sret parameter.
SmallVector<EVT, 1> PVTs;
- const Type *PtrRetTy = PointerType::getUnqual(FTy->getReturnType());
+ Type *PtrRetTy = PointerType::getUnqual(FTy->getReturnType());
ComputeValueVTs(TLI, PtrRetTy, PVTs);
assert(PVTs.size() == 1 && "Pointers should fit in one register");
@@ -5130,7 +5338,7 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
}
static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
- const Type *LoadTy,
+ Type *LoadTy,
SelectionDAGBuilder &Builder) {
// Check to see if this load can be trivially constant folded, e.g. if the
@@ -5193,7 +5401,7 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
bool ActuallyDoIt = true;
MVT LoadVT;
- const Type *LoadTy;
+ Type *LoadTy;
switch (Size->getZExtValue()) {
default:
LoadVT = MVT::Other;
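
For reference, the pattern visitMemCmpCall targets looks like this sketch: a
memcmp whose result is only tested against zero and whose size is a small
constant, so the libcall can be replaced by wide loads and a single setcc:

  #include <cstring>

  // Sketch: with size 4 and a zero-equality use, this becomes two i32 loads
  // plus one comparison instead of a call to memcmp.
  bool eq4(const char *A, const char *B) {
    return std::memcmp(A, B, 4) == 0;
  }
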
@@ -5261,14 +5469,14 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
// See if any floating point values are being passed to this function. This is
// used to emit an undefined reference to fltused on Windows.
- const FunctionType *FT =
+ FunctionType *FT =
cast<FunctionType>(I.getCalledValue()->getType()->getContainedType(0));
MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
if (FT->isVarArg() &&
!MMI.callsExternalVAFunctionWithFloatingPointArguments()) {
for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
- const Type* T = I.getArgOperand(i)->getType();
- for (po_iterator<const Type*> i = po_begin(T), e = po_end(T);
+ Type* T = I.getArgOperand(i)->getType();
+ for (po_iterator<Type*> i = po_begin(T), e = po_end(T);
i != e; ++i) {
if (!i->isFloatingPointTy()) continue;
MMI.setCallsExternalVAFunctionWithFloatingPointArguments(true);
@@ -5412,20 +5620,20 @@ public:
if (isa<BasicBlock>(CallOperandVal))
return TLI.getPointerTy();
- const llvm::Type *OpTy = CallOperandVal->getType();
+ llvm::Type *OpTy = CallOperandVal->getType();
// FIXME: code duplicated from TargetLowering::ParseConstraints().
// If this is an indirect operand, the operand is a pointer to the
// accessed type.
if (isIndirect) {
- const llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+ llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
if (!PtrTy)
report_fatal_error("Indirect operand for inline asm not a pointer!");
OpTy = PtrTy->getElementType();
}
// Look for vector wrapped in a struct. e.g. { <16 x i8> }.
- if (const StructType *STy = dyn_cast<StructType>(OpTy))
+ if (StructType *STy = dyn_cast<StructType>(OpTy))
if (STy->getNumElements() == 1)
OpTy = STy->getElementType(0);
@@ -5637,9 +5845,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// The return value of the call is this value. As such, there is no
// corresponding argument.
- assert(!CS.getType()->isVoidTy() &&
- "Bad inline asm!");
- if (const StructType *STy = dyn_cast<StructType>(CS.getType())) {
+ assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
OpVT = TLI.getValueType(STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
@@ -5707,9 +5914,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
if (OpInfo.ConstraintVT != Input.ConstraintVT) {
std::pair<unsigned, const TargetRegisterClass*> MatchRC =
- TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT);
+ TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
std::pair<unsigned, const TargetRegisterClass*> InputRC =
- TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT);
+ TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
+ Input.ConstraintVT);
if ((OpInfo.ConstraintVT.isInteger() !=
Input.ConstraintVT.isInteger()) ||
(MatchRC.second != InputRC.second)) {
@@ -5750,7 +5959,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
} else {
// Otherwise, create a stack slot and emit a store to it before the
// asm.
- const Type *Ty = OpVal->getType();
+ Type *Ty = OpVal->getType();
uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
unsigned Align = TLI.getTargetData()->getPrefTypeAlignment(Ty);
MachineFunction &MF = DAG.getMachineFunction();
@@ -6111,7 +6320,7 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
/// FIXME: When all targets are
/// migrated to using LowerCall, this hook should be integrated into SDISel.
std::pair<SDValue, SDValue>
-TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
+TargetLowering::LowerCallTo(SDValue Chain, Type *RetTy,
bool RetSExt, bool RetZExt, bool isVarArg,
bool isInreg, unsigned NumFixedArgs,
CallingConv::ID CallConv, bool isTailCall,
@@ -6128,7 +6337,7 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
EVT VT = ValueVTs[Value];
- const Type *ArgTy = VT.getTypeForEVT(RetTy->getContext());
+ Type *ArgTy = VT.getTypeForEVT(RetTy->getContext());
SDValue Op = SDValue(Args[i].Node.getNode(),
Args[i].Node.getResNo() + Value);
ISD::ArgFlagsTy Flags;
@@ -6145,8 +6354,8 @@ TargetLowering::LowerCallTo(SDValue Chain, const Type *RetTy,
Flags.setSRet();
if (Args[i].isByVal) {
Flags.setByVal();
- const PointerType *Ty = cast<PointerType>(Args[i].Ty);
- const Type *ElementTy = Ty->getElementType();
+ PointerType *Ty = cast<PointerType>(Args[i].Ty);
+ Type *ElementTy = Ty->getElementType();
Flags.setByValSize(getTargetData()->getTypeAllocSize(ElementTy));
// For ByVal, the alignment should come from the front end; the back end
// will guess if it is missing, but there are cases it cannot get right.
@@ -6356,7 +6565,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
for (unsigned Value = 0, NumValues = ValueVTs.size();
Value != NumValues; ++Value) {
EVT VT = ValueVTs[Value];
- const Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
ISD::ArgFlagsTy Flags;
unsigned OriginalAlignment =
TD->getABITypeAlignment(ArgTy);
@@ -6371,8 +6580,8 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
Flags.setSRet();
if (F.paramHasAttr(Idx, Attribute::ByVal)) {
Flags.setByVal();
- const PointerType *Ty = cast<PointerType>(I->getType());
- const Type *ElementTy = Ty->getElementType();
+ PointerType *Ty = cast<PointerType>(I->getType());
+ Type *ElementTy = Ty->getElementType();
Flags.setByValSize(TD->getTypeAllocSize(ElementTy));
// For ByVal, the alignment should be passed from the front end; the back
// end will guess if it is missing, but there are cases it cannot get right.
@@ -6487,15 +6696,22 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
if (ArgValues.empty())
continue;
- // Note down frame index for byval arguments.
- if (I->hasByValAttr())
- if (FrameIndexSDNode *FI =
- dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
- FuncInfo->setByValArgumentFrameIndex(I, FI->getIndex());
+ // Note down frame index.
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
+ FuncInfo->setArgumentFrameIndex(I, FI->getIndex());
SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues,
SDB->getCurDebugLoc());
+
SDB->setValue(I, Res);
+ if (!EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
+ if (LoadSDNode *LNode =
+ dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
+ FuncInfo->setArgumentFrameIndex(I, FI->getIndex());
+ }
// If this argument is live outside of the entry block, insert a copy from
// wherever we got it to the vreg that other BB's will reference it as.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index a0884eb..0a21ca3 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -132,10 +132,13 @@ private:
Constant* Low;
Constant* High;
MachineBasicBlock* BB;
+ uint32_t ExtraWeight;
+
+ Case() : Low(0), High(0), BB(0), ExtraWeight(0) { }
+ Case(Constant* low, Constant* high, MachineBasicBlock* bb,
+ uint32_t extraweight) : Low(low), High(high), BB(bb),
+ ExtraWeight(extraweight) { }
- Case() : Low(0), High(0), BB(0) { }
- Case(Constant* low, Constant* high, MachineBasicBlock* bb) :
- Low(low), High(high), BB(bb) { }
APInt size() const {
const APInt &rHigh = cast<ConstantInt>(High)->getValue();
const APInt &rLow = cast<ConstantInt>(Low)->getValue();
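
The ExtraWeight field threaded through Case here originates as IR profile
metadata, which BranchProbabilityInfo converts into the edge weights that
getEdgeWeight/addSuccessorWithWeight consume below. A hedged sketch of
producing such metadata for a conditional branch Br (the 64/4 numbers are
invented):

  // Sketch: tag a conditional branch with "branch_weights" profile data.
  LLVMContext &Ctx = Br->getContext();
  Value *Weights[] = {
    MDString::get(Ctx, "branch_weights"),
    ConstantInt::get(Type::getInt32Ty(Ctx), 64),   // taken edge (made up)
    ConstantInt::get(Type::getInt32Ty(Ctx), 4)     // fallthrough (made up)
  };
  Br->setMetadata("prof", MDNode::get(Ctx, Weights));
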
@@ -203,20 +206,30 @@ private:
CaseBlock(ISD::CondCode cc, const Value *cmplhs, const Value *cmprhs,
const Value *cmpmiddle,
MachineBasicBlock *truebb, MachineBasicBlock *falsebb,
- MachineBasicBlock *me)
+ MachineBasicBlock *me,
+ uint32_t trueweight = 0, uint32_t falseweight = 0)
: CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs),
- TrueBB(truebb), FalseBB(falsebb), ThisBB(me) {}
+ TrueBB(truebb), FalseBB(falsebb), ThisBB(me),
+ TrueWeight(trueweight), FalseWeight(falseweight) { }
+
// CC - the condition code to use for the case block's setcc node
ISD::CondCode CC;
+
// CmpLHS/CmpRHS/CmpMHS - The LHS/MHS/RHS of the comparison to emit.
// Emit by default LHS op RHS. MHS is used for range comparisons:
// If MHS is not null: (LHS <= MHS) and (MHS <= RHS).
const Value *CmpLHS, *CmpMHS, *CmpRHS;
+
// TrueBB/FalseBB - the block to branch to if the setcc is true/false.
MachineBasicBlock *TrueBB, *FalseBB;
+
// ThisBB - the block into which to emit the code for the setcc and branches
MachineBasicBlock *ThisBB;
+
+ // TrueWeight/FalseWeight - branch weights.
+ uint32_t TrueWeight, FalseWeight;
};
+
struct JumpTable {
JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
@@ -307,6 +320,9 @@ public:
/// GFI - Garbage collection metadata for the function.
GCFunctionInfo *GFI;
+ /// LPadToCallSiteMap - Map a landing pad to the call site indexes.
+ DenseMap<MachineBasicBlock*, SmallVector<unsigned, 4> > LPadToCallSiteMap;
+
/// HasTailCall - This is set to true if a call in the current
/// block has been translated as a tail call. In this case,
/// no subsequent DAG nodes should be created.
@@ -436,7 +452,8 @@ private:
MachineBasicBlock *SwitchBB);
uint32_t getEdgeWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst);
- void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst);
+ void addSuccessorWithWeight(MachineBasicBlock *Src, MachineBasicBlock *Dst,
+ uint32_t Weight = 0);
public:
void visitSwitchCase(CaseBlock &CB,
MachineBasicBlock *SwitchBB);
@@ -453,6 +470,7 @@ public:
private:
// These all get lowered before this pass.
void visitInvoke(const InvokeInst &I);
+ void visitResume(const ResumeInst &I);
void visitUnwind(const UnwindInst &I);
void visitBinary(const User &I, unsigned OpCode);
@@ -497,6 +515,7 @@ private:
void visitExtractValue(const ExtractValueInst &I);
void visitInsertValue(const InsertValueInst &I);
+ void visitLandingPad(const LandingPadInst &I);
void visitGetElementPtr(const User &I);
void visitSelect(const User &I);
@@ -504,10 +523,15 @@ private:
void visitAlloca(const AllocaInst &I);
void visitLoad(const LoadInst &I);
void visitStore(const StoreInst &I);
+ void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
+ void visitAtomicRMW(const AtomicRMWInst &I);
+ void visitFence(const FenceInst &I);
void visitPHI(const PHINode &I);
void visitCall(const CallInst &I);
bool visitMemCmpCall(const CallInst &I);
-
+ void visitAtomicLoad(const LoadInst &I);
+ void visitAtomicStore(const StoreInst &I);
+
void visitInlineAsm(ImmutableCallSite CS);
const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
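
The atomic visitors declared above receive ordinary loads and stores that
carry an ordering, which is how 3.0 expresses atomic memory operations. A
sketch of building them (Ptr, Val and InsertPt are assumed names):

  // Sketch: atomic load/store as they reach visitAtomicLoad/visitAtomicStore;
  // atomic memory operations need an explicit alignment to pass the verifier.
  LoadInst *LI = new LoadInst(Ptr, "val", /*isVolatile=*/false, InsertPt);
  LI->setAlignment(4);
  LI->setAtomic(Acquire);
  StoreInst *SI = new StoreInst(Val, Ptr, /*isVolatile=*/false, InsertPt);
  SI->setAlignment(4);
  SI->setAtomic(Release);
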
@@ -531,7 +555,6 @@ private:
llvm_unreachable("UserOp2 should not exist at instruction selection time!");
}
- const char *implVisitBinaryAtomic(const CallInst& I, ISD::NodeType Op);
const char *implVisitAluOverflow(const CallInst &I, ISD::NodeType Op);
void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 87bb296..68b9146 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -177,6 +177,13 @@ TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return 0;
}
+void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+ SDNode *Node) const {
+ assert(!MI->getDesc().hasPostISelHook() &&
+ "If a target marks an instruction with 'hasPostISelHook', "
+ "it must implement TargetLowering::AdjustInstrPostInstrSelection!");
+}
+
//===----------------------------------------------------------------------===//
// SelectionDAGISel code
//===----------------------------------------------------------------------===//
@@ -463,6 +470,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
GroupName = "Instruction Selection and Scheduling";
std::string BlockName;
int BlockNumber = -1;
+ (void)BlockNumber;
#ifdef NDEBUG
if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
@@ -677,21 +685,26 @@ void SelectionDAGISel::DoInstructionSelection() {
/// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
/// do other setup for EH landing-pad blocks.
void SelectionDAGISel::PrepareEHLandingPad() {
+ MachineBasicBlock *MBB = FuncInfo->MBB;
+
// Add a label to mark the beginning of the landing pad. Deletion of the
// landing pad can thus be detected via the MachineModuleInfo.
- MCSymbol *Label = MF->getMMI().addLandingPad(FuncInfo->MBB);
+ MCSymbol *Label = MF->getMMI().addLandingPad(MBB);
+ // Assign the call site to the landing pad's begin label.
+ MF->getMMI().setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+
const MCInstrDesc &II = TM.getInstrInfo()->get(TargetOpcode::EH_LABEL);
- BuildMI(*FuncInfo->MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
+ BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
.addSym(Label);
// Mark exception register as live in.
unsigned Reg = TLI.getExceptionAddressRegister();
- if (Reg) FuncInfo->MBB->addLiveIn(Reg);
+ if (Reg) MBB->addLiveIn(Reg);
// Mark exception selector register as live in.
Reg = TLI.getExceptionSelectorRegister();
- if (Reg) FuncInfo->MBB->addLiveIn(Reg);
+ if (Reg) MBB->addLiveIn(Reg);
// FIXME: Hack around an exception handling flaw (PR1508): the personality
// function and list of typeids logically belong to the invoke (or, if you
@@ -704,7 +717,7 @@ void SelectionDAGISel::PrepareEHLandingPad() {
// in exceptions not being caught because no typeids are associated with
// the invoke. This may not be the only way things can go wrong, but it
// is the only way we try to work around for the moment.
- const BasicBlock *LLVMBB = FuncInfo->MBB->getBasicBlock();
+ const BasicBlock *LLVMBB = MBB->getBasicBlock();
const BranchInst *Br = dyn_cast<BranchInst>(LLVMBB->getTerminator());
if (Br && Br->isUnconditional()) { // Critical edge?
@@ -719,8 +732,6 @@ void SelectionDAGISel::PrepareEHLandingPad() {
}
}
-
-
/// TryToFoldFastISelLoad - We're checking to see if we can fold the specified
/// load into the specified FoldInst. Note that we could have a sequence where
/// multiple LLVM IR instructions are folded into the same machineinstr. For
@@ -741,7 +752,7 @@ bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI,
// isn't one of the folded instructions, then we can't succeed here. Handle
// this by scanning the single-use users of the load until we get to FoldInst.
unsigned MaxUsers = 6; // Don't scan down huge single-use chains of instrs.
-
+
const Instruction *TheUser = LI->use_back();
while (TheUser != FoldInst && // Scan up until we find FoldInst.
// Stay in the right block.
@@ -750,10 +761,15 @@ bool SelectionDAGISel::TryToFoldFastISelLoad(const LoadInst *LI,
// If there are multiple or no uses of this instruction, then bail out.
if (!TheUser->hasOneUse())
return false;
-
+
TheUser = TheUser->use_back();
}
-
+
+ // If we didn't find the fold instruction, then we failed to collapse the
+ // sequence.
+ if (TheUser != FoldInst)
+ return false;
+
// Don't try to fold volatile loads. Target has to deal with alignment
// constraints.
if (LI->isVolatile()) return false;
@@ -802,6 +818,7 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,
return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded.
!isa<TerminatorInst>(I) && // Terminators aren't folded.
!isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded.
+ !isa<LandingPadInst>(I) && // Landingpad instructions aren't folded.
!FuncInfo->isExportedInst(I); // Exported instrs must be computed.
}
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 2626ac3..907d8d9 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -317,7 +317,7 @@ static void InitLibcallNames(const char **Names) {
Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
- Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and-xor_4";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
@@ -609,6 +609,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
ExceptionPointerRegister = 0;
ExceptionSelectorRegister = 0;
BooleanContents = UndefinedBooleanContent;
+ BooleanVectorContents = UndefinedBooleanContent;
SchedPreferenceInfo = Sched::Latency;
JumpBufSize = 0;
JumpBufAlignment = 0;
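
BooleanVectorContents lets a target describe how vector setcc results are
encoded separately from scalar booleans; getBooleanContents correspondingly
grows an is-vector flag, visible in the SimplifySetCC change further down. A
target constructor might set both along these lines (a sketch; the enum
choices are illustrative, and setBooleanVectorContents is assumed to be the
matching setter):

  // Sketch of a target describing its boolean encodings (values invented).
  setBooleanContents(ZeroOrOneBooleanContent);               // scalar results
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // vector masks
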
@@ -617,6 +618,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
PrefLoopAlignment = 0;
MinStackArgumentAlignment = 1;
ShouldFoldAtomicFences = false;
+ InsertFencesForAtomic = false;
InitLibcallNames(LibcallRoutineNames);
InitCmpLibcallCCs(CmpLibcallCCs);
@@ -914,7 +916,8 @@ const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
}
-MVT::SimpleValueType TargetLowering::getSetCCResultType(EVT VT) const {
+EVT TargetLowering::getSetCCResultType(EVT VT) const {
+ assert(!VT.isVector() && "No default SetCC type for vectors!");
return PointerTy.SimpleTy;
}
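
With getSetCCResultType returning a full EVT and asserting on vectors, a
target that supports vector compares must now answer for them itself. A
hypothetical override (a sketch; the vector branch assumes the
EVT::changeVectorElementTypeToInteger helper in this tree):

  // Hypothetical target override, not from this patch.
  EVT MyTargetLowering::getSetCCResultType(EVT VT) const {
    if (VT.isVector())                           // e.g. v4f32 -> v4i32 mask
      return VT.changeVectorElementTypeToInteger();
    return MVT::i32;                             // scalar compares yield i32
  }
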
@@ -996,7 +999,7 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
/// type of the given function. This does not require a DAG or a return value,
/// and is suitable for use before any DAGs for the function are constructed.
/// TODO: Move this out of TargetLowering.cpp.
-void llvm::GetReturnInfo(const Type* ReturnType, Attributes attr,
+void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
SmallVectorImpl<ISD::OutputArg> &Outs,
const TargetLowering &TLI,
SmallVectorImpl<uint64_t> *Offsets) {
@@ -1054,7 +1057,7 @@ void llvm::GetReturnInfo(const Type* ReturnType, Attributes attr,
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
/// alignment, not its logarithm.
-unsigned TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned TargetLowering::getByValTypeAlignment(Type *Ty) const {
return TD->getCallFrameTypeAlignment(Ty);
}
@@ -1764,17 +1767,16 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
break;
}
case ISD::AssertZext: {
- // Demand all the bits of the input that are demanded in the output.
- // The low bits are obvious; the high bits are demanded because we're
- // asserting that they're zero here.
- if (SimplifyDemandedBits(Op.getOperand(0), NewMask,
+ // AssertZext demands all of the high bits, plus any of the low bits
+ // demanded by its users.
+ EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth,
+ VT.getSizeInBits());
+ if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask,
KnownZero, KnownOne, TLO, Depth+1))
return true;
assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
- EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- APInt InMask = APInt::getLowBitsSet(BitWidth,
- VT.getSizeInBits());
KnownZero |= ~InMask & NewMask;
break;
}
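
A worked instance of the reordered AssertZext logic above, for an AssertZext
from i8 inside an i32 computation (the demanded low bit is invented):

  // Sketch: which bits get demanded through AssertZext i8 at BitWidth 32.
  APInt InMask = APInt::getLowBitsSet(32, 8);   // 0x000000FF
  APInt NewMask(32, 1);                         // the user wants only bit 0
  APInt Demanded = ~InMask | NewMask;           // 0xFFFFFF01
  // All 24 asserted-zero high bits stay demanded, plus the one live low bit.
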
@@ -2191,7 +2193,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
} else if (N1C->getAPIntValue() == 1 &&
(VT == MVT::i1 ||
- getBooleanContents() == ZeroOrOneBooleanContent)) {
+ getBooleanContents(false) == ZeroOrOneBooleanContent)) {
SDValue Op0 = N0;
if (Op0.getOpcode() == ISD::TRUNCATE)
Op0 = Op0.getOperand(0);
@@ -2758,16 +2760,8 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
// If none of the value types for this register class are valid, we
// can't use it. For example, 64-bit reg classes on 32-bit targets.
- bool isLegal = false;
- for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
- I != E; ++I) {
- if (isTypeLegal(*I)) {
- isLegal = true;
- break;
- }
- }
-
- if (!isLegal) continue;
+ if (!isLegalRC(RC))
+ continue;
for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
I != E; ++I) {
@@ -2840,7 +2834,7 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
// corresponding argument.
assert(!CS.getType()->isVoidTy() &&
"Bad inline asm!");
- if (const StructType *STy = dyn_cast<StructType>(CS.getType())) {
+ if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
OpInfo.ConstraintVT = getValueType(STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
@@ -2857,16 +2851,16 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
}
if (OpInfo.CallOperandVal) {
- const llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
+ llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
if (OpInfo.isIndirect) {
- const llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+ llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
if (!PtrTy)
report_fatal_error("Indirect operand for inline asm not a pointer!");
OpTy = PtrTy->getElementType();
}
// Look for vector wrapped in a struct. e.g. { <16 x i8> }.
- if (const StructType *STy = dyn_cast<StructType>(OpTy))
+ if (StructType *STy = dyn_cast<StructType>(OpTy))
if (STy->getNumElements() == 1)
OpTy = STy->getElementType(0);
@@ -3187,7 +3181,7 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const {
+ Type *Ty) const {
// The default implementation of this implements a conservative RISCy, r+r and
// r+i addr mode.
diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp
index 5a253a4..2609256 100644
--- a/lib/CodeGen/ShadowStackGC.cpp
+++ b/lib/CodeGen/ShadowStackGC.cpp
@@ -61,7 +61,7 @@ namespace {
private:
bool IsNullValue(Value *V);
Constant *GetFrameMap(Function &F);
- const Type* GetConcreteStackEntryType(Function &F);
+ Type* GetConcreteStackEntryType(Function &F);
void CollectRoots(Function &F);
static GetElementPtrInst *CreateGEP(LLVMContext &Context,
IRBuilder<> &B, Value *BasePtr,
@@ -109,13 +109,15 @@ namespace {
State = 1;
case 1:
- // Find all 'return' and 'unwind' instructions.
+ // Find all 'return', 'resume', and 'unwind' instructions.
while (StateBB != StateE) {
BasicBlock *CurBB = StateBB++;
- // Branches and invokes do not escape, only unwind and return do.
+ // Branches and invokes do not escape, only unwind, resume, and return
+ // do.
TerminatorInst *TI = CurBB->getTerminator();
- if (!isa<UnwindInst>(TI) && !isa<ReturnInst>(TI))
+ if (!isa<UnwindInst>(TI) && !isa<ReturnInst>(TI) &&
+ !isa<ResumeInst>(TI))
continue;
Builder.SetInsertPoint(TI->getParent(), TI);
@@ -139,9 +141,19 @@ namespace {
return 0;
// Create a cleanup block.
- BasicBlock *CleanupBB = BasicBlock::Create(F.getContext(),
- CleanupBBName, &F);
- UnwindInst *UI = new UnwindInst(F.getContext(), CleanupBB);
+ LLVMContext &C = F.getContext();
+ BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C),
+ Type::getInt32Ty(C), NULL);
+ Constant *PersFn =
+ F.getParent()->
+ getOrInsertFunction("__gcc_personality_v0",
+ FunctionType::get(Type::getInt32Ty(C), true));
+ LandingPadInst *LPad = LandingPadInst::Create(ExnTy, PersFn, 1,
+ "cleanup.lpad",
+ CleanupBB);
+ LPad->setCleanup(true);
+ ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
// Transform the 'call' instructions into 'invoke's branching to the
// cleanup block. Go in reverse order to make prettier BB names.
@@ -172,7 +184,7 @@ namespace {
delete CI;
}
- Builder.SetInsertPoint(UI->getParent(), UI);
+ Builder.SetInsertPoint(RI->getParent(), RI);
return &Builder;
}
}
@@ -190,7 +202,7 @@ ShadowStackGC::ShadowStackGC() : Head(0), StackEntryTy(0) {
Constant *ShadowStackGC::GetFrameMap(Function &F) {
// doInitialization creates the abstract type of this value.
- const Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
+ Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
// Truncate the ShadowStackDescriptor if some metadata is null.
unsigned NumMeta = 0;
@@ -203,7 +215,7 @@ Constant *ShadowStackGC::GetFrameMap(Function &F) {
}
Metadata.resize(NumMeta);
- const Type *Int32Ty = Type::getInt32Ty(F.getContext());
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
Constant *BaseElts[] = {
ConstantInt::get(Int32Ty, Roots.size(), false),
@@ -216,7 +228,7 @@ Constant *ShadowStackGC::GetFrameMap(Function &F) {
};
Type *EltTys[] = { DescriptorElts[0]->getType(),DescriptorElts[1]->getType()};
- StructType *STy = StructType::createNamed("gc_map."+utostr(NumMeta), EltTys);
+ StructType *STy = StructType::create(EltTys, "gc_map."+utostr(NumMeta));
Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
@@ -241,17 +253,17 @@ Constant *ShadowStackGC::GetFrameMap(Function &F) {
ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)
};
- return ConstantExpr::getGetElementPtr(GV, GEPIndices, 2);
+ return ConstantExpr::getGetElementPtr(GV, GEPIndices);
}
-const Type* ShadowStackGC::GetConcreteStackEntryType(Function &F) {
+Type* ShadowStackGC::GetConcreteStackEntryType(Function &F) {
// doInitialization creates the generic version of this type.
std::vector<Type*> EltTys;
EltTys.push_back(StackEntryTy);
for (size_t I = 0; I != Roots.size(); I++)
EltTys.push_back(Roots[I].second->getAllocatedType());
- return StructType::createNamed("gc_stackentry."+F.getName().str(), EltTys);
+ return StructType::create(EltTys, "gc_stackentry."+F.getName().str());
}
/// doInitialization - If this module uses the GC intrinsics, find them now. If
@@ -267,7 +279,7 @@ bool ShadowStackGC::initializeCustomLowering(Module &M) {
EltTys.push_back(Type::getInt32Ty(M.getContext()));
// Specifies length of variable length array.
EltTys.push_back(Type::getInt32Ty(M.getContext()));
- FrameMapTy = StructType::createNamed("gc_map", EltTys);
+ FrameMapTy = StructType::create(EltTys, "gc_map");
PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
// struct StackEntry {
@@ -276,13 +288,13 @@ bool ShadowStackGC::initializeCustomLowering(Module &M) {
// void *Roots[]; // Stack roots (in-place array, so we pretend).
// };
- StackEntryTy = StructType::createNamed(M.getContext(), "gc_stackentry");
+ StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
EltTys.clear();
EltTys.push_back(PointerType::getUnqual(StackEntryTy));
EltTys.push_back(FrameMapPtrTy);
StackEntryTy->setBody(EltTys);
- const PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
+ PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
// Get the root chain if it already exists.
Head = M.getGlobalVariable("llvm_gc_root_chain");
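
The createNamed-to-create renames above reflect the 3.0 type-system rework:
identified structs are created (possibly opaque) and may be given a body
later, which is exactly what the self-referential gc_stackentry type needs.
In miniature (C and Elt are assumed names for a context and an element type):

  // Sketch of the 3.0 identified-struct idiom used above.
  StructType *Entry = StructType::create(C, "gc_stackentry"); // opaque so far
  Type *Body[] = { PointerType::getUnqual(Entry),             // self-reference
                   Elt };
  Entry->setBody(Body);                                       // now complete
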
@@ -340,7 +352,7 @@ ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
ConstantInt::get(Type::getInt32Ty(Context), Idx),
ConstantInt::get(Type::getInt32Ty(Context), Idx2) };
- Value* Val = B.CreateGEP(BasePtr, Indices, Indices + 3, Name);
+ Value* Val = B.CreateGEP(BasePtr, Indices, Name);
assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
@@ -352,7 +364,7 @@ ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
int Idx, const char *Name) {
Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
ConstantInt::get(Type::getInt32Ty(Context), Idx) };
- Value *Val = B.CreateGEP(BasePtr, Indices, Indices + 2, Name);
+ Value *Val = B.CreateGEP(BasePtr, Indices, Name);
assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
@@ -373,7 +385,7 @@ bool ShadowStackGC::performCustomLowering(Function &F) {
// Build the constant map and figure the type of the shadow stack entry.
Value *FrameMap = GetFrameMap(F);
- const Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
+ Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
// Build the shadow stack entry at the very start of the function.
BasicBlock::iterator IP = F.getEntryBlock().begin();
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 65a33da..ded2459d 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -21,26 +21,31 @@
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include <set>
using namespace llvm;
+static cl::opt<bool> DisableOldSjLjEH("disable-old-sjlj-eh", cl::Hidden,
+ cl::desc("Disable the old SjLj EH preparation pass"));
+
STATISTIC(NumInvokes, "Number of invokes replaced");
STATISTIC(NumUnwinds, "Number of unwinds replaced");
STATISTIC(NumSpilled, "Number of registers live across unwind edges");
namespace {
class SjLjEHPass : public FunctionPass {
-
const TargetLowering *TLI;
-
- const Type *FunctionContextTy;
+ Type *FunctionContextTy;
Constant *RegisterFn;
Constant *UnregisterFn;
Constant *BuiltinSetjmpFn;
@@ -53,8 +58,9 @@ namespace {
Constant *ExceptionFn;
Constant *CallSiteFn;
Constant *DispatchSetupFn;
-
+ Constant *FuncCtxFn;
Value *CallSite;
+ DenseMap<InvokeInst*, BasicBlock*> LPadSuccMap;
public:
static char ID; // Pass identification, replacement for typeid
explicit SjLjEHPass(const TargetLowering *tli = NULL)
@@ -62,16 +68,22 @@ namespace {
bool doInitialization(Module &M);
bool runOnFunction(Function &F);
- virtual void getAnalysisUsage(AnalysisUsage &AU) const { }
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
const char *getPassName() const {
return "SJLJ Exception Handling preparation";
}
private:
+ bool setupEntryBlockAndCallSites(Function &F);
+ Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads);
+ void lowerIncomingArguments(Function &F);
+ void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst*> Invokes);
+
void insertCallSiteStore(Instruction *I, int Number, Value *CallSite);
void markInvokeCallSite(InvokeInst *II, int InvokeNo, Value *CallSite,
SwitchInst *CatchSwitch);
void splitLiveRangesAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes);
+ void splitLandingPad(InvokeInst *II);
bool insertSjLjEHSupport(Function &F);
};
} // end anonymous namespace
@@ -116,6 +128,7 @@ bool SjLjEHPass::doInitialization(Module &M) {
CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite);
DispatchSetupFn
= Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_dispatch_setup);
+ FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
PersonalityFn = 0;
return true;
@@ -131,6 +144,42 @@ void SjLjEHPass::insertCallSiteStore(Instruction *I, int Number,
new StoreInst(CallSiteNoC, CallSite, true, I); // volatile
}
+/// splitLandingPad - Split a landing pad. This takes considerable care because
+/// of PHIs and other nasties. The problem is that the jump table needs to jump
+/// to the landing pad block. However, the landing pad block can be jumped to
+/// only by an invoke instruction. So we clone the landingpad instruction into
+/// its own basic block and have the invoke jump there. The successor of the
+/// landingpad instruction's block then becomes the target for the jump table.
+///
+/// But because of PHI nodes, we need to create another basic block for the jump
+/// table to jump to. This is definitely a hack, because the values for the PHI
+/// nodes may not be defined on the edge from the jump table. But that's okay,
+/// because the jump table is simply a construct to mimic what is happening in
+/// the CFG. So the values are mysteriously there, even though there is no value
+/// for the PHI from the jump table's edge (hence calling this a hack).
+void SjLjEHPass::splitLandingPad(InvokeInst *II) {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(II->getUnwindDest(), II->getParent(),
+ ".1", ".2", this, NewBBs);
+
+ // Create an empty block so that the jump table has something to jump to
+ // which doesn't have any PHI nodes.
+ BasicBlock *LPad = NewBBs[0];
+ BasicBlock *Succ = *succ_begin(LPad);
+ BasicBlock *JumpTo = BasicBlock::Create(II->getContext(), "jt.land",
+ LPad->getParent(), Succ);
+ LPad->getTerminator()->eraseFromParent();
+ BranchInst::Create(JumpTo, LPad);
+ BranchInst::Create(Succ, JumpTo);
+ LPadSuccMap[II] = JumpTo;
+
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ Value *Val = PN->removeIncomingValue(LPad, false);
+ PN->addIncoming(Val, JumpTo);
+ }
+}
+
/// markInvokeCallSite - Insert code to mark the call_site for this invoke
void SjLjEHPass::markInvokeCallSite(InvokeInst *II, int InvokeNo,
Value *CallSite,
@@ -140,11 +189,15 @@ void SjLjEHPass::markInvokeCallSite(InvokeInst *II, int InvokeNo,
// The runtime comes back to the dispatcher with the call_site - 1 in
// the context. Odd, but there it is.
ConstantInt *SwitchValC = ConstantInt::get(Type::getInt32Ty(II->getContext()),
- InvokeNo - 1);
+ InvokeNo - 1);
// If the unwind edge has phi nodes, split the edge.
if (isa<PHINode>(II->getUnwindDest()->begin())) {
- SplitCriticalEdge(II, 1, this);
+ // FIXME: New EH - This if-condition will be always true in the new scheme.
+ if (II->getUnwindDest()->isLandingPad())
+ splitLandingPad(II);
+ else
+ SplitCriticalEdge(II, 1, this);
// If there are any phi nodes left, they must have a single predecessor.
while (PHINode *PN = dyn_cast<PHINode>(II->getUnwindDest()->begin())) {
@@ -161,7 +214,12 @@ void SjLjEHPass::markInvokeCallSite(InvokeInst *II, int InvokeNo,
CallInst::Create(CallSiteFn, CallSiteNoC, "", II);
// Add a switch case to our unwind block.
- CatchSwitch->addCase(SwitchValC, II->getUnwindDest());
+ if (BasicBlock *SuccBB = LPadSuccMap[II]) {
+ CatchSwitch->addCase(SwitchValC, SuccBB);
+ } else {
+ CatchSwitch->addCase(SwitchValC, II->getUnwindDest());
+ }
+
// We still want this to look like an invoke so we emit the LSDA properly,
// so we don't transform the invoke into a call here.
}
@@ -187,10 +245,16 @@ splitLiveRangesAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) {
for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
InvokeInst *II = Invokes[i];
SplitCriticalEdge(II, 0, this);
- SplitCriticalEdge(II, 1, this);
+
+ // FIXME: New EH - This if-condition will be always true in the new scheme.
+ if (II->getUnwindDest()->isLandingPad())
+ splitLandingPad(II);
+ else
+ SplitCriticalEdge(II, 1, this);
+
assert(!isa<PHINode>(II->getNormalDest()) &&
!isa<PHINode>(II->getUnwindDest()) &&
- "critical edge splitting left single entry phi nodes?");
+ "Critical edge splitting left single entry phi nodes?");
}
Function *F = Invokes.back()->getParent()->getParent();
@@ -204,7 +268,7 @@ splitLiveRangesAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) {
++AfterAllocaInsertPt;
for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
AI != E; ++AI) {
- const Type *Ty = AI->getType();
+ Type *Ty = AI->getType();
// Aggregate types can't be cast, but are legal argument types, so we have
// to handle them differently. We use an extract/insert pair as a
// lightweight method to achieve the same goal.
@@ -283,9 +347,8 @@ splitLiveRangesAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) {
bool NeedsSpill = false;
for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
- if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
+ if (UnwindBlock != BB && LiveBBs.count(UnwindBlock))
NeedsSpill = true;
- }
}
// If we decided we need a spill, do it.
@@ -299,6 +362,44 @@ splitLiveRangesAcrossInvokes(SmallVector<InvokeInst*,16> &Invokes) {
}
}
+/// CreateLandingPadLoad - Load the exception handling values and insert them
+/// into a structure.
+static Instruction *CreateLandingPadLoad(Function &F, Value *ExnAddr,
+ Value *SelAddr,
+ BasicBlock::iterator InsertPt) {
+ Value *Exn = new LoadInst(ExnAddr, "exn", false,
+ InsertPt);
+ Type *Ty = Type::getInt8PtrTy(F.getContext());
+ Exn = CastInst::Create(Instruction::IntToPtr, Exn, Ty, "", InsertPt);
+ Value *Sel = new LoadInst(SelAddr, "sel", false, InsertPt);
+
+ Ty = StructType::get(Exn->getType(), Sel->getType(), NULL);
+ InsertValueInst *LPadVal = InsertValueInst::Create(llvm::UndefValue::get(Ty),
+ Exn, 0,
+ "lpad.val", InsertPt);
+ return InsertValueInst::Create(LPadVal, Sel, 1, "lpad.val", InsertPt);
+}
+
+/// ReplaceLandingPadVal - Replace the landingpad instruction's value with a
+/// load from the stored values (via CreateLandingPadLoad). This looks through
+/// PHI nodes, and removes them if they are dead.
+static void ReplaceLandingPadVal(Function &F, Instruction *Inst, Value *ExnAddr,
+ Value *SelAddr) {
+ if (Inst->use_empty()) return;
+
+ while (!Inst->use_empty()) {
+ Instruction *I = cast<Instruction>(Inst->use_back());
+
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ ReplaceLandingPadVal(F, PN, ExnAddr, SelAddr);
+ if (PN->use_empty()) PN->eraseFromParent();
+ continue;
+ }
+
+ I->replaceUsesOfWith(Inst, CreateLandingPadLoad(F, ExnAddr, SelAddr, I));
+ }
+}
+
bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
SmallVector<ReturnInst*,16> Returns;
SmallVector<UnwindInst*,16> Unwinds;
@@ -337,10 +438,23 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
SmallVector<CallInst*,16> EH_Exceptions;
SmallVector<Instruction*,16> JmpbufUpdatePoints;
- // Note: Skip the entry block since there's nothing there that interests
- // us. eh.selector and eh.exception shouldn't ever be there, and we
- // want to disregard any allocas that are there.
- for (Function::iterator BB = F.begin(), E = F.end(); ++BB != E;) {
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ // Note: Skip the entry block since there's nothing there that interests
+ // us. eh.selector and eh.exception shouldn't ever be there, and we
+ // want to disregard any allocas that are there.
+ //
+ // FIXME: This is awkward. The new EH scheme won't need to skip the entry
+ // block.
+ if (BB == F.begin()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(F.begin()->getTerminator())) {
+ // FIXME: This will be always non-NULL in the new EH.
+ if (LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst())
+ if (!PersonalityFn) PersonalityFn = LPI->getPersonalityFn();
+ }
+
+ continue;
+ }
+
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
if (CallInst *CI = dyn_cast<CallInst>(I)) {
if (CI->getCalledFunction() == SelectorFn) {
@@ -353,6 +467,10 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
}
} else if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
JmpbufUpdatePoints.push_back(AI);
+ } else if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+ // FIXME: This will be always non-NULL in the new EH.
+ if (LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst())
+ if (!PersonalityFn) PersonalityFn = LPI->getPersonalityFn();
}
}
}
@@ -371,6 +489,16 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
// invoke's.
splitLiveRangesAcrossInvokes(Invokes);
+
+ SmallVector<LandingPadInst*, 16> LandingPads;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ // FIXME: This will be always non-NULL in the new EH.
+ if (LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst())
+ LandingPads.push_back(LPI);
+ }
+
+
BasicBlock *EntryBB = F.begin();
// Create an alloca for the incoming jump buffer ptr and the new jump buffer
// that needs to be restored on all exits from the function. This is an
@@ -381,27 +509,25 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
"fcn_context", F.begin()->begin());
Value *Idxs[2];
- const Type *Int32Ty = Type::getInt32Ty(F.getContext());
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
Value *Zero = ConstantInt::get(Int32Ty, 0);
// We need to also keep around a reference to the call_site field
Idxs[0] = Zero;
Idxs[1] = ConstantInt::get(Int32Ty, 1);
- CallSite = GetElementPtrInst::Create(FunctionContext, Idxs, Idxs+2,
- "call_site",
+ CallSite = GetElementPtrInst::Create(FunctionContext, Idxs, "call_site",
EntryBB->getTerminator());
// The exception selector comes back in context->data[1]
Idxs[1] = ConstantInt::get(Int32Ty, 2);
- Value *FCData = GetElementPtrInst::Create(FunctionContext, Idxs, Idxs+2,
- "fc_data",
+ Value *FCData = GetElementPtrInst::Create(FunctionContext, Idxs, "fc_data",
EntryBB->getTerminator());
Idxs[1] = ConstantInt::get(Int32Ty, 1);
- Value *SelectorAddr = GetElementPtrInst::Create(FCData, Idxs, Idxs+2,
+ Value *SelectorAddr = GetElementPtrInst::Create(FCData, Idxs,
"exc_selector_gep",
EntryBB->getTerminator());
// The exception value comes back in context->data[0]
Idxs[1] = Zero;
- Value *ExceptionAddr = GetElementPtrInst::Create(FCData, Idxs, Idxs+2,
+ Value *ExceptionAddr = GetElementPtrInst::Create(FCData, Idxs,
"exception_gep",
EntryBB->getTerminator());
@@ -423,13 +549,16 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
// instruction hasn't already been removed.
if (!I->getParent()) continue;
Value *Val = new LoadInst(ExceptionAddr, "exception", true, I);
- const Type *Ty = Type::getInt8PtrTy(F.getContext());
+ Type *Ty = Type::getInt8PtrTy(F.getContext());
Val = CastInst::Create(Instruction::IntToPtr, Val, Ty, "", I);
I->replaceAllUsesWith(Val);
I->eraseFromParent();
}
+ for (unsigned i = 0, e = LandingPads.size(); i != e; ++i)
+ ReplaceLandingPadVal(F, LandingPads[i], ExceptionAddr, SelectorAddr);
+
// The entry block changes to have the eh.sjlj.setjmp, with a conditional
// branch to a dispatch block for non-zero returns. If we return normally,
// we're not handling an exception and just register the function context and
@@ -466,8 +595,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
Idxs[0] = Zero;
Idxs[1] = ConstantInt::get(Int32Ty, 4);
Value *LSDAFieldPtr =
- GetElementPtrInst::Create(FunctionContext, Idxs, Idxs+2,
- "lsda_gep",
+ GetElementPtrInst::Create(FunctionContext, Idxs, "lsda_gep",
EntryBB->getTerminator());
Value *LSDA = CallInst::Create(LSDAAddrFn, "lsda_addr",
EntryBB->getTerminator());
@@ -475,8 +603,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
Idxs[1] = ConstantInt::get(Int32Ty, 3);
Value *PersonalityFieldPtr =
- GetElementPtrInst::Create(FunctionContext, Idxs, Idxs+2,
- "lsda_gep",
+ GetElementPtrInst::Create(FunctionContext, Idxs, "lsda_gep",
EntryBB->getTerminator());
new StoreInst(PersonalityFn, PersonalityFieldPtr, true,
EntryBB->getTerminator());
@@ -484,12 +611,11 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
// Save the frame pointer.
Idxs[1] = ConstantInt::get(Int32Ty, 5);
Value *JBufPtr
- = GetElementPtrInst::Create(FunctionContext, Idxs, Idxs+2,
- "jbuf_gep",
+ = GetElementPtrInst::Create(FunctionContext, Idxs, "jbuf_gep",
EntryBB->getTerminator());
Idxs[1] = ConstantInt::get(Int32Ty, 0);
Value *FramePtr =
- GetElementPtrInst::Create(JBufPtr, Idxs, Idxs+2, "jbuf_fp_gep",
+ GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_fp_gep",
EntryBB->getTerminator());
Value *Val = CallInst::Create(FrameAddrFn,
@@ -501,7 +627,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
// Save the stack pointer.
Idxs[1] = ConstantInt::get(Int32Ty, 2);
Value *StackPtr =
- GetElementPtrInst::Create(JBufPtr, Idxs, Idxs+2, "jbuf_sp_gep",
+ GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_sp_gep",
EntryBB->getTerminator());
Val = CallInst::Create(StackAddrFn, "sp", EntryBB->getTerminator());
@@ -513,7 +639,7 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
Type::getInt8PtrTy(F.getContext()), "",
EntryBB->getTerminator());
Value *DispatchVal = CallInst::Create(BuiltinSetjmpFn, SetjmpArg,
- "dispatch",
+ "",
EntryBB->getTerminator());
// Add a call to dispatch_setup after the setjmp call. This is expanded to any
@@ -554,6 +680,8 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
if (Callee != SelectorFn && Callee != ExceptionFn
&& !CI->doesNotThrow())
insertCallSiteStore(CI, -1, CallSite);
+ } else if (ResumeInst *RI = dyn_cast<ResumeInst>(I)) {
+ insertCallSiteStore(RI, -1, CallSite);
}
}
@@ -582,7 +710,317 @@ bool SjLjEHPass::insertSjLjEHSupport(Function &F) {
return true;
}
+/// setupFunctionContext - Allocate the function context on the stack and fill
+/// it with all of the data that we know at this point.
+Value *SjLjEHPass::
+setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
+ BasicBlock *EntryBB = F.begin();
+
+ // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+ // that needs to be restored on all exits from the function. This is an alloca
+ // because the value needs to be added to the global context list.
+ unsigned Align =
+ TLI->getTargetData()->getPrefTypeAlignment(FunctionContextTy);
+ AllocaInst *FuncCtx =
+ new AllocaInst(FunctionContextTy, 0, Align, "fn_context", EntryBB->begin());
+
+ // Fill in the function context structure.
+ Value *Idxs[2];
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+ Value *Zero = ConstantInt::get(Int32Ty, 0);
+ Value *One = ConstantInt::get(Int32Ty, 1);
+
+ // Keep around a reference to the call_site field.
+ Idxs[0] = Zero;
+ Idxs[1] = One;
+ CallSite = GetElementPtrInst::Create(FuncCtx, Idxs, "call_site",
+ EntryBB->getTerminator());
+
+ // Reference the __data field.
+ Idxs[1] = ConstantInt::get(Int32Ty, 2);
+ Value *FCData = GetElementPtrInst::Create(FuncCtx, Idxs, "__data",
+ EntryBB->getTerminator());
+
+ // The exception value comes back in context->__data[0].
+ Idxs[1] = Zero;
+ Value *ExceptionAddr = GetElementPtrInst::Create(FCData, Idxs,
+ "exception_gep",
+ EntryBB->getTerminator());
+
+ // The exception selector comes back in context->__data[1].
+ Idxs[1] = One;
+ Value *SelectorAddr = GetElementPtrInst::Create(FCData, Idxs,
+ "exn_selector_gep",
+ EntryBB->getTerminator());
+
+ for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
+ LandingPadInst *LPI = LPads[I];
+ IRBuilder<> Builder(LPI->getParent()->getFirstInsertionPt());
+
+ Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
+ ExnVal = Builder.CreateIntToPtr(ExnVal, Type::getInt8PtrTy(F.getContext()));
+ Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
+
+ Type *LPadType = LPI->getType();
+ Value *LPadVal = UndefValue::get(LPadType);
+ LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val");
+ LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val");
+
+ LPI->replaceAllUsesWith(LPadVal);
+ }
+
+ // Personality function
+ Idxs[1] = ConstantInt::get(Int32Ty, 3);
+ if (!PersonalityFn)
+ PersonalityFn = LPads[0]->getPersonalityFn();
+ Value *PersonalityFieldPtr =
+ GetElementPtrInst::Create(FuncCtx, Idxs, "pers_fn_gep",
+ EntryBB->getTerminator());
+ new StoreInst(PersonalityFn, PersonalityFieldPtr, true,
+ EntryBB->getTerminator());
+
+ // LSDA address
+ Idxs[1] = ConstantInt::get(Int32Ty, 4);
+ Value *LSDAFieldPtr = GetElementPtrInst::Create(FuncCtx, Idxs, "lsda_gep",
+ EntryBB->getTerminator());
+ Value *LSDA = CallInst::Create(LSDAAddrFn, "lsda_addr",
+ EntryBB->getTerminator());
+ new StoreInst(LSDA, LSDAFieldPtr, true, EntryBB->getTerminator());
+
+ return FuncCtx;
+}
+
+/// lowerIncomingArguments - To avoid having to handle incoming arguments
+/// specially, we lower each arg to a copy instruction in the entry block. This
+/// ensures that the argument value itself cannot be live out of the entry
+/// block.
+void SjLjEHPass::lowerIncomingArguments(Function &F) {
+ BasicBlock::iterator AfterAllocaInsPt = F.begin()->begin();
+ while (isa<AllocaInst>(AfterAllocaInsPt) &&
+ isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize()))
+ ++AfterAllocaInsPt;
+
+ for (Function::arg_iterator
+ AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
+ Type *Ty = AI->getType();
+
+ // Aggregate types can't be cast, but are legal argument types, so we have
+ // to handle them differently. We use an extract/insert pair as a
+ // lightweight method to achieve the same goal.
+ if (isa<StructType>(Ty) || isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
+ Instruction *EI = ExtractValueInst::Create(AI, 0, "", AfterAllocaInsPt);
+ Instruction *NI = InsertValueInst::Create(AI, EI, 0);
+ NI->insertAfter(EI);
+ AI->replaceAllUsesWith(NI);
+
+ // Set the operand of the instructions back to the AllocaInst.
+ EI->setOperand(0, AI);
+ NI->setOperand(0, AI);
+ } else {
+ // This is always a no-op cast because we're casting AI to AI->getType()
+ // so src and destination types are identical. BitCast is the only
+ // possibility.
+ CastInst *NC =
+ new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
+ AfterAllocaInsPt);
+ AI->replaceAllUsesWith(NC);
+
+ // Set the operand of the cast instruction back to the AllocaInst.
+ // Normally it's forbidden to replace a CastInst's operand because it
+ // could cause the opcode to reflect an illegal conversion. However, we're
+ // replacing it here with the same value it was constructed with. We do
+ // this because the above replaceAllUsesWith() clobbered the operand, but
+ // we want this one to remain.
+ NC->setOperand(0, AI);
+ }
+ }
+}
+
+/// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind
+/// edge and spill them.
+void SjLjEHPass::lowerAcrossUnwindEdges(Function &F,
+ ArrayRef<InvokeInst*> Invokes) {
+ // Finally, scan the code looking for instructions with bad live ranges.
+ for (Function::iterator
+ BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+ for (BasicBlock::iterator
+ II = BB->begin(), IIE = BB->end(); II != IIE; ++II) {
+ // Ignore obvious cases we don't have to handle. In particular, most
+ // instructions either have no uses or only have a single use inside the
+ // current block. Ignore them quickly.
+ Instruction *Inst = II;
+ if (Inst->use_empty()) continue;
+ if (Inst->hasOneUse() &&
+ cast<Instruction>(Inst->use_back())->getParent() == BB &&
+ !isa<PHINode>(Inst->use_back())) continue;
+
+ // If this is an alloca in the entry block, it's not a real register
+ // value.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+ if (isa<ConstantInt>(AI->getArraySize()) && BB == F.begin())
+ continue;
+
+ // Avoid iterator invalidation by copying users to a temporary vector.
+ SmallVector<Instruction*, 16> Users;
+ for (Value::use_iterator
+ UI = Inst->use_begin(), E = Inst->use_end(); UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (User->getParent() != BB || isa<PHINode>(User))
+ Users.push_back(User);
+ }
+
+ // Find all of the blocks that this value is live in.
+ std::set<BasicBlock*> LiveBBs;
+ LiveBBs.insert(Inst->getParent());
+ while (!Users.empty()) {
+ Instruction *U = Users.back();
+ Users.pop_back();
+
+ if (!isa<PHINode>(U)) {
+ MarkBlocksLiveIn(U->getParent(), LiveBBs);
+ } else {
+ // Uses for a PHI node occur in their predecessor block.
+ PHINode *PN = cast<PHINode>(U);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == Inst)
+ MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+ }
+ }
+
+ // Now that we know all of the blocks that this thing is live in, see if
+ // it includes any of the unwind locations.
+ bool NeedsSpill = false;
+ for (unsigned i = 0, e = Invokes.size(); i != e; ++i) {
+ BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
+ if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
+ NeedsSpill = true;
+ }
+ }
+
+ // If we decided we need a spill, do it.
+ // FIXME: Spilling this way is overkill, as it forces all uses of
+ // the value to be reloaded from the stack slot, even those that aren't
+ // in the unwind blocks. We should be more selective.
+ if (NeedsSpill) {
+ ++NumSpilled;
+ DemoteRegToStack(*Inst, true);
+ }
+ }
+ }
+}
+
+/// setupEntryBlockAndCallSites - Setup the entry block by creating and filling
+/// the function context and marking the call sites with the appropriate
+/// values. These values are used by the DWARF EH emitter.
+bool SjLjEHPass::setupEntryBlockAndCallSites(Function &F) {
+ SmallVector<ReturnInst*, 16> Returns;
+ SmallVector<InvokeInst*, 16> Invokes;
+ SmallVector<LandingPadInst*, 16> LPads;
+
+ // Look through the terminators of the basic blocks to find invokes.
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+ Invokes.push_back(II);
+ LPads.push_back(II->getUnwindDest()->getLandingPadInst());
+ } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ Returns.push_back(RI);
+ }
+
+ if (Invokes.empty()) return false;
+
+ lowerIncomingArguments(F);
+ lowerAcrossUnwindEdges(F, Invokes);
+
+ Value *FuncCtx = setupFunctionContext(F, LPads);
+ BasicBlock *EntryBB = F.begin();
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+
+ Value *Idxs[2] = {
+ ConstantInt::get(Int32Ty, 0), 0
+ };
+
+ // Get a reference to the jump buffer.
+ Idxs[1] = ConstantInt::get(Int32Ty, 5);
+ Value *JBufPtr = GetElementPtrInst::Create(FuncCtx, Idxs, "jbuf_gep",
+ EntryBB->getTerminator());
+
+ // Save the frame pointer.
+ Idxs[1] = ConstantInt::get(Int32Ty, 0);
+ Value *FramePtr = GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_fp_gep",
+ EntryBB->getTerminator());
+
+ Value *Val = CallInst::Create(FrameAddrFn,
+ ConstantInt::get(Int32Ty, 0),
+ "fp",
+ EntryBB->getTerminator());
+ new StoreInst(Val, FramePtr, true, EntryBB->getTerminator());
+
+ // Save the stack pointer.
+ Idxs[1] = ConstantInt::get(Int32Ty, 2);
+ Value *StackPtr = GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_sp_gep",
+ EntryBB->getTerminator());
+
+ Val = CallInst::Create(StackAddrFn, "sp", EntryBB->getTerminator());
+ new StoreInst(Val, StackPtr, true, EntryBB->getTerminator());
+
+ // Call the setjmp intrinsic. It fills in the rest of the jmpbuf.
+ Value *SetjmpArg = CastInst::Create(Instruction::BitCast, JBufPtr,
+ Type::getInt8PtrTy(F.getContext()), "",
+ EntryBB->getTerminator());
+ CallInst::Create(BuiltinSetjmpFn, SetjmpArg, "", EntryBB->getTerminator());
+
+ // Store a pointer to the function context so that the back-end will know
+ // where to look for it.
+ Value *FuncCtxArg = CastInst::Create(Instruction::BitCast, FuncCtx,
+ Type::getInt8PtrTy(F.getContext()), "",
+ EntryBB->getTerminator());
+ CallInst::Create(FuncCtxFn, FuncCtxArg, "", EntryBB->getTerminator());
+
+ // At this point, we are all set up, update the invoke instructions to mark
+ // their call_site values.
+ for (unsigned I = 0, E = Invokes.size(); I != E; ++I) {
+ insertCallSiteStore(Invokes[I], I + 1, CallSite);
+
+ ConstantInt *CallSiteNum =
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
+
+ // Record the call site value for the back end so it stays associated with
+ // the invoke.
+ CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]);
+ }
+
+ // Mark call instructions that aren't nounwind as no-action (call_site ==
+ // -1). Skip the entry block: at that point no function context has been
+ // created for this function yet, so any unexpected exception unwinds
+ // directly to the caller's context, which is exactly what we want, and
+ // there is nothing to do here.
+ for (Function::iterator BB = F.begin(), E = F.end(); ++BB != E;)
+ for (BasicBlock::iterator I = BB->begin(), end = BB->end(); I != end; ++I)
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (!CI->doesNotThrow())
+ insertCallSiteStore(CI, -1, CallSite);
+ } else if (ResumeInst *RI = dyn_cast<ResumeInst>(I)) {
+ insertCallSiteStore(RI, -1, CallSite);
+ }
+
+ // Register the function context and make sure it's known not to throw.
+ CallInst *Register = CallInst::Create(RegisterFn, FuncCtx, "",
+ EntryBB->getTerminator());
+ Register->setDoesNotThrow();
+
+ // Finally, for any returns from this function, if this function contains an
+ // invoke, add a call to unregister the function context.
+ for (unsigned I = 0, E = Returns.size(); I != E; ++I)
+ CallInst::Create(UnregisterFn, FuncCtx, "", Returns[I]);
+
+ return true;
+}
+
bool SjLjEHPass::runOnFunction(Function &F) {
- bool Res = insertSjLjEHSupport(F);
+ bool Res = false;
+ if (!DisableOldSjLjEH)
+ Res = insertSjLjEHSupport(F);
+ else
+ Res = setupEntryBlockAndCallSites(F);
return Res;
}
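For orientation, the GEP indices above address fixed slots of the function context: index 5 selects the jump buffer, and within it index 0 holds the frame pointer and index 2 the stack pointer. A rough C++ picture of that layout is sketched below; it is an assumption for illustration only, since the authoritative layout is whatever setupFunctionContext and the SjLj runtime agree on, and the fields preceding JBuf are elided.

  struct SjLjFunctionContext {   // hypothetical sketch, not the real type
    // ... five runtime bookkeeping fields (indices 0-4) elided ...
    void *JBuf[5];               // index 5: JBuf[0] = frame pointer,
                                 // JBuf[2] = stack pointer, remaining
                                 // slots filled by llvm.eh.sjlj.setjmp
  };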
diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp
index 6949618..6f33f54 100644
--- a/lib/CodeGen/SpillPlacement.cpp
+++ b/lib/CodeGen/SpillPlacement.cpp
@@ -220,6 +220,7 @@ void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
0, // DontCare,
1, // PrefReg,
-1, // PrefSpill
+ 0, // PrefBoth
-HUGE_VALF // MustSpill
};
@@ -239,6 +240,22 @@ void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
}
}
+/// addPrefSpill - Same as addConstraints(PrefSpill).
+void SpillPlacement::addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong) {
+ for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end();
+ I != E; ++I) {
+ float Freq = getBlockFrequency(*I);
+ if (Strong)
+ Freq += Freq;
+ unsigned ib = bundles->getBundle(*I, 0);
+ unsigned ob = bundles->getBundle(*I, 1);
+ activate(ib);
+ activate(ob);
+ nodes[ib].addBias(-Freq, 1);
+ nodes[ob].addBias(-Freq, 0);
+ }
+}
+
void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
for (ArrayRef<unsigned>::iterator I = Links.begin(), E = Links.end(); I != E;
++I) {
diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h
index 6952ad8..fc412f8 100644
--- a/lib/CodeGen/SpillPlacement.h
+++ b/lib/CodeGen/SpillPlacement.h
@@ -71,6 +71,7 @@ public:
DontCare, ///< Block doesn't care / variable not live.
PrefReg, ///< Block entry/exit prefers a register.
PrefSpill, ///< Block entry/exit prefers a stack slot.
+ PrefBoth, ///< Block entry prefers both register and stack.
MustSpill ///< A register is impossible, variable must be spilled.
};
@@ -79,6 +80,11 @@ public:
unsigned Number; ///< Basic block number (from MBB::getNumber()).
BorderConstraint Entry : 8; ///< Constraint on block entry.
BorderConstraint Exit : 8; ///< Constraint on block exit.
+
+ /// True when this block changes the value of the live range. This means
+ /// the block has a non-PHI def. When this is false, a live-in value on
+ /// the stack can be live-out on the stack without inserting a spill.
+ bool ChangesValue;
};
/// prepare - Reset state and prepare for a new spill placement computation.
@@ -96,6 +102,14 @@ public:
/// live out.
void addConstraints(ArrayRef<BlockConstraint> LiveBlocks);
+ /// addPrefSpill - Add PrefSpill constraints to all blocks listed. This is
+ /// equivalent to calling addConstraint with identical BlockConstraints with
+ /// Entry = Exit = PrefSpill, and ChangesValue = false.
+ ///
+ /// @param Blocks Array of block numbers that prefer to spill in and out.
+ /// @param Strong When true, double the negative bias for these blocks.
+ void addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong);
+
/// addLinks - Add transparent blocks with the given numbers.
void addLinks(ArrayRef<unsigned> Links);
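A usage sketch for the new entry point (the block numbers and the SpillPlacer handle are hypothetical; Strong = true applies the doubled bias from the .cpp hunk above):

  SmallVector<unsigned, 8> SpillBlocks;
  SpillBlocks.push_back(4);      // blocks that should prefer stack in/out
  SpillBlocks.push_back(7);
  SpillBlocks.push_back(9);
  SpillPlacer->addPrefSpill(SpillBlocks, /*Strong=*/true);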
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 761cab7..6362780 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -178,45 +179,55 @@ bool SplitAnalysis::calcLiveBlockInfo() {
return false;
} else {
// This block has uses. Find the first and last uses in the block.
- BI.FirstUse = *UseI;
- assert(BI.FirstUse >= Start);
+ BI.FirstInstr = *UseI;
+ assert(BI.FirstInstr >= Start);
do ++UseI;
while (UseI != UseE && *UseI < Stop);
- BI.LastUse = UseI[-1];
- assert(BI.LastUse < Stop);
+ BI.LastInstr = UseI[-1];
+ assert(BI.LastInstr < Stop);
// LVI is the first live segment overlapping MBB.
BI.LiveIn = LVI->start <= Start;
+ // When not live in, the first use should be a def.
+ if (!BI.LiveIn) {
+ assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+ assert(LVI->start == BI.FirstInstr && "First instr should be a def");
+ BI.FirstDef = BI.FirstInstr;
+ }
+
// Look for gaps in the live range.
BI.LiveOut = true;
while (LVI->end < Stop) {
SlotIndex LastStop = LVI->end;
if (++LVI == LVE || LVI->start >= Stop) {
BI.LiveOut = false;
- BI.LastUse = LastStop;
+ BI.LastInstr = LastStop;
break;
}
+
if (LastStop < LVI->start) {
// There is a gap in the live range. Create duplicate entries for the
// live-in snippet and the live-out snippet.
++NumGapBlocks;
// Push the Live-in part.
- BI.LiveThrough = false;
BI.LiveOut = false;
UseBlocks.push_back(BI);
- UseBlocks.back().LastUse = LastStop;
+ UseBlocks.back().LastInstr = LastStop;
// Set up BI for the live-out part.
BI.LiveIn = false;
BI.LiveOut = true;
- BI.FirstUse = LVI->start;
+ BI.FirstInstr = BI.FirstDef = LVI->start;
}
+
+ // A LiveRange that starts in the middle of the block must be a def.
+ assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+ if (!BI.FirstDef)
+ BI.FirstDef = LVI->start;
}
- // Don't set LiveThrough when the block has a gap.
- BI.LiveThrough = BI.LiveIn && BI.LiveOut;
UseBlocks.push_back(BI);
// LVI is now at LVE or LVI->end >= Stop.
@@ -299,17 +310,21 @@ SplitEditor::SplitEditor(SplitAnalysis &sa,
TRI(*vrm.getMachineFunction().getTarget().getRegisterInfo()),
Edit(0),
OpenIdx(0),
+ SpillMode(SM_Partition),
RegAssign(Allocator)
{}
-void SplitEditor::reset(LiveRangeEdit &lre) {
- Edit = &lre;
+void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
+ Edit = &LRE;
+ SpillMode = SM;
OpenIdx = 0;
RegAssign.clear();
Values.clear();
- // We don't need to clear LiveOutCache, only LiveOutSeen entries are read.
- LiveOutSeen.clear();
+ // Reset the LiveRangeCalc instances needed for this spill mode.
+ LRCalc[0].reset(&VRM.getMachineFunction());
+ if (SpillMode)
+ LRCalc[1].reset(&VRM.getMachineFunction());
// We don't need an AliasAnalysis since we will only be performing
// cheap-as-a-copy remats anyway.
@@ -340,7 +355,8 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
// Use insert for lookup, so we can add missing values with a second lookup.
std::pair<ValueMap::iterator, bool> InsP =
- Values.insert(std::make_pair(std::make_pair(RegIdx, ParentVNI->id), VNI));
+ Values.insert(std::make_pair(std::make_pair(RegIdx, ParentVNI->id),
+ ValueForcePair(VNI, false)));
// This was the first time (RegIdx, ParentVNI) was mapped.
// Keep it as a simple def without any liveness.
@@ -348,11 +364,11 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
return VNI;
// If the previous value was a simple mapping, add liveness for it now.
- if (VNInfo *OldVNI = InsP.first->second) {
+ if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
SlotIndex Def = OldVNI->def;
LI->addRange(LiveRange(Def, Def.getNextSlot(), OldVNI));
- // No longer a simple mapping.
- InsP.first->second = 0;
+ // No longer a simple mapping. Switch to a complex, non-forced mapping.
+ InsP.first->second = ValueForcePair();
}
// This is a complex mapping, add liveness for VNI
@@ -362,230 +378,24 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
return VNI;
}
-void SplitEditor::markComplexMapped(unsigned RegIdx, const VNInfo *ParentVNI) {
+void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
assert(ParentVNI && "Mapping NULL value");
- VNInfo *&VNI = Values[std::make_pair(RegIdx, ParentVNI->id)];
+ ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)];
+ VNInfo *VNI = VFP.getPointer();
- // ParentVNI was either unmapped or already complex mapped. Either way.
- if (!VNI)
+ // ParentVNI was either unmapped or already complex mapped. Either way, just
+ // set the force bit.
+ if (!VNI) {
+ VFP.setInt(true);
return;
+ }
// This was previously a single mapping. Make sure the old def is represented
// by a trivial live range.
SlotIndex Def = VNI->def;
Edit->get(RegIdx)->addRange(LiveRange(Def, Def.getNextSlot(), VNI));
- VNI = 0;
-}
-
-// extendRange - Extend the live range to reach Idx.
-// Potentially create phi-def values.
-void SplitEditor::extendRange(unsigned RegIdx, SlotIndex Idx) {
- assert(Idx.isValid() && "Invalid SlotIndex");
- MachineBasicBlock *IdxMBB = LIS.getMBBFromIndex(Idx);
- assert(IdxMBB && "No MBB at Idx");
- LiveInterval *LI = Edit->get(RegIdx);
-
- // Is there a def in the same MBB we can extend?
- if (LI->extendInBlock(LIS.getMBBStartIdx(IdxMBB), Idx))
- return;
-
- // Now for the fun part. We know that ParentVNI potentially has multiple defs,
- // and we may need to create even more phi-defs to preserve VNInfo SSA form.
- // Perform a search for all predecessor blocks where we know the dominating
- // VNInfo.
- VNInfo *VNI = findReachingDefs(LI, IdxMBB, Idx.getNextSlot());
-
- // When there were multiple different values, we may need new PHIs.
- if (!VNI)
- return updateSSA();
-
- // Poor man's SSA update for the single-value case.
- LiveOutPair LOP(VNI, MDT[LIS.getMBBFromIndex(VNI->def)]);
- for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(),
- E = LiveInBlocks.end(); I != E; ++I) {
- MachineBasicBlock *MBB = I->DomNode->getBlock();
- SlotIndex Start = LIS.getMBBStartIdx(MBB);
- if (I->Kill.isValid())
- LI->addRange(LiveRange(Start, I->Kill, VNI));
- else {
- LiveOutCache[MBB] = LOP;
- LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI));
- }
- }
-}
-
-/// findReachingDefs - Search the CFG for known live-out values.
-/// Add required live-in blocks to LiveInBlocks.
-VNInfo *SplitEditor::findReachingDefs(LiveInterval *LI,
- MachineBasicBlock *KillMBB,
- SlotIndex Kill) {
- // Initialize the live-out cache the first time it is needed.
- if (LiveOutSeen.empty()) {
- unsigned N = VRM.getMachineFunction().getNumBlockIDs();
- LiveOutSeen.resize(N);
- LiveOutCache.resize(N);
- }
-
- // Blocks where LI should be live-in.
- SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB);
-
- // Remember if we have seen more than one value.
- bool UniqueVNI = true;
- VNInfo *TheVNI = 0;
-
- // Using LiveOutCache as a visited set, perform a BFS for all reaching defs.
- for (unsigned i = 0; i != WorkList.size(); ++i) {
- MachineBasicBlock *MBB = WorkList[i];
- assert(!MBB->pred_empty() && "Value live-in to entry block?");
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- MachineBasicBlock *Pred = *PI;
- LiveOutPair &LOP = LiveOutCache[Pred];
-
- // Is this a known live-out block?
- if (LiveOutSeen.test(Pred->getNumber())) {
- if (VNInfo *VNI = LOP.first) {
- if (TheVNI && TheVNI != VNI)
- UniqueVNI = false;
- TheVNI = VNI;
- }
- continue;
- }
-
- // First time. LOP is garbage and must be cleared below.
- LiveOutSeen.set(Pred->getNumber());
-
- // Does Pred provide a live-out value?
- SlotIndex Start, Last;
- tie(Start, Last) = LIS.getSlotIndexes()->getMBBRange(Pred);
- Last = Last.getPrevSlot();
- VNInfo *VNI = LI->extendInBlock(Start, Last);
- LOP.first = VNI;
- if (VNI) {
- LOP.second = MDT[LIS.getMBBFromIndex(VNI->def)];
- if (TheVNI && TheVNI != VNI)
- UniqueVNI = false;
- TheVNI = VNI;
- continue;
- }
- LOP.second = 0;
-
- // No, we need a live-in value for Pred as well
- if (Pred != KillMBB)
- WorkList.push_back(Pred);
- else
- // Loopback to KillMBB, so value is really live through.
- Kill = SlotIndex();
- }
- }
-
- // Transfer WorkList to LiveInBlocks in reverse order.
- // This ordering works best with updateSSA().
- LiveInBlocks.clear();
- LiveInBlocks.reserve(WorkList.size());
- while(!WorkList.empty())
- LiveInBlocks.push_back(MDT[WorkList.pop_back_val()]);
-
- // The kill block may not be live-through.
- assert(LiveInBlocks.back().DomNode->getBlock() == KillMBB);
- LiveInBlocks.back().Kill = Kill;
-
- return UniqueVNI ? TheVNI : 0;
-}
-
-void SplitEditor::updateSSA() {
- // This is essentially the same iterative algorithm that SSAUpdater uses,
- // except we already have a dominator tree, so we don't have to recompute it.
- unsigned Changes;
- do {
- Changes = 0;
- // Propagate live-out values down the dominator tree, inserting phi-defs
- // when necessary.
- for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(),
- E = LiveInBlocks.end(); I != E; ++I) {
- MachineDomTreeNode *Node = I->DomNode;
- // Skip block if the live-in value has already been determined.
- if (!Node)
- continue;
- MachineBasicBlock *MBB = Node->getBlock();
- MachineDomTreeNode *IDom = Node->getIDom();
- LiveOutPair IDomValue;
-
- // We need a live-in value to a block with no immediate dominator?
- // This is probably an unreachable block that has survived somehow.
- bool needPHI = !IDom || !LiveOutSeen.test(IDom->getBlock()->getNumber());
-
- // IDom dominates all of our predecessors, but it may not be their
- // immediate dominator. Check if any of them have live-out values that are
- // properly dominated by IDom. If so, we need a phi-def here.
- if (!needPHI) {
- IDomValue = LiveOutCache[IDom->getBlock()];
- for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end(); PI != PE; ++PI) {
- LiveOutPair Value = LiveOutCache[*PI];
- if (!Value.first || Value.first == IDomValue.first)
- continue;
- // This predecessor is carrying something other than IDomValue.
- // It could be because IDomValue hasn't propagated yet, or it could be
- // because MBB is in the dominance frontier of that value.
- if (MDT.dominates(IDom, Value.second)) {
- needPHI = true;
- break;
- }
- }
- }
-
- // The value may be live-through even if Kill is set, as can happen when
- // we are called from extendRange. In that case LiveOutSeen is true, and
- // LiveOutCache indicates a foreign or missing value.
- LiveOutPair &LOP = LiveOutCache[MBB];
-
- // Create a phi-def if required.
- if (needPHI) {
- ++Changes;
- SlotIndex Start = LIS.getMBBStartIdx(MBB);
- unsigned RegIdx = RegAssign.lookup(Start);
- LiveInterval *LI = Edit->get(RegIdx);
- VNInfo *VNI = LI->getNextValue(Start, 0, LIS.getVNInfoAllocator());
- VNI->setIsPHIDef(true);
- I->Value = VNI;
- // This block is done, we know the final value.
- I->DomNode = 0;
- if (I->Kill.isValid())
- LI->addRange(LiveRange(Start, I->Kill, VNI));
- else {
- LI->addRange(LiveRange(Start, LIS.getMBBEndIdx(MBB), VNI));
- LOP = LiveOutPair(VNI, Node);
- }
- } else if (IDomValue.first) {
- // No phi-def here. Remember incoming value.
- I->Value = IDomValue.first;
- if (I->Kill.isValid())
- continue;
- // Propagate IDomValue if needed:
- // MBB is live-out and doesn't define its own value.
- if (LOP.second != Node && LOP.first != IDomValue.first) {
- ++Changes;
- LOP = IDomValue;
- }
- }
- }
- } while (Changes);
-
- // The values in LiveInBlocks are now accurate. No more phi-defs are needed
- // for these blocks, so we can color the live ranges.
- for (SmallVectorImpl<LiveInBlock>::iterator I = LiveInBlocks.begin(),
- E = LiveInBlocks.end(); I != E; ++I) {
- if (!I->DomNode)
- continue;
- assert(I->Value && "No live-in value found");
- MachineBasicBlock *MBB = I->DomNode->getBlock();
- SlotIndex Start = LIS.getMBBStartIdx(MBB);
- unsigned RegIdx = RegAssign.lookup(Start);
- LiveInterval *LI = Edit->get(RegIdx);
- LI->addRange(LiveRange(Start, I->Kill.isValid() ?
- I->Kill : LIS.getMBBEndIdx(MBB), I->Value));
- }
+ // Mark as complex mapped, forced.
+ VFP = ValueForcePair(0, true);
}
VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
@@ -710,17 +520,28 @@ SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
DEBUG(dbgs() << " leaveIntvAfter " << Idx);
// The interval must be live beyond the instruction at Idx.
- Idx = Idx.getBoundaryIndex();
- VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+ SlotIndex Boundary = Idx.getBoundaryIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Boundary);
if (!ParentVNI) {
DEBUG(dbgs() << ": not live\n");
- return Idx.getNextSlot();
+ return Boundary.getNextSlot();
}
DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
-
- MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+ MachineInstr *MI = LIS.getInstructionFromIndex(Boundary);
assert(MI && "No instruction at index");
- VNInfo *VNI = defFromParent(0, ParentVNI, Idx, *MI->getParent(),
+
+ // In spill mode, make live ranges as short as possible by inserting the copy
+ // before MI. This is only possible if that instruction doesn't redefine the
+ // value. The inserted COPY is not a kill, and we don't need to recompute
+ // the source live range. The spiller also won't try to hoist this copy.
+ if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
+ MI->readsVirtualRegister(Edit->getReg())) {
+ forceRecompute(0, ParentVNI);
+ defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
+ return Idx;
+ }
+
+ VNInfo *VNI = defFromParent(0, ParentVNI, Boundary, *MI->getParent(),
llvm::next(MachineBasicBlock::iterator(MI)));
return VNI->def;
}
@@ -730,7 +551,7 @@ SlotIndex SplitEditor::leaveIntvBefore(SlotIndex Idx) {
DEBUG(dbgs() << " leaveIntvBefore " << Idx);
// The interval must be live into the instruction at Idx.
- Idx = Idx.getBoundaryIndex();
+ Idx = Idx.getBaseIndex();
VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
if (!ParentVNI) {
DEBUG(dbgs() << ": not live\n");
@@ -770,19 +591,219 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
assert(LIS.getMBBFromIndex(Start) == LIS.getMBBFromIndex(End) &&
"Range cannot span basic blocks");
- // The complement interval will be extended as needed by extendRange().
+ // The complement interval will be extended as needed by LRCalc.extend().
if (ParentVNI)
- markComplexMapped(0, ParentVNI);
+ forceRecompute(0, ParentVNI);
DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
RegAssign.insert(Start, End, OpenIdx);
DEBUG(dump());
}
+//===----------------------------------------------------------------------===//
+// Spill modes
+//===----------------------------------------------------------------------===//
+
+void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
+ LiveInterval *LI = Edit->get(0);
+ DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
+ RegAssignMap::iterator AssignI;
+ AssignI.setMap(RegAssign);
+
+ for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+ VNInfo *VNI = Copies[i];
+ SlotIndex Def = VNI->def;
+ MachineInstr *MI = LIS.getInstructionFromIndex(Def);
+ assert(MI && "No instruction for back-copy");
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::iterator MBBI(MI);
+ bool AtBegin;
+ do AtBegin = MBBI == MBB->begin();
+ while (!AtBegin && (--MBBI)->isDebugValue());
+
+ DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
+ LI->removeValNo(VNI);
+ LIS.RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+
+ // Adjust RegAssign if a register assignment is killed at VNI->def. We
+ // want to avoid calculating the live range of the source register if
+ // possible.
+ AssignI.find(VNI->def.getPrevSlot());
+ if (!AssignI.valid() || AssignI.start() >= Def)
+ continue;
+ // If MI doesn't kill the assigned register, just leave it.
+ if (AssignI.stop() != Def)
+ continue;
+ unsigned RegIdx = AssignI.value();
+ if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
+ DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
+ forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
+ } else {
+ SlotIndex Kill = LIS.getInstructionIndex(MBBI).getDefIndex();
+ DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
+ AssignI.setStop(Kill);
+ }
+ }
+}
+
+MachineBasicBlock*
+SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB) {
+ if (MBB == DefMBB)
+ return MBB;
+ assert(MDT.dominates(DefMBB, MBB) && "MBB must be dominated by the def.");
+
+ const MachineLoopInfo &Loops = SA.Loops;
+ const MachineLoop *DefLoop = Loops.getLoopFor(DefMBB);
+ MachineDomTreeNode *DefDomNode = MDT[DefMBB];
+
+ // Best candidate so far.
+ MachineBasicBlock *BestMBB = MBB;
+ unsigned BestDepth = UINT_MAX;
+
+ for (;;) {
+ const MachineLoop *Loop = Loops.getLoopFor(MBB);
+
+ // MBB isn't in a loop, so it doesn't get any better. All dominators have a
+ // higher frequency by definition.
+ if (!Loop) {
+ DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+ << MBB->getNumber() << " at depth 0\n");
+ return MBB;
+ }
+
+ // We'll never be able to exit the DefLoop.
+ if (Loop == DefLoop) {
+ DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+ << MBB->getNumber() << " in the same loop\n");
+ return MBB;
+ }
+
+ // Least busy dominator seen so far.
+ unsigned Depth = Loop->getLoopDepth();
+ if (Depth < BestDepth) {
+ BestMBB = MBB;
+ BestDepth = Depth;
+ DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+ << MBB->getNumber() << " at depth " << Depth << '\n');
+ }
+
+ // Leave the loop by going to the immediate dominator of the loop header.
+ // This is a bigger stride than simply walking up the dominator tree.
+ MachineDomTreeNode *IDom = MDT[Loop->getHeader()]->getIDom();
+
+ // Too far up the dominator tree?
+ if (!IDom || !MDT.dominates(DefDomNode, IDom))
+ return BestMBB;
+
+ MBB = IDom->getBlock();
+ }
+}
+
+void SplitEditor::hoistCopiesForSize() {
+ // Get the complement interval, always RegIdx 0.
+ LiveInterval *LI = Edit->get(0);
+ LiveInterval *Parent = &Edit->getParent();
+
+ // Track the nearest common dominator for all back-copies for each ParentVNI,
+ // indexed by ParentVNI->id.
+ typedef std::pair<MachineBasicBlock*, SlotIndex> DomPair;
+ SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+
+ // Find the nearest common dominator for parent values with multiple
+ // back-copies. If a single back-copy dominates, put it in DomPair.second.
+ for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
+ VI != VE; ++VI) {
+ VNInfo *VNI = *VI;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ assert(ParentVNI && "Parent not live at complement def");
+
+ // Don't hoist remats. The complement is probably going to disappear
+ // completely anyway.
+ if (Edit->didRematerialize(ParentVNI))
+ continue;
+
+ MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);
+ DomPair &Dom = NearestDom[ParentVNI->id];
+
+ // Keep directly defined parent values. This is either a PHI or an
+ // instruction in the complement range. All other copies of ParentVNI
+ // should be eliminated.
+ if (VNI->def == ParentVNI->def) {
+ DEBUG(dbgs() << "Direct complement def at " << VNI->def << '\n');
+ Dom = DomPair(ValMBB, VNI->def);
+ continue;
+ }
+ // Skip the singly mapped values. There is nothing to gain from hoisting a
+ // single back-copy.
+ if (Values.lookup(std::make_pair(0, ParentVNI->id)).getPointer()) {
+ DEBUG(dbgs() << "Single complement def at " << VNI->def << '\n');
+ continue;
+ }
+
+ if (!Dom.first) {
+ // First time we see ParentVNI. VNI dominates itself.
+ Dom = DomPair(ValMBB, VNI->def);
+ } else if (Dom.first == ValMBB) {
+ // Two defs in the same block. Pick the earlier def.
+ if (!Dom.second.isValid() || VNI->def < Dom.second)
+ Dom.second = VNI->def;
+ } else {
+ // Different basic blocks. Check if one dominates.
+ MachineBasicBlock *Near =
+ MDT.findNearestCommonDominator(Dom.first, ValMBB);
+ if (Near == ValMBB)
+ // Def ValMBB dominates.
+ Dom = DomPair(ValMBB, VNI->def);
+ else if (Near != Dom.first)
+ // Neither dominates. Hoist to the common dominator; a new def is needed.
+ Dom = DomPair(Near, SlotIndex());
+ }
+
+ DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
+ << " for parent " << ParentVNI->id << '@' << ParentVNI->def
+ << " hoist to BB#" << Dom.first->getNumber() << ' '
+ << Dom.second << '\n');
+ }
+
+ // Insert the hoisted copies.
+ for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+ DomPair &Dom = NearestDom[i];
+ if (!Dom.first || Dom.second.isValid())
+ continue;
+ // This value needs a hoisted copy inserted at the end of Dom.first.
+ VNInfo *ParentVNI = Parent->getValNumInfo(i);
+ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
+ // Get a less loopy dominator than Dom.first.
+ Dom.first = findShallowDominator(Dom.first, DefMBB);
+ SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot();
+ Dom.second =
+ defFromParent(0, ParentVNI, Last, *Dom.first,
+ LIS.getLastSplitPoint(Edit->getParent(), Dom.first))->def;
+ }
+
+ // Remove redundant back-copies that are now known to be dominated by another
+ // def with the same value.
+ SmallVector<VNInfo*, 8> BackCopies;
+ for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
+ VI != VE; ++VI) {
+ VNInfo *VNI = *VI;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ const DomPair &Dom = NearestDom[ParentVNI->id];
+ if (!Dom.first || Dom.second == VNI->def)
+ continue;
+ BackCopies.push_back(VNI);
+ forceRecompute(0, ParentVNI);
+ }
+ removeBackCopies(BackCopies);
+}
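The per-value merge that fills NearestDom can be restated as a small standalone helper; the types are simplified stand-ins (block ids and def indices as ints, -1 meaning "invalid"), and findNCD stands in for MachineDominatorTree::findNearestCommonDominator:

  struct ToyDomPair { int MBB; int Def; };   // -1 = invalid

  void mergeBackCopy(ToyDomPair &Dom, int ValMBB, int Def,
                     int (*findNCD)(int, int)) {
    if (Dom.MBB < 0) {                       // first copy dominates itself
      Dom.MBB = ValMBB; Dom.Def = Def;
    } else if (Dom.MBB == ValMBB) {          // same block: keep earlier def
      if (Dom.Def < 0 || Def < Dom.Def)
        Dom.Def = Def;
    } else {
      int Near = findNCD(Dom.MBB, ValMBB);
      if (Near == ValMBB) {                  // new copy dominates the old
        Dom.MBB = ValMBB; Dom.Def = Def;
      } else if (Near != Dom.MBB) {          // neither dominates: hoist,
        Dom.MBB = Near; Dom.Def = -1;        // a new def will be inserted
      }                                      // else the old pair already wins
    }
  }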
+
+
/// transferValues - Transfer all possible values to the new live ranges.
-/// Values that were rematerialized are left alone, they need extendRange().
+/// Values that were rematerialized are left alone, they need LRCalc.extend().
bool SplitEditor::transferValues() {
bool Skipped = false;
- LiveInBlocks.clear();
RegAssignMap::const_iterator AssignI = RegAssign.begin();
for (LiveInterval::const_iterator ParentI = Edit->getParent().begin(),
ParentE = Edit->getParent().end(); ParentI != ParentE; ++ParentI) {
@@ -812,28 +833,23 @@ bool SplitEditor::transferValues() {
LiveInterval *LI = Edit->get(RegIdx);
// Check for a simply defined value that can be blitted directly.
- if (VNInfo *VNI = Values.lookup(std::make_pair(RegIdx, ParentVNI->id))) {
+ ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
+ if (VNInfo *VNI = VFP.getPointer()) {
DEBUG(dbgs() << ':' << VNI->id);
LI->addRange(LiveRange(Start, End, VNI));
Start = End;
continue;
}
- // Skip rematerialized values, we need to use extendRange() and
- // extendPHIKillRanges() to completely recompute the live ranges.
- if (Edit->didRematerialize(ParentVNI)) {
- DEBUG(dbgs() << "(remat)");
+ // Skip values with forced recomputation.
+ if (VFP.getInt()) {
+ DEBUG(dbgs() << "(recalc)");
Skipped = true;
Start = End;
continue;
}
- // Initialize the live-out cache the first time it is needed.
- if (LiveOutSeen.empty()) {
- unsigned N = VRM.getMachineFunction().getNumBlockIDs();
- LiveOutSeen.resize(N);
- LiveOutCache.resize(N);
- }
+ LiveRangeCalc &LRC = getLRCalc(RegIdx);
// This value has multiple defs in RegIdx, but it wasn't rematerialized,
// so the live range is accurate. Add live-in blocks in [Start;End) to the
@@ -844,15 +860,13 @@ bool SplitEditor::transferValues() {
// The first block may be live-in, or it may have its own def.
if (Start != BlockStart) {
- VNInfo *VNI = LI->extendInBlock(BlockStart,
- std::min(BlockEnd, End).getPrevSlot());
+ VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
assert(VNI && "Missing def for complex mapped value");
DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber());
// MBB has its own def. Is it also live-out?
- if (BlockEnd <= End) {
- LiveOutSeen.set(MBB->getNumber());
- LiveOutCache[MBB] = LiveOutPair(VNI, MDT[MBB]);
- }
+ if (BlockEnd <= End)
+ LRC.setLiveOutValue(MBB, VNI);
+
// Skip to the next block for live-in.
++MBB;
BlockStart = BlockEnd;
@@ -866,25 +880,19 @@ bool SplitEditor::transferValues() {
if (BlockStart == ParentVNI->def) {
// This block has the def of a parent PHI, so it isn't live-in.
assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
- VNInfo *VNI = LI->extendInBlock(BlockStart,
- std::min(BlockEnd, End).getPrevSlot());
+ VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
assert(VNI && "Missing def for complex mapped parent PHI");
- if (End >= BlockEnd) {
- // Live-out as well.
- LiveOutSeen.set(MBB->getNumber());
- LiveOutCache[MBB] = LiveOutPair(VNI, MDT[MBB]);
- }
+ if (End >= BlockEnd)
+ LRC.setLiveOutValue(MBB, VNI); // Live-out as well.
} else {
- // This block needs a live-in value.
- LiveInBlocks.push_back(MDT[MBB]);
- // The last block covered may not be live-out.
+ // This block needs a live-in value. The last block covered may not
+ // be live-out.
if (End < BlockEnd)
- LiveInBlocks.back().Kill = End;
+ LRC.addLiveInBlock(LI, MDT[MBB], End);
else {
- // Live-out, but we need updateSSA to tell us the value.
- LiveOutSeen.set(MBB->getNumber());
- LiveOutCache[MBB] = LiveOutPair((VNInfo*)0,
- (MachineDomTreeNode*)0);
+ // Live-through, and we don't know the value.
+ LRC.addLiveInBlock(LI, MDT[MBB]);
+ LRC.setLiveOutValue(MBB, 0);
}
}
BlockStart = BlockEnd;
@@ -895,8 +903,11 @@ bool SplitEditor::transferValues() {
DEBUG(dbgs() << '\n');
}
- if (!LiveInBlocks.empty())
- updateSSA();
+ LRCalc[0].calculateValues(LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
+ if (SpillMode)
+ LRCalc[1].calculateValues(LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
return Skipped;
}
@@ -909,16 +920,20 @@ void SplitEditor::extendPHIKillRanges() {
if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
continue;
unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
+ LiveInterval *LI = Edit->get(RegIdx);
+ LiveRangeCalc &LRC = getLRCalc(RegIdx);
MachineBasicBlock *MBB = LIS.getMBBFromIndex(PHIVNI->def);
for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
PE = MBB->pred_end(); PI != PE; ++PI) {
- SlotIndex End = LIS.getMBBEndIdx(*PI).getPrevSlot();
+ SlotIndex End = LIS.getMBBEndIdx(*PI);
+ SlotIndex LastUse = End.getPrevSlot();
// The predecessor may not have a live-out value. That is OK, like an
// undef PHI operand.
- if (Edit->getParent().liveAt(End)) {
- assert(RegAssign.lookup(End) == RegIdx &&
+ if (Edit->getParent().liveAt(LastUse)) {
+ assert(RegAssign.lookup(LastUse) == RegIdx &&
"Different register assignment in phi predecessor");
- extendRange(RegIdx, End);
+ LRC.extend(LI, End,
+ LIS.getSlotIndexes(), &MDT, &LIS.getVNInfoAllocator());
}
}
}
@@ -938,25 +953,22 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
continue;
}
- // <undef> operands don't really read the register, so just assign them to
- // the complement.
- if (MO.isUse() && MO.isUndef()) {
- MO.setReg(Edit->get(0)->reg);
- continue;
- }
-
+ // <undef> operands don't really read the register, so it doesn't matter
+ // which register we choose. When the use operand is tied to a def, we must
+ // use the same register as the def, so just do that always.
SlotIndex Idx = LIS.getInstructionIndex(MI);
- if (MO.isDef())
+ if (MO.isDef() || MO.isUndef())
Idx = MO.isEarlyClobber() ? Idx.getUseIndex() : Idx.getDefIndex();
// Rewrite to the mapped register at Idx.
unsigned RegIdx = RegAssign.lookup(Idx);
- MO.setReg(Edit->get(RegIdx)->reg);
+ LiveInterval *LI = Edit->get(RegIdx);
+ MO.setReg(LI->reg);
DEBUG(dbgs() << " rewr BB#" << MI->getParent()->getNumber() << '\t'
<< Idx << ':' << RegIdx << '\t' << *MI);
// Extend liveness to Idx if the instruction reads reg.
- if (!ExtendRanges)
+ if (!ExtendRanges || MO.isUndef())
continue;
// Skip instructions that don't read Reg.
@@ -971,7 +983,8 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
} else
Idx = Idx.getUseIndex();
- extendRange(RegIdx, Idx);
+ getLRCalc(RegIdx).extend(LI, Idx.getNextSlot(), LIS.getSlotIndexes(),
+ &MDT, &LIS.getVNInfoAllocator());
}
}
@@ -1019,11 +1032,24 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
VNI->setIsPHIDef(ParentVNI->isPHIDef());
VNI->setCopy(ParentVNI->getCopy());
- // Mark rematted values as complex everywhere to force liveness computation.
+ // Force rematted values to be recomputed everywhere.
// The new live ranges may be truncated.
if (Edit->didRematerialize(ParentVNI))
for (unsigned i = 0, e = Edit->size(); i != e; ++i)
- markComplexMapped(i, ParentVNI);
+ forceRecompute(i, ParentVNI);
+ }
+
+ // Hoist back-copies to the complement interval when in spill mode.
+ switch (SpillMode) {
+ case SM_Partition:
+ // Leave all back-copies as is.
+ break;
+ case SM_Size:
+ hoistCopiesForSize();
+ break;
+ case SM_Speed:
+ llvm_unreachable("Spill mode 'speed' not implemented yet");
+ break;
}
// Transfer the simply mapped values, check if any are skipped.
@@ -1081,50 +1107,39 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
// Single Block Splitting
//===----------------------------------------------------------------------===//
-/// getMultiUseBlocks - if CurLI has more than one use in a basic block, it
-/// may be an advantage to split CurLI for the duration of the block.
-bool SplitAnalysis::getMultiUseBlocks(BlockPtrSet &Blocks) {
- // If CurLI is local to one block, there is no point to splitting it.
- if (UseBlocks.size() <= 1)
+bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI,
+ bool SingleInstrs) const {
+ // Always split for multiple instructions.
+ if (!BI.isOneInstr())
+ return true;
+ // Don't split for single instructions unless explicitly requested.
+ if (!SingleInstrs)
return false;
- // Add blocks with multiple uses.
- for (unsigned i = 0, e = UseBlocks.size(); i != e; ++i) {
- const BlockInfo &BI = UseBlocks[i];
- if (BI.FirstUse == BI.LastUse)
- continue;
- Blocks.insert(BI.MBB);
- }
- return !Blocks.empty();
+ // Splitting a live-through range always makes progress.
+ if (BI.LiveIn && BI.LiveOut)
+ return true;
+ // No point in isolating a copy. It has no register class constraints.
+ if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike())
+ return false;
+ // Finally, don't isolate an end point that was created by earlier splits.
+ return isOriginalEndpoint(BI.FirstInstr);
}
void SplitEditor::splitSingleBlock(const SplitAnalysis::BlockInfo &BI) {
openIntv();
SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB->getNumber());
- SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstUse,
+ SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstInstr,
LastSplitPoint));
- if (!BI.LiveOut || BI.LastUse < LastSplitPoint) {
- useIntv(SegStart, leaveIntvAfter(BI.LastUse));
+ if (!BI.LiveOut || BI.LastInstr < LastSplitPoint) {
+ useIntv(SegStart, leaveIntvAfter(BI.LastInstr));
} else {
// The last use is after the last valid split point.
SlotIndex SegStop = leaveIntvBefore(LastSplitPoint);
useIntv(SegStart, SegStop);
- overlapIntv(SegStop, BI.LastUse);
+ overlapIntv(SegStop, BI.LastInstr);
}
}
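With splitSingleBlocks removed below, the per-block decision moves to the caller. A plausible driver pattern, hedged (SA and SE stand for the SplitAnalysis and SplitEditor instances a register allocator already holds):

  ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA.getUseBlocks();
  for (unsigned i = 0; i != UseBlocks.size(); ++i)
    if (SA.shouldSplitSingleBlock(UseBlocks[i], /*SingleInstrs=*/false))
      SE.splitSingleBlock(UseBlocks[i]);
  SE.finish();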
-/// splitSingleBlocks - Split CurLI into a separate live interval inside each
-/// basic block in Blocks.
-void SplitEditor::splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks) {
- DEBUG(dbgs() << " splitSingleBlocks for " << Blocks.size() << " blocks.\n");
- ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA.getUseBlocks();
- for (unsigned i = 0; i != UseBlocks.size(); ++i) {
- const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
- if (Blocks.count(BI.MBB))
- splitSingleBlock(BI);
- }
- finish();
-}
-
//===----------------------------------------------------------------------===//
// Global Live Range Splitting Support
@@ -1149,6 +1164,12 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
assert((IntvIn || IntvOut) && "Use splitSingleBlock for isolated blocks");
+ assert((!LeaveBefore || LeaveBefore < Stop) && "Interference after block");
+ assert((!IntvIn || !LeaveBefore || LeaveBefore > Start) && "Impossible intf");
+ assert((!EnterAfter || EnterAfter >= Start) && "Interference before block");
+
+ MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);
+
if (!IntvOut) {
DEBUG(dbgs() << ", spill on entry.\n");
//
@@ -1157,7 +1178,6 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
// -____________ Spill on entry.
//
selectIntv(IntvIn);
- MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);
SlotIndex Idx = leaveIntvAtTop(*MBB);
assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
(void)Idx;
@@ -1172,7 +1192,6 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
// ___________-- Reload on exit.
//
selectIntv(IntvOut);
- MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);
SlotIndex Idx = enterIntvAtEnd(*MBB);
assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
(void)Idx;
@@ -1192,6 +1211,7 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
// We cannot legally insert splits after LSP.
SlotIndex LSP = SA.getLastSplitPoint(MBBNum);
+ assert((!IntvOut || !EnterAfter || EnterAfter < LSP) && "Impossible intf");
if (IntvIn != IntvOut && (!LeaveBefore || !EnterAfter ||
LeaveBefore.getBaseIndex() > EnterAfter.getBoundaryIndex())) {
@@ -1201,10 +1221,14 @@ void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
// |-----------| Live through.
// ------======= Switch intervals between interference.
//
- SlotIndex Cut = (LeaveBefore && LeaveBefore < LSP) ? LeaveBefore : LSP;
selectIntv(IntvOut);
- SlotIndex Idx = enterIntvBefore(Cut);
- useIntv(Idx, Stop);
+ SlotIndex Idx;
+ if (LeaveBefore && LeaveBefore < LSP) {
+ Idx = enterIntvBefore(LeaveBefore);
+ useIntv(Idx, Stop);
+ } else {
+ Idx = enterIntvAtEnd(*MBB);
+ }
selectIntv(IntvIn);
useIntv(Start, Idx);
assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
@@ -1238,7 +1262,7 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " [" << Start << ';' << Stop
- << "), uses " << BI.FirstUse << '-' << BI.LastUse
+ << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
<< ", reg-in " << IntvIn << ", leave before " << LeaveBefore
<< (BI.LiveOut ? ", stack-out" : ", killed in block"));
@@ -1246,7 +1270,7 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
assert(BI.LiveIn && "Must be live-in");
assert((!LeaveBefore || LeaveBefore > Start) && "Bad interference");
- if (!BI.LiveOut && (!LeaveBefore || LeaveBefore >= BI.LastUse)) {
+ if (!BI.LiveOut && (!LeaveBefore || LeaveBefore >= BI.LastInstr)) {
DEBUG(dbgs() << " before interference.\n");
//
// <<< Interference after kill.
@@ -1254,13 +1278,13 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
// ========= Use IntvIn everywhere.
//
selectIntv(IntvIn);
- useIntv(Start, BI.LastUse);
+ useIntv(Start, BI.LastInstr);
return;
}
SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());
- if (!LeaveBefore || LeaveBefore > BI.LastUse.getBoundaryIndex()) {
+ if (!LeaveBefore || LeaveBefore > BI.LastInstr.getBoundaryIndex()) {
//
// <<< Possible interference after last use.
// |---o---o---| Live-out on stack.
@@ -1271,17 +1295,17 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
// ============ Copy to stack after LSP, overlap IntvIn.
// \_____ Stack interval is live-out.
//
- if (BI.LastUse < LSP) {
+ if (BI.LastInstr < LSP) {
DEBUG(dbgs() << ", spill after last use before interference.\n");
selectIntv(IntvIn);
- SlotIndex Idx = leaveIntvAfter(BI.LastUse);
+ SlotIndex Idx = leaveIntvAfter(BI.LastInstr);
useIntv(Start, Idx);
assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
} else {
DEBUG(dbgs() << ", spill before last split point.\n");
selectIntv(IntvIn);
SlotIndex Idx = leaveIntvBefore(LSP);
- overlapIntv(Idx, BI.LastUse);
+ overlapIntv(Idx, BI.LastInstr);
useIntv(Start, Idx);
assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
}
@@ -1295,13 +1319,13 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
(void)LocalIntv;
DEBUG(dbgs() << ", creating local interval " << LocalIntv << ".\n");
- if (!BI.LiveOut || BI.LastUse < LSP) {
+ if (!BI.LiveOut || BI.LastInstr < LSP) {
//
// <<<<<<< Interference overlapping uses.
// |---o---o---| Live-out on stack.
// =====----____ Leave IntvIn before interference, then spill.
//
- SlotIndex To = leaveIntvAfter(BI.LastUse);
+ SlotIndex To = leaveIntvAfter(BI.LastInstr);
SlotIndex From = enterIntvBefore(LeaveBefore);
useIntv(From, To);
selectIntv(IntvIn);
@@ -1316,7 +1340,7 @@ void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
// \_____ Stack interval is live-out.
//
SlotIndex To = leaveIntvBefore(LSP);
- overlapIntv(To, BI.LastUse);
+ overlapIntv(To, BI.LastInstr);
SlotIndex From = enterIntvBefore(std::min(To, LeaveBefore));
useIntv(From, To);
selectIntv(IntvIn);
@@ -1330,7 +1354,7 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " [" << Start << ';' << Stop
- << "), uses " << BI.FirstUse << '-' << BI.LastUse
+ << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
<< ", reg-out " << IntvOut << ", enter after " << EnterAfter
<< (BI.LiveIn ? ", stack-in" : ", defined in block"));
@@ -1340,7 +1364,7 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
assert(BI.LiveOut && "Must be live-out");
assert((!EnterAfter || EnterAfter < LSP) && "Bad interference");
- if (!BI.LiveIn && (!EnterAfter || EnterAfter <= BI.FirstUse)) {
+ if (!BI.LiveIn && (!EnterAfter || EnterAfter <= BI.FirstInstr)) {
DEBUG(dbgs() << " after interference.\n");
//
// >>>> Interference before def.
@@ -1348,11 +1372,11 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
// ========= Use IntvOut everywhere.
//
selectIntv(IntvOut);
- useIntv(BI.FirstUse, Stop);
+ useIntv(BI.FirstInstr, Stop);
return;
}
- if (!EnterAfter || EnterAfter < BI.FirstUse.getBaseIndex()) {
+ if (!EnterAfter || EnterAfter < BI.FirstInstr.getBaseIndex()) {
DEBUG(dbgs() << ", reload after interference.\n");
//
// >>>> Interference before def.
@@ -1360,7 +1384,7 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
// ____========= Enter IntvOut before first use.
//
selectIntv(IntvOut);
- SlotIndex Idx = enterIntvBefore(std::min(LSP, BI.FirstUse));
+ SlotIndex Idx = enterIntvBefore(std::min(LSP, BI.FirstInstr));
useIntv(Idx, Stop);
assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
return;
@@ -1381,6 +1405,6 @@ void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
openIntv();
- SlotIndex From = enterIntvBefore(std::min(Idx, BI.FirstUse));
+ SlotIndex From = enterIntvBefore(std::min(Idx, BI.FirstInstr));
useIntv(From, Idx);
}
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index 7948b72..d8fc212 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -15,13 +15,11 @@
#ifndef LLVM_CODEGEN_SPLITKIT_H
#define LLVM_CODEGEN_SPLITKIT_H
+#include "LiveRangeCalc.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/IndexedMap.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/CodeGen/SlotIndexes.h"
namespace llvm {
@@ -38,12 +36,6 @@ class VirtRegMap;
class VNInfo;
class raw_ostream;
-/// At some point we should just include MachineDominators.h:
-class MachineDominatorTree;
-template <class NodeT> class DomTreeNodeBase;
-typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode;
-
-
/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
/// opportunities.
class SplitAnalysis {
@@ -76,16 +68,16 @@ public:
///
struct BlockInfo {
MachineBasicBlock *MBB;
- SlotIndex FirstUse; ///< First instr using current reg.
- SlotIndex LastUse; ///< Last instr using current reg.
- bool LiveThrough; ///< Live in whole block (Templ 5. above).
+ SlotIndex FirstInstr; ///< First instr accessing current reg.
+ SlotIndex LastInstr; ///< Last instr accessing current reg.
+ SlotIndex FirstDef; ///< First non-phi valno->def, or SlotIndex().
bool LiveIn; ///< Current reg is live in.
bool LiveOut; ///< Current reg is live out.
/// isOneInstr - Returns true when this BlockInfo describes a single
/// instruction.
bool isOneInstr() const {
- return SlotIndex::isSameInstr(FirstUse, LastUse);
+ return SlotIndex::isSameInstr(FirstInstr, LastInstr);
}
};
@@ -185,10 +177,15 @@ public:
typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet;
- /// getMultiUseBlocks - Add basic blocks to Blocks that may benefit from
- /// having CurLI split to a new live interval. Return true if Blocks can be
- /// passed to SplitEditor::splitSingleBlocks.
- bool getMultiUseBlocks(BlockPtrSet &Blocks);
+ /// shouldSplitSingleBlock - Returns true if it would help to create a local
+ /// live range for the instructions in BI. There is normally no benefit to
+ /// creating a live range for a single instruction, but it does enable
+ /// register class inflation if the instruction has a restricted register
+ /// class.
+ ///
+ /// @param BI The block to be isolated.
+ /// @param SingleInstrs True when single instructions should be isolated.
+ bool shouldSplitSingleBlock(const BlockInfo &BI, bool SingleInstrs) const;
};
@@ -212,6 +209,36 @@ class SplitEditor {
const TargetInstrInfo &TII;
const TargetRegisterInfo &TRI;
+public:
+
+ /// ComplementSpillMode - Select how the complement live range should be
+ /// created. SplitEditor automatically creates interval 0 to contain
+ /// anything that isn't added to another interval. This complement interval
+ /// can get quite complicated, and it can sometimes be an advantage to allow
+ /// it to overlap the other intervals. If it is going to spill anyway, no
+ /// registers are wasted by keeping a value in two places at the same time.
+ enum ComplementSpillMode {
+ /// SM_Partition (Default) - Try to create the complement interval so it
+ /// doesn't overlap any other intervals, and the original interval is
+ /// partitioned. This may require a large number of back copies and extra
+ /// PHI-defs. Only segments marked with overlapIntv will be overlapping.
+ SM_Partition,
+
+ /// SM_Size - Overlap intervals to minimize the number of inserted COPY
+ /// instructions. Copies to the complement interval are hoisted to their
+ /// common dominator, so only one COPY is required per value in the
+ /// complement interval. This also means that no extra PHI-defs need to be
+ /// inserted in the complement interval.
+ SM_Size,
+
+ /// SM_Speed - Overlap intervals to minimize the expected execution
+ /// frequency of the inserted copies. This is very similar to SM_Size, but
+ /// the complement interval may get some extra PHI-defs.
+ SM_Speed
+ };
+
+private:
+
/// Edit - The current parent register and new intervals created.
LiveRangeEdit *Edit;
@@ -220,6 +247,9 @@ class SplitEditor {
/// openIntv will be 1.
unsigned OpenIdx;
+ /// The current spill mode, selected by reset().
+ ComplementSpillMode SpillMode;
+
typedef IntervalMap<SlotIndex, unsigned> RegAssignMap;
/// Allocator for the interval map. This will eventually be shared with
@@ -231,65 +261,34 @@ class SplitEditor {
/// Idx.
RegAssignMap RegAssign;
- typedef DenseMap<std::pair<unsigned, unsigned>, VNInfo*> ValueMap;
+ typedef PointerIntPair<VNInfo*, 1> ValueForcePair;
+ typedef DenseMap<std::pair<unsigned, unsigned>, ValueForcePair> ValueMap;
/// Values - keep track of the mapping from parent values to values in the new
/// intervals. Given a pair (RegIdx, ParentVNI->id), Values contains:
///
/// 1. No entry - the value is not mapped to Edit.get(RegIdx).
- /// 2. Null - the value is mapped to multiple values in Edit.get(RegIdx).
- /// Each value is represented by a minimal live range at its def.
- /// 3. A non-null VNInfo - the value is mapped to a single new value.
+ /// 2. (Null, false) - the value is mapped to multiple values in
+ /// Edit.get(RegIdx). Each value is represented by a minimal live range at
+ /// its def. The full live range can be inferred exactly from the range
+ /// of RegIdx in RegAssign.
+ /// 3. (Null, true). As above, but the ranges in RegAssign are too large, and
+ /// the live range must be recomputed using LiveRangeCalc::extend().
+ /// 4. (VNI, false) The value is mapped to a single new value.
/// The new value has no live ranges anywhere.
ValueMap Values;
- typedef std::pair<VNInfo*, MachineDomTreeNode*> LiveOutPair;
- typedef IndexedMap<LiveOutPair, MBB2NumberFunctor> LiveOutMap;
-
- // LiveOutCache - Map each basic block where a new register is live out to the
- // live-out value and its defining block.
- // One of these conditions shall be true:
- //
- // 1. !LiveOutCache.count(MBB)
- // 2. LiveOutCache[MBB].second.getNode() == MBB
- // 3. forall P in preds(MBB): LiveOutCache[P] == LiveOutCache[MBB]
- //
- // This is only a cache, the values can be computed as:
- //
- // VNI = Edit.get(RegIdx)->getVNInfoAt(LIS.getMBBEndIdx(MBB))
- // Node = mbt_[LIS.getMBBFromIndex(VNI->def)]
- //
- // The cache is also used as a visited set by extendRange(). It can be shared
- // by all the new registers because at most one is live out of each block.
- LiveOutMap LiveOutCache;
-
- // LiveOutSeen - Indexed by MBB->getNumber(), a bit is set for each valid
- // entry in LiveOutCache.
- BitVector LiveOutSeen;
-
- /// LiveInBlock - Info for updateSSA() about a block where a register is
- /// live-in.
- /// The updateSSA caller provides DomNode and Kill inside MBB, updateSSA()
- /// adds the computed live-in value.
- struct LiveInBlock {
- // Dominator tree node for the block.
- // Cleared by updateSSA when the final value has been determined.
- MachineDomTreeNode *DomNode;
-
- // Live-in value filled in by updateSSA once it is known.
- VNInfo *Value;
-
- // Position in block where the live-in range ends, or SlotIndex() if the
- // range passes through the block.
- SlotIndex Kill;
-
- LiveInBlock(MachineDomTreeNode *node) : DomNode(node), Value(0) {}
- };
+ /// LRCalc - Cache for computing live ranges and SSA update. Each instance
+ /// can only handle non-overlapping live ranges, so use a separate
+ /// LiveRangeCalc instance for the complement interval when in spill mode.
+ LiveRangeCalc LRCalc[2];
- /// LiveInBlocks - List of live-in blocks used by findReachingDefs() and
- /// updateSSA(). This list is usually empty, it exists here to avoid frequent
- /// reallocations.
- SmallVector<LiveInBlock, 16> LiveInBlocks;
+ /// getLRCalc - Return the LRCalc to use for RegIdx. In spill mode, the
+ /// complement interval can overlap the other intervals, so it gets its own
+ /// LRCalc instance. When not in spill mode, all intervals can share one.
+ LiveRangeCalc &getLRCalc(unsigned RegIdx) {
+ return LRCalc[SpillMode != SM_Partition && RegIdx != 0];
+ }
/// defValue - define a value in RegIdx from ParentVNI at Idx.
/// Idx does not have to be ParentVNI->def, but it must be contained within
@@ -298,9 +297,11 @@ class SplitEditor {
/// Return the new LI value.
VNInfo *defValue(unsigned RegIdx, const VNInfo *ParentVNI, SlotIndex Idx);
- /// markComplexMapped - Mark ParentVNI as complex mapped in RegIdx regardless
- /// of the number of defs.
- void markComplexMapped(unsigned RegIdx, const VNInfo *ParentVNI);
+ /// forceRecompute - Force the live range of ParentVNI in RegIdx to be
+ /// recomputed by LiveRangeCalc::extend regardless of the number of defs.
+ /// This is used for values whose live range doesn't match RegAssign exactly.
+ /// They could have been rematerialized, or back-copies may have been moved.
+ void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI);
/// defFromParent - Define Reg from ParentVNI at UseIdx using either
/// rematerialization or a COPY from parent. Return the new value.
@@ -310,22 +311,18 @@ class SplitEditor {
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I);
- /// extendRange - Extend the live range of Edit.get(RegIdx) so it reaches Idx.
- /// Insert PHIDefs as needed to preserve SSA form.
- void extendRange(unsigned RegIdx, SlotIndex Idx);
+ /// removeBackCopies - Remove the copy instructions that define the given
+ /// values in the complement interval.
+ void removeBackCopies(SmallVectorImpl<VNInfo*> &Copies);
- /// findReachingDefs - Starting from MBB, add blocks to LiveInBlocks until all
- /// reaching defs for LI are found.
- /// @param LI Live interval whose value is needed.
- /// @param MBB Block where LI should be live-in.
- /// @param Kill Kill point in MBB.
- /// @return Unique value seen, or NULL.
- VNInfo *findReachingDefs(LiveInterval *LI, MachineBasicBlock *MBB,
- SlotIndex Kill);
+ /// findShallowDominator - Returns the least busy dominator of MBB that is
+ /// also dominated by DefMBB. Busy is measured by loop depth.
+ MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB);
- /// updateSSA - Compute and insert PHIDefs such that all blocks in
- // LiveInBlocks get a known live-in value. Add live ranges to the blocks.
- void updateSSA();
+ /// hoistCopiesForSize - Hoist back-copies to the complement interval in a
+ /// way that minimizes code size. This implements the SM_Size spill mode.
+ void hoistCopiesForSize();
/// transferValues - Transfer values to the new ranges.
/// Return true if any ranges were skipped.
@@ -348,7 +345,7 @@ public:
MachineDominatorTree&);
/// reset - Prepare for a new split.
- void reset(LiveRangeEdit&);
+ void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
/// Create a new virtual register and live interval.
/// Return the interval index, starting from 1. Interval index 0 is the
@@ -423,10 +420,6 @@ public:
/// split, and doesn't call finish().
void splitSingleBlock(const SplitAnalysis::BlockInfo &BI);
- /// splitSingleBlocks - Split CurLI into a separate live interval inside each
- /// basic block in Blocks.
- void splitSingleBlocks(const SplitAnalysis::BlockPtrSet &Blocks);
-
/// splitLiveThroughBlock - Split CurLI in the given block such that it
/// enters the block in IntvIn and leaves it in IntvOut. There may be uses in
/// the block, but they will be ignored when placing split points.
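A minimal sketch of selecting a spill mode through the extended reset() (SA, LIS, VRM, MDT and the LiveRangeEdit LRE are assumed to be set up as usual by the caller):

  SplitEditor SE(SA, LIS, VRM, MDT);
  SE.reset(LRE, SplitEditor::SM_Size);  // omit the mode to keep SM_Partition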
diff --git a/lib/CodeGen/Splitter.cpp b/lib/CodeGen/Splitter.cpp
index ec75df4..77973b7 100644
--- a/lib/CodeGen/Splitter.cpp
+++ b/lib/CodeGen/Splitter.cpp
@@ -11,7 +11,6 @@
#include "Splitter.h"
-#include "RegisterCoalescer.h"
#include "llvm/Module.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -20,6 +19,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -262,7 +262,7 @@ namespace llvm {
au.addPreserved<MachineDominatorTree>();
au.addRequired<MachineLoopInfo>();
au.addPreserved<MachineLoopInfo>();
- au.addPreserved<RegisterCoalescer>();
+ au.addPreservedID(RegisterCoalescerPassID);
au.addPreserved<CalculateSpillWeights>();
au.addPreserved<LiveStacks>();
au.addRequired<SlotIndexes>();
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index d3cbd15..1f0e5a2 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -123,7 +123,7 @@ bool StackProtector::RequiresStackProtector() const {
// protectors.
return true;
- if (const ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) {
// We apparently only care about character arrays.
if (!AT->getElementType()->isIntegerTy(8))
continue;
@@ -165,7 +165,7 @@ bool StackProtector::InsertStackProtectors() {
// StackGuard = load __stack_chk_guard
// call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
//
- const PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
+ PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
unsigned AddressSpace, Offset;
if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
Constant *OffsetVal =
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
index 227eb47..260cc0e 100644
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ b/lib/CodeGen/StrongPHIElimination.cpp
@@ -47,6 +47,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -673,7 +674,7 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
if (PHIColor && SrcColor == PHIColor) {
LiveInterval &SrcInterval = LI->getInterval(SrcReg);
SlotIndex PredIndex = LI->getMBBEndIdx(PredBB);
- VNInfo *SrcVNI = SrcInterval.getVNInfoAt(PredIndex.getPrevIndex());
+ VNInfo *SrcVNI = SrcInterval.getVNInfoBefore(PredIndex);
assert(SrcVNI);
SrcVNI->setHasPHIKill(true);
continue;
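getVNInfoBefore(Idx) names the value live immediately before Idx, which is the live-out query the old getPrevIndex() arithmetic spelled by hand. A hedged sketch of the equivalence at a block boundary, for a register that is live out of PredBB as SrcReg is here:

    SlotIndex End = LI->getMBBEndIdx(PredBB);
    // Old spelling: step back one instruction index, then query there.
    VNInfo *OldWay = SrcInterval.getVNInfoAt(End.getPrevIndex());
    // New spelling: ask directly for the value live just before End.
    VNInfo *NewWay = SrcInterval.getVNInfoBefore(End);
    assert(OldWay == NewWay && "same live-out value of PredBB");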
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 6b801cb..3a6211a 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index 86e71d8..f32678f 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -74,23 +74,25 @@ MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI,
assert(MI->getOperand(Idx1).isReg() && MI->getOperand(Idx2).isReg() &&
"This only knows how to commute register operands so far");
+ unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
unsigned Reg1 = MI->getOperand(Idx1).getReg();
unsigned Reg2 = MI->getOperand(Idx2).getReg();
bool Reg1IsKill = MI->getOperand(Idx1).isKill();
bool Reg2IsKill = MI->getOperand(Idx2).isKill();
- bool ChangeReg0 = false;
- if (HasDef && MI->getOperand(0).getReg() == Reg1) {
- // Must be two address instruction!
- assert(MI->getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
- "Expecting a two-address instruction!");
+ // If the destination is tied to either of the commuted source registers,
+ // then it must be updated.
+ if (HasDef && Reg0 == Reg1 &&
+ MI->getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO) == 0) {
Reg2IsKill = false;
- ChangeReg0 = true;
+ Reg0 = Reg2;
+ } else if (HasDef && Reg0 == Reg2 &&
+ MI->getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO) == 0) {
+ Reg1IsKill = false;
+ Reg0 = Reg1;
}
if (NewMI) {
// Create a new instruction.
- unsigned Reg0 = HasDef
- ? (ChangeReg0 ? Reg2 : MI->getOperand(0).getReg()) : 0;
bool Reg0IsDead = HasDef ? MI->getOperand(0).isDead() : false;
MachineFunction &MF = *MI->getParent()->getParent();
if (HasDef)
@@ -104,8 +106,8 @@ MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI,
.addReg(Reg1, getKillRegState(Reg2IsKill));
}
- if (ChangeReg0)
- MI->getOperand(0).setReg(Reg2);
+ if (HasDef)
+ MI->getOperand(0).setReg(Reg0);
MI->getOperand(Idx2).setReg(Reg1);
MI->getOperand(Idx1).setReg(Reg2);
MI->getOperand(Idx2).setIsKill(Reg1IsKill);
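The rewrite generalizes the old ChangeReg0 flag: previously the code assumed the def could only be tied to operand Idx1, whereas the new code consults the TIED_TO constraint for both Idx1 and Idx2 and retargets the def accordingly, so the two-address constraint survives whichever source is tied. Callers are unaffected; typical in-place use remains (a hedged sketch with MI and MF assumed in scope):

    const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
    // In-place commute (NewMI == false): returns the commuted instruction,
    // or 0 if the operands cannot be commuted.
    if (MachineInstr *Commuted = TII->commuteInstruction(MI, /*NewMI=*/false))
      MI = Commuted; // For in-place commutes this is MI itself.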
@@ -160,6 +162,42 @@ bool TargetInstrInfoImpl::PredicateInstruction(MachineInstr *MI,
return MadeChange;
}
+bool TargetInstrInfoImpl::hasLoadFromStackSlot(const MachineInstr *MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+ oe = MI->memoperands_end();
+ o != oe;
+ ++o) {
+ if ((*o)->isLoad() && (*o)->getValue())
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+ FrameIndex = Value->getFrameIndex();
+ MMO = *o;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool TargetInstrInfoImpl::hasStoreToStackSlot(const MachineInstr *MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+ oe = MI->memoperands_end();
+ o != oe;
+ ++o) {
+ if ((*o)->isStore() && (*o)->getValue())
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+ FrameIndex = Value->getFrameIndex();
+ MMO = *o;
+ return true;
+ }
+ }
+ return false;
+}
+
void TargetInstrInfoImpl::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg,
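These default implementations give every target a way to recognize loads and stores of fixed stack slots from an instruction's attached memory operands, rather than requiring opcode-specific overrides. A hedged usage sketch (MI and TII assumed in scope; dbgs() comes from llvm/Support/Debug.h):

    const MachineMemOperand *MMO = 0;
    int FrameIndex;
    if (TII->hasLoadFromStackSlot(MI, MMO, FrameIndex))
      dbgs() << "loads " << MMO->getSize() << " bytes from frame index "
             << FrameIndex << '\n';

Only memory operands carrying a FixedStackPseudoSourceValue, i.e. a known frame index, are matched by these defaults.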
@@ -324,6 +362,19 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
const TargetInstrInfo &TII = *TM.getInstrInfo();
const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+ // Remat clients assume operand 0 is the defined register.
+ if (!MI->getNumOperands() || !MI->getOperand(0).isReg())
+ return false;
+ unsigned DefReg = MI->getOperand(0).getReg();
+
+ // A sub-register definition can only be rematerialized if the instruction
+ // doesn't read the other parts of the register. Otherwise it is really a
+ // read-modify-write operation on the full virtual register, which cannot be
+ // moved safely.
+ if (TargetRegisterInfo::isVirtualRegister(DefReg) &&
+ MI->getOperand(0).getSubReg() && MI->readsVirtualRegister(DefReg))
+ return false;
+
// A load from a fixed stack slot can be rematerialized. This may be
// redundant with subsequent checks, but it's target-independent,
// simple, and a common case.
@@ -383,8 +434,9 @@ isReallyTriviallyReMaterializableGeneric(const MachineInstr *MI,
continue;
}
- // Only allow one virtual-register def, and that in the first operand.
- if (MO.isDef() != (i == 0))
+ // Only allow one virtual-register def. There may be multiple defs of the
+ // same virtual register, though.
+ if (MO.isDef() && Reg != DefReg)
return false;
// Don't allow any virtual-register uses. Rematting an instruction with
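The new sub-register guard matters because a partial def leaves the other lanes of the register flowing through the instruction. A hedged illustration (the opcode and register names are made up):

    // %vreg2:ssub_0<def> = VLDRS <fi#1>, 0
    //
    // Only the low lane is written; the high lane of %vreg2 is implicitly
    // read and preserved. Rematerializing this load elsewhere would splice
    // it into a possibly different value of that high lane, hence:
    if (MI->getOperand(0).getSubReg() && MI->readsVirtualRegister(DefReg))
      return false; // read-modify-write, not trivially rematerializable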
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index a3c5620..fb87154 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -43,153 +43,6 @@ using namespace dwarf;
// ELF
//===----------------------------------------------------------------------===//
-TargetLoweringObjectFileELF::TargetLoweringObjectFileELF()
- : TargetLoweringObjectFile(),
- TLSDataSection(0),
- TLSBSSSection(0),
- DataRelSection(0),
- DataRelLocalSection(0),
- DataRelROSection(0),
- DataRelROLocalSection(0),
- MergeableConst4Section(0),
- MergeableConst8Section(0),
- MergeableConst16Section(0) {
-}
-
-void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFile::Initialize(Ctx, TM);
-
- BSSSection =
- getContext().getELFSection(".bss", ELF::SHT_NOBITS,
- ELF::SHF_WRITE |ELF::SHF_ALLOC,
- SectionKind::getBSS());
-
- TextSection =
- getContext().getELFSection(".text", ELF::SHT_PROGBITS,
- ELF::SHF_EXECINSTR |
- ELF::SHF_ALLOC,
- SectionKind::getText());
-
- DataSection =
- getContext().getELFSection(".data", ELF::SHT_PROGBITS,
- ELF::SHF_WRITE |ELF::SHF_ALLOC,
- SectionKind::getDataRel());
-
- ReadOnlySection =
- getContext().getELFSection(".rodata", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC,
- SectionKind::getReadOnly());
-
- TLSDataSection =
- getContext().getELFSection(".tdata", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC | ELF::SHF_TLS |
- ELF::SHF_WRITE,
- SectionKind::getThreadData());
-
- TLSBSSSection =
- getContext().getELFSection(".tbss", ELF::SHT_NOBITS,
- ELF::SHF_ALLOC | ELF::SHF_TLS |
- ELF::SHF_WRITE,
- SectionKind::getThreadBSS());
-
- DataRelSection =
- getContext().getELFSection(".data.rel", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getDataRel());
-
- DataRelLocalSection =
- getContext().getELFSection(".data.rel.local", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getDataRelLocal());
-
- DataRelROSection =
- getContext().getELFSection(".data.rel.ro", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getReadOnlyWithRel());
-
- DataRelROLocalSection =
- getContext().getELFSection(".data.rel.ro.local", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getReadOnlyWithRelLocal());
-
- MergeableConst4Section =
- getContext().getELFSection(".rodata.cst4", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_MERGE,
- SectionKind::getMergeableConst4());
-
- MergeableConst8Section =
- getContext().getELFSection(".rodata.cst8", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_MERGE,
- SectionKind::getMergeableConst8());
-
- MergeableConst16Section =
- getContext().getELFSection(".rodata.cst16", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_MERGE,
- SectionKind::getMergeableConst16());
-
- StaticCtorSection =
- getContext().getELFSection(".ctors", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getDataRel());
-
- StaticDtorSection =
- getContext().getELFSection(".dtors", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC |ELF::SHF_WRITE,
- SectionKind::getDataRel());
-
- // Exception Handling Sections.
-
- // FIXME: We're emitting LSDA info into a readonly section on ELF, even though
- // it contains relocatable pointers. In PIC mode, this is probably a big
- // runtime hit for C++ apps. Either the contents of the LSDA need to be
- // adjusted or this should be a data section.
- LSDASection =
- getContext().getELFSection(".gcc_except_table", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC,
- SectionKind::getReadOnly());
- // Debug Info Sections.
- DwarfAbbrevSection =
- getContext().getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfInfoSection =
- getContext().getELFSection(".debug_info", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfLineSection =
- getContext().getELFSection(".debug_line", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfFrameSection =
- getContext().getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfPubNamesSection =
- getContext().getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfPubTypesSection =
- getContext().getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfStrSection =
- getContext().getELFSection(".debug_str", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfLocSection =
- getContext().getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfARangesSection =
- getContext().getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfRangesSection =
- getContext().getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
- DwarfMacroInfoSection =
- getContext().getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0,
- SectionKind::getMetadata());
-}
-
-const MCSection *TargetLoweringObjectFileELF::getEHFrameSection() const {
- return getContext().getELFSection(".eh_frame", ELF::SHT_PROGBITS,
- ELF::SHF_ALLOC,
- SectionKind::getDataRel());
-}
-
MCSymbol *
TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV,
Mangler *Mang,
@@ -493,221 +346,6 @@ getExprForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
// MachO
//===----------------------------------------------------------------------===//
-TargetLoweringObjectFileMachO::TargetLoweringObjectFileMachO()
- : TargetLoweringObjectFile(),
- TLSDataSection(0),
- TLSBSSSection(0),
- TLSTLVSection(0),
- TLSThreadInitSection(0),
- CStringSection(0),
- UStringSection(0),
- TextCoalSection(0),
- ConstTextCoalSection(0),
- ConstDataSection(0),
- DataCoalSection(0),
- DataCommonSection(0),
- DataBSSSection(0),
- FourByteConstantSection(0),
- EightByteConstantSection(0),
- SixteenByteConstantSection(0),
- LazySymbolPointerSection(0),
- NonLazySymbolPointerSection(0) {
-}
-
-void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- IsFunctionEHFrameSymbolPrivate = false;
- SupportsWeakOmittedEHFrame = false;
-
- // .comm doesn't support alignment before Leopard.
- Triple T(((LLVMTargetMachine&)TM).getTargetTriple());
- if (T.isMacOSX() && T.isMacOSXVersionLT(10, 5))
- CommDirectiveSupportsAlignment = false;
-
- TargetLoweringObjectFile::Initialize(Ctx, TM);
-
- TextSection // .text
- = getContext().getMachOSection("__TEXT", "__text",
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
- SectionKind::getText());
- DataSection // .data
- = getContext().getMachOSection("__DATA", "__data", 0,
- SectionKind::getDataRel());
-
- TLSDataSection // .tdata
- = getContext().getMachOSection("__DATA", "__thread_data",
- MCSectionMachO::S_THREAD_LOCAL_REGULAR,
- SectionKind::getDataRel());
- TLSBSSSection // .tbss
- = getContext().getMachOSection("__DATA", "__thread_bss",
- MCSectionMachO::S_THREAD_LOCAL_ZEROFILL,
- SectionKind::getThreadBSS());
-
- // TODO: Verify datarel below.
- TLSTLVSection // .tlv
- = getContext().getMachOSection("__DATA", "__thread_vars",
- MCSectionMachO::S_THREAD_LOCAL_VARIABLES,
- SectionKind::getDataRel());
-
- TLSThreadInitSection
- = getContext().getMachOSection("__DATA", "__thread_init",
- MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS,
- SectionKind::getDataRel());
-
- CStringSection // .cstring
- = getContext().getMachOSection("__TEXT", "__cstring",
- MCSectionMachO::S_CSTRING_LITERALS,
- SectionKind::getMergeable1ByteCString());
- UStringSection
- = getContext().getMachOSection("__TEXT","__ustring", 0,
- SectionKind::getMergeable2ByteCString());
- FourByteConstantSection // .literal4
- = getContext().getMachOSection("__TEXT", "__literal4",
- MCSectionMachO::S_4BYTE_LITERALS,
- SectionKind::getMergeableConst4());
- EightByteConstantSection // .literal8
- = getContext().getMachOSection("__TEXT", "__literal8",
- MCSectionMachO::S_8BYTE_LITERALS,
- SectionKind::getMergeableConst8());
-
- // ld_classic doesn't support .literal16 in 32-bit mode, and ld64 falls back
- // to using it in -static mode.
- SixteenByteConstantSection = 0;
- if (TM.getRelocationModel() != Reloc::Static &&
- TM.getTargetData()->getPointerSize() == 32)
- SixteenByteConstantSection = // .literal16
- getContext().getMachOSection("__TEXT", "__literal16",
- MCSectionMachO::S_16BYTE_LITERALS,
- SectionKind::getMergeableConst16());
-
- ReadOnlySection // .const
- = getContext().getMachOSection("__TEXT", "__const", 0,
- SectionKind::getReadOnly());
-
- TextCoalSection
- = getContext().getMachOSection("__TEXT", "__textcoal_nt",
- MCSectionMachO::S_COALESCED |
- MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
- SectionKind::getText());
- ConstTextCoalSection
- = getContext().getMachOSection("__TEXT", "__const_coal",
- MCSectionMachO::S_COALESCED,
- SectionKind::getReadOnly());
- ConstDataSection // .const_data
- = getContext().getMachOSection("__DATA", "__const", 0,
- SectionKind::getReadOnlyWithRel());
- DataCoalSection
- = getContext().getMachOSection("__DATA","__datacoal_nt",
- MCSectionMachO::S_COALESCED,
- SectionKind::getDataRel());
- DataCommonSection
- = getContext().getMachOSection("__DATA","__common",
- MCSectionMachO::S_ZEROFILL,
- SectionKind::getBSS());
- DataBSSSection
- = getContext().getMachOSection("__DATA","__bss", MCSectionMachO::S_ZEROFILL,
- SectionKind::getBSS());
-
-
- LazySymbolPointerSection
- = getContext().getMachOSection("__DATA", "__la_symbol_ptr",
- MCSectionMachO::S_LAZY_SYMBOL_POINTERS,
- SectionKind::getMetadata());
- NonLazySymbolPointerSection
- = getContext().getMachOSection("__DATA", "__nl_symbol_ptr",
- MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
- SectionKind::getMetadata());
-
- if (TM.getRelocationModel() == Reloc::Static) {
- StaticCtorSection
- = getContext().getMachOSection("__TEXT", "__constructor", 0,
- SectionKind::getDataRel());
- StaticDtorSection
- = getContext().getMachOSection("__TEXT", "__destructor", 0,
- SectionKind::getDataRel());
- } else {
- StaticCtorSection
- = getContext().getMachOSection("__DATA", "__mod_init_func",
- MCSectionMachO::S_MOD_INIT_FUNC_POINTERS,
- SectionKind::getDataRel());
- StaticDtorSection
- = getContext().getMachOSection("__DATA", "__mod_term_func",
- MCSectionMachO::S_MOD_TERM_FUNC_POINTERS,
- SectionKind::getDataRel());
- }
-
- // Exception Handling.
- LSDASection = getContext().getMachOSection("__TEXT", "__gcc_except_tab", 0,
- SectionKind::getReadOnlyWithRel());
-
- if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6))
- CompactUnwindSection =
- getContext().getMachOSection("__LD", "__compact_unwind",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getReadOnly());
-
- // Debug Information.
- DwarfAbbrevSection =
- getContext().getMachOSection("__DWARF", "__debug_abbrev",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfInfoSection =
- getContext().getMachOSection("__DWARF", "__debug_info",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfLineSection =
- getContext().getMachOSection("__DWARF", "__debug_line",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfFrameSection =
- getContext().getMachOSection("__DWARF", "__debug_frame",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfPubNamesSection =
- getContext().getMachOSection("__DWARF", "__debug_pubnames",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfPubTypesSection =
- getContext().getMachOSection("__DWARF", "__debug_pubtypes",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfStrSection =
- getContext().getMachOSection("__DWARF", "__debug_str",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfLocSection =
- getContext().getMachOSection("__DWARF", "__debug_loc",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfARangesSection =
- getContext().getMachOSection("__DWARF", "__debug_aranges",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfRangesSection =
- getContext().getMachOSection("__DWARF", "__debug_ranges",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfMacroInfoSection =
- getContext().getMachOSection("__DWARF", "__debug_macinfo",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
- DwarfDebugInlineSection =
- getContext().getMachOSection("__DWARF", "__debug_inlined",
- MCSectionMachO::S_ATTR_DEBUG,
- SectionKind::getMetadata());
-
- TLSExtraDataSection = TLSTLVSection;
-}
-
-const MCSection *TargetLoweringObjectFileMachO::getEHFrameSection() const {
- return getContext().getMachOSection("__TEXT", "__eh_frame",
- MCSectionMachO::S_COALESCED |
- MCSectionMachO::S_ATTR_NO_TOC |
- MCSectionMachO::S_ATTR_STRIP_STATIC_SYMS |
- MCSectionMachO::S_ATTR_LIVE_SUPPORT,
- SectionKind::getReadOnly());
-}
-
const MCSection *TargetLoweringObjectFileMachO::
getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
Mangler *Mang, const TargetMachine &TM) const {
@@ -905,183 +543,10 @@ getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
return SSym;
}
-unsigned TargetLoweringObjectFileMachO::getPersonalityEncoding() const {
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
-unsigned TargetLoweringObjectFileMachO::getLSDAEncoding() const {
- return DW_EH_PE_pcrel;
-}
-
-unsigned TargetLoweringObjectFileMachO::getFDEEncoding(bool CFI) const {
- return DW_EH_PE_pcrel;
-}
-
-unsigned TargetLoweringObjectFileMachO::getTTypeEncoding() const {
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-}
-
//===----------------------------------------------------------------------===//
// COFF
//===----------------------------------------------------------------------===//
-TargetLoweringObjectFileCOFF::TargetLoweringObjectFileCOFF()
- : TargetLoweringObjectFile(),
- DrectveSection(0),
- PDataSection(0),
- XDataSection(0) {
-}
-
-void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFile::Initialize(Ctx, TM);
- TextSection =
- getContext().getCOFFSection(".text",
- COFF::IMAGE_SCN_CNT_CODE |
- COFF::IMAGE_SCN_MEM_EXECUTE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getText());
- DataSection =
- getContext().getCOFFSection(".data",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
- ReadOnlySection =
- getContext().getCOFFSection(".rdata",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getReadOnly());
- StaticCtorSection =
- getContext().getCOFFSection(".ctors",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
- StaticDtorSection =
- getContext().getCOFFSection(".dtors",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
-
- // FIXME: We're emitting LSDA info into a readonly section on COFF, even
- // though it contains relocatable pointers. In PIC mode, this is probably a
- // big runtime hit for C++ apps. Either the contents of the LSDA need to be
- // adjusted or this should be a data section.
- LSDASection =
- getContext().getCOFFSection(".gcc_except_table",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getReadOnly());
- // Debug info.
- DwarfAbbrevSection =
- getContext().getCOFFSection(".debug_abbrev",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfInfoSection =
- getContext().getCOFFSection(".debug_info",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfLineSection =
- getContext().getCOFFSection(".debug_line",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfFrameSection =
- getContext().getCOFFSection(".debug_frame",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfPubNamesSection =
- getContext().getCOFFSection(".debug_pubnames",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfPubTypesSection =
- getContext().getCOFFSection(".debug_pubtypes",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfStrSection =
- getContext().getCOFFSection(".debug_str",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfLocSection =
- getContext().getCOFFSection(".debug_loc",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfARangesSection =
- getContext().getCOFFSection(".debug_aranges",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfRangesSection =
- getContext().getCOFFSection(".debug_ranges",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
- DwarfMacroInfoSection =
- getContext().getCOFFSection(".debug_macinfo",
- COFF::IMAGE_SCN_MEM_DISCARDABLE |
- COFF::IMAGE_SCN_MEM_READ,
- SectionKind::getMetadata());
-
- DrectveSection =
- getContext().getCOFFSection(".drectve",
- COFF::IMAGE_SCN_LNK_INFO,
- SectionKind::getMetadata());
-
- PDataSection =
- getContext().getCOFFSection(".pdata",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
-
- XDataSection =
- getContext().getCOFFSection(".xdata",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
-}
-
-const MCSection *TargetLoweringObjectFileCOFF::getEHFrameSection() const {
- return getContext().getCOFFSection(".eh_frame",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
-}
-
-const MCSection *TargetLoweringObjectFileCOFF::getWin64EHFuncTableSection(
- StringRef suffix) const {
- if (suffix == "")
- return PDataSection;
- return getContext().getCOFFSection((".pdata"+suffix).str(),
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
-}
-
-const MCSection *TargetLoweringObjectFileCOFF::getWin64EHTableSection(
- StringRef suffix) const {
- if (suffix == "")
- return XDataSection;
- return getContext().getCOFFSection((".xdata"+suffix).str(),
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
-}
-
-
static unsigned
getCOFFSectionFlags(SectionKind K) {
unsigned Flags = 0;
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 6d6244e..d879378 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -177,6 +177,10 @@ char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
MachineInstr *MI, unsigned SavedReg,
MachineBasicBlock::iterator OldPos) {
+ // FIXME: Shouldn't we be trying to do this before we three-addressify the
+ // instruction? After this transformation is done, we no longer need
+ // the instruction to be in three-address form.
+
// Check if it's safe to move this instruction.
bool SeenStore = true; // Be conservative.
if (!MI->isSafeToMove(TII, AA, SeenStore))
@@ -217,7 +221,11 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
break;
}
- if (!KillMI || KillMI->getParent() != MBB || KillMI == MI)
+ // If we find the instruction that kills SavedReg, and it is in an
+ // appropriate location, we can try to sink the current instruction
+ // past it.
+ if (!KillMI || KillMI->getParent() != MBB || KillMI == MI ||
+ KillMI->getDesc().isTerminator())
return false;
// If any of the definitions are used by another instruction between the
@@ -1041,6 +1049,9 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Function: "
<< MF.getFunction()->getName() << '\n');
+ // This pass takes the function out of SSA form.
+ MRI->leaveSSA();
+
// ReMatRegs - Keep track of the registers whose def's are remat'ed.
BitVector ReMatRegs(MRI->getNumVirtRegs());
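Two-address conversion rewrites defs in place (a = a op b), so virtual registers acquire multiple definitions and the function can no longer claim SSA form; the new leaveSSA() call records that explicitly. A hedged sketch of the observable effect (MRI assumed in scope):

    MRI->leaveSSA();
    // Downstream passes can now query the flag instead of guessing:
    assert(!MRI->isSSA() && "two-address conversion leaves SSA form");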
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 7557979..8a1cdc0 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -41,8 +41,8 @@
#include <algorithm>
using namespace llvm;
-STATISTIC(NumSpills , "Number of register spills");
-STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
+STATISTIC(NumSpillSlots, "Number of spill slots allocated");
+STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
//===----------------------------------------------------------------------===//
// VirtRegMap implementation
@@ -111,6 +111,7 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
unsigned Idx = SS-LowSpillSlot;
while (Idx >= SpillSlotToUsesMap.size())
SpillSlotToUsesMap.resize(SpillSlotToUsesMap.size()*2);
+ ++NumSpillSlots;
return SS;
}
@@ -130,7 +131,6 @@ int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
"attempt to assign stack slot to already spilled register");
const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
- ++NumSpills;
return Virt2StackSlotMap[virtReg] = createSpillSlot(RC);
}
@@ -285,14 +285,24 @@ void VirtRegMap::rewrite(SlotIndexes *Indexes) {
// Preserve semantics of sub-register operands.
if (MO.getSubReg()) {
// A virtual register kill refers to the whole register, so we may
- // have to add <imp-use,kill> operands for the super-register.
- if (MO.isUse()) {
- if (MO.isKill() && !MO.isUndef())
- SuperKills.push_back(PhysReg);
- } else if (MO.isDead())
- SuperDeads.push_back(PhysReg);
- else
- SuperDefs.push_back(PhysReg);
+ // have to add <imp-use,kill> operands for the super-register. A
+ // partial redef always kills and redefines the super-register.
+ if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+ SuperKills.push_back(PhysReg);
+
+ if (MO.isDef()) {
+ // The <def,undef> flag only makes sense for sub-register defs, and
+ // we are substituting a full physreg. An <imp-use,kill> operand
+ // from the SuperKills list will represent the partial read of the
+ // super-register.
+ MO.setIsUndef(false);
+
+ // Also add implicit defs for the super-register.
+ if (MO.isDead())
+ SuperDeads.push_back(PhysReg);
+ else
+ SuperDefs.push_back(PhysReg);
+ }
// PhysReg operands cannot have subregister indexes.
PhysReg = TRI->getSubReg(PhysReg, MO.getSubReg());
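For context, this hunk sits inside the operand-rewriting loop of VirtRegMap::rewrite(). A simplified sketch of the substitution around it, ignoring the kill/dead bookkeeping the patch adjusts (VRM, TRI, VirtReg, and MO assumed in scope):

    unsigned PhysReg = VRM.getPhys(VirtReg);       // e.g. %vreg7 -> %Q0
    if (unsigned SubIdx = MO.getSubReg()) {
      // Fold the subregister index into the physical register...
      PhysReg = TRI->getSubReg(PhysReg, SubIdx);   // e.g. %Q0:ssub_1 -> %S1
      MO.setSubReg(0);                             // ...physregs carry none.
    }
    MO.setReg(PhysReg);

The change itself is that a partial redef of the virtual register must now also kill and redefine the super-register, which the added <imp-use,kill> and implicit def operands express.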
diff --git a/lib/CompilerDriver/Action.cpp b/lib/CompilerDriver/Action.cpp
deleted file mode 100644
index a8d625c..0000000
--- a/lib/CompilerDriver/Action.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-//===--- Action.cpp - The LLVM Compiler Driver ------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open
-// Source License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Action class - implementation and auxiliary functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CompilerDriver/Action.h"
-#include "llvm/CompilerDriver/BuiltinOptions.h"
-#include "llvm/CompilerDriver/Error.h"
-#include "llvm/CompilerDriver/Main.h"
-
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/SystemUtils.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/TimeValue.h"
-
-#include <stdexcept>
-#include <string>
-
-using namespace llvm;
-using namespace llvmc;
-
-namespace llvmc {
-
-extern const char* ProgramName;
-
-}
-
-namespace {
-
- void PrintString (const std::string& str) {
- errs() << str << ' ';
- }
-
- void PrintCommand (const std::string& Cmd, const StrVector& Args) {
- errs() << Cmd << ' ';
- std::for_each(Args.begin(), Args.end(), &PrintString);
- errs() << '\n';
- }
-
- bool IsSegmentationFault (int returnCode) {
-#ifdef LLVM_ON_WIN32
- return (returnCode >= 0xc0000000UL);
-#else
- return (returnCode < 0);
-#endif
- }
-
- int ExecuteProgram (const std::string& name, const StrVector& args) {
- sys::Path prog(name);
-
- if (sys::path::is_relative(prog.str())) {
- prog = PrependMainExecutablePath(name, ProgramName,
- (void *)(intptr_t)&Main);
-
- if (!prog.canExecute()) {
- prog = sys::Program::FindProgramByName(name);
- if (prog.isEmpty()) {
- PrintError("Can't find program '" + name + "'");
- return -1;
- }
- }
- }
- if (!prog.canExecute()) {
- PrintError("Program '" + name + "' is not executable.");
- return -1;
- }
-
- // Build the command line vector and the redirects array.
- const sys::Path* redirects[3] = {0,0,0};
- sys::Path stdout_redirect;
-
- std::vector<const char*> argv;
- argv.reserve((args.size()+2));
- argv.push_back(name.c_str());
-
- for (StrVector::const_iterator B = args.begin(), E = args.end();
- B!=E; ++B) {
- if (*B == ">") {
- ++B;
- stdout_redirect.set(*B);
- redirects[1] = &stdout_redirect;
- }
- else {
- argv.push_back((*B).c_str());
- }
- }
- argv.push_back(0); // null terminate list.
-
- // Invoke the program.
- int ret = sys::Program::ExecuteAndWait(prog, &argv[0], 0, &redirects[0]);
-
- if (IsSegmentationFault(ret)) {
- errs() << "Segmentation fault: ";
- PrintCommand(name, args);
- }
-
- return ret;
- }
-}
-
-namespace llvmc {
- void AppendToGlobalTimeLog (const std::string& cmd, double time);
-}
-
-int llvmc::Action::Execute () const {
- if (DryRun || VerboseMode)
- PrintCommand(Command_, Args_);
-
- if (!DryRun) {
- if (Time) {
- sys::TimeValue now = sys::TimeValue::now();
- int ret = ExecuteProgram(Command_, Args_);
- sys::TimeValue now2 = sys::TimeValue::now();
- now2 -= now;
- double elapsed = now2.seconds() + now2.microseconds() / 1000000.0;
- AppendToGlobalTimeLog(Command_, elapsed);
-
- return ret;
- }
- else {
- return ExecuteProgram(Command_, Args_);
- }
- }
-
- return 0;
-}
diff --git a/lib/CompilerDriver/BuiltinOptions.cpp b/lib/CompilerDriver/BuiltinOptions.cpp
deleted file mode 100644
index 3844203..0000000
--- a/lib/CompilerDriver/BuiltinOptions.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-//===--- BuiltinOptions.cpp - The LLVM Compiler Driver ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open
-// Source License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Definitions of all global command-line option variables.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CompilerDriver/BuiltinOptions.h"
-
-#ifdef ENABLE_LLVMC_DYNAMIC_PLUGINS
-#include "llvm/Support/PluginLoader.h"
-#endif
-
-namespace cl = llvm::cl;
-
-namespace llvmc {
-
-cl::list<std::string> InputFilenames(cl::Positional, cl::desc("<input file>"),
- cl::ZeroOrMore);
-cl::opt<std::string> OutputFilename("o", cl::desc("Output file name"),
- cl::value_desc("file"), cl::Prefix);
-cl::opt<std::string> TempDirname("temp-dir", cl::desc("Temp dir name"),
- cl::value_desc("<directory>"), cl::Prefix);
-cl::list<std::string> Languages("x",
- cl::desc("Specify the language of the following input files"),
- cl::ZeroOrMore);
-
-cl::opt<bool> DryRun("dry-run",
- cl::desc("Only pretend to run commands"));
-cl::opt<bool> Time("time", cl::desc("Time individual commands"));
-cl::opt<bool> VerboseMode("v",
- cl::desc("Enable verbose mode"));
-
-cl::opt<bool> CheckGraph("check-graph",
- cl::desc("Check the compilation graph for errors"),
- cl::Hidden);
-cl::opt<bool> WriteGraph("write-graph",
- cl::desc("Write compilation-graph.dot file"),
- cl::Hidden);
-cl::opt<bool> ViewGraph("view-graph",
- cl::desc("Show compilation graph in GhostView"),
- cl::Hidden);
-
-cl::opt<SaveTempsEnum::Values> SaveTemps
-("save-temps", cl::desc("Keep temporary files"),
- cl::init(SaveTempsEnum::Unset),
- cl::values(clEnumValN(SaveTempsEnum::Obj, "obj",
- "Save files in the directory specified with -o"),
- clEnumValN(SaveTempsEnum::Cwd, "cwd",
- "Use current working directory"),
- clEnumValN(SaveTempsEnum::Obj, "", "Same as 'cwd'"),
- clEnumValEnd),
- cl::ValueOptional);
-
-} // End namespace llvmc.
diff --git a/lib/CompilerDriver/CMakeLists.txt b/lib/CompilerDriver/CMakeLists.txt
deleted file mode 100644
index a12b337..0000000
--- a/lib/CompilerDriver/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-set(LLVM_LINK_COMPONENTS support)
-
-# We don't want this library to appear in `llvm-config --libs` output,
-# so its name doesn't start with "LLVM".
-
-add_llvm_library(CompilerDriver
- Action.cpp
- BuiltinOptions.cpp
- CompilationGraph.cpp
- Main.cpp
- Tool.cpp
- )
diff --git a/lib/CompilerDriver/CompilationGraph.cpp b/lib/CompilerDriver/CompilationGraph.cpp
deleted file mode 100644
index 33c6566..0000000
--- a/lib/CompilerDriver/CompilationGraph.cpp
+++ /dev/null
@@ -1,655 +0,0 @@
-//===--- CompilationGraph.cpp - The LLVM Compiler Driver --------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open
-// Source License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Compilation graph - implementation.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CompilerDriver/BuiltinOptions.h"
-#include "llvm/CompilerDriver/CompilationGraph.h"
-#include "llvm/CompilerDriver/Error.h"
-
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/DOTGraphTraits.h"
-#include "llvm/Support/GraphWriter.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <algorithm>
-#include <cstring>
-#include <iterator>
-#include <limits>
-#include <queue>
-
-using namespace llvm;
-using namespace llvmc;
-
-namespace llvmc {
-
- const std::string* LanguageMap::GetLanguage(const sys::Path& File) const {
- // Remove the '.'.
- StringRef suf = sys::path::extension(File.str()).substr(1);
- LanguageMap::const_iterator Lang =
- this->find(suf.empty() ? "*empty*" : suf);
- if (Lang == this->end()) {
- PrintError("File '" + File.str() + "' has unknown suffix '"
- + suf.str() + '\'');
- return 0;
- }
- return &Lang->second;
- }
-}
-
-namespace {
-
- /// ChooseEdge - Return the edge with the maximum weight. Returns 0 on error.
- template <class C>
- const Edge* ChooseEdge(const C& EdgesContainer,
- const InputLanguagesSet& InLangs,
- const std::string& NodeName = "root") {
- const Edge* MaxEdge = 0;
- int MaxWeight = 0;
- bool SingleMax = true;
-
- // TODO: fix calculation of SingleMax.
- for (typename C::const_iterator B = EdgesContainer.begin(),
- E = EdgesContainer.end(); B != E; ++B) {
- const Edge* e = B->getPtr();
- int EW = e->Weight(InLangs);
- if (EW < 0) {
- // (error) invocation in TableGen -> we don't need to print an error
- // message.
- return 0;
- }
- if (EW > MaxWeight) {
- MaxEdge = e;
- MaxWeight = EW;
- SingleMax = true;
- } else if (EW == MaxWeight) {
- SingleMax = false;
- }
- }
-
- if (!SingleMax) {
- PrintError("Node " + NodeName + ": multiple maximal outward edges found!"
- " Most probably a specification error.");
- return 0;
- }
- if (!MaxEdge) {
- PrintError("Node " + NodeName + ": no maximal outward edge found!"
- " Most probably a specification error.");
- return 0;
- }
- return MaxEdge;
- }
-
-}
-
-void Node::AddEdge(Edge* Edg) {
- // If there already was an edge between two nodes, modify it instead
- // of adding a new edge.
- const std::string& ToolName = Edg->ToolName();
- for (container_type::iterator B = OutEdges.begin(), E = OutEdges.end();
- B != E; ++B) {
- if ((*B)->ToolName() == ToolName) {
- llvm::IntrusiveRefCntPtr<Edge>(Edg).swap(*B);
- return;
- }
- }
- OutEdges.push_back(llvm::IntrusiveRefCntPtr<Edge>(Edg));
-}
-
-CompilationGraph::CompilationGraph() {
- NodesMap["root"] = Node(this);
-}
-
-Node* CompilationGraph::getNode(const std::string& ToolName) {
- nodes_map_type::iterator I = NodesMap.find(ToolName);
- if (I == NodesMap.end()) {
- PrintError("Node " + ToolName + " is not in the graph");
- return 0;
- }
- return &I->second;
-}
-
-const Node* CompilationGraph::getNode(const std::string& ToolName) const {
- nodes_map_type::const_iterator I = NodesMap.find(ToolName);
- if (I == NodesMap.end()) {
- PrintError("Node " + ToolName + " is not in the graph!");
- return 0;
- }
- return &I->second;
-}
-
-// Find the tools list corresponding to the given language name.
-const CompilationGraph::tools_vector_type*
-CompilationGraph::getToolsVector(const std::string& LangName) const
-{
- tools_map_type::const_iterator I = ToolsMap.find(LangName);
- if (I == ToolsMap.end()) {
- PrintError("No tool corresponding to the language " + LangName + " found");
- return 0;
- }
- return &I->second;
-}
-
-void CompilationGraph::insertNode(Tool* V) {
- if (NodesMap.count(V->Name()) == 0)
- NodesMap[V->Name()] = Node(this, V);
-}
-
-int CompilationGraph::insertEdge(const std::string& A, Edge* Edg) {
- Node* B = getNode(Edg->ToolName());
- if (B == 0)
- return 1;
-
- if (A == "root") {
- const char** InLangs = B->ToolPtr->InputLanguages();
- for (;*InLangs; ++InLangs)
- ToolsMap[*InLangs].push_back(IntrusiveRefCntPtr<Edge>(Edg));
- NodesMap["root"].AddEdge(Edg);
- }
- else {
- Node* N = getNode(A);
- if (N == 0)
- return 1;
-
- N->AddEdge(Edg);
- }
- // Increase the inward edge counter.
- B->IncrInEdges();
-
- return 0;
-}
-
-// Pass input file through the chain until we bump into a Join node or
-// a node that says that it is the last.
-int CompilationGraph::PassThroughGraph (const sys::Path& InFile,
- const Node* StartNode,
- const InputLanguagesSet& InLangs,
- const sys::Path& TempDir,
- const LanguageMap& LangMap) const {
- sys::Path In = InFile;
- const Node* CurNode = StartNode;
-
- while(true) {
- Tool* CurTool = CurNode->ToolPtr.getPtr();
-
- if (CurTool->IsJoin()) {
- JoinTool& JT = static_cast<JoinTool&>(*CurTool);
- JT.AddToJoinList(In);
- break;
- }
-
- Action CurAction;
- if (int ret = CurTool->GenerateAction(CurAction, In, CurNode->HasChildren(),
- TempDir, InLangs, LangMap)) {
- return ret;
- }
-
- if (int ret = CurAction.Execute())
- return ret;
-
- if (CurAction.StopCompilation())
- return 0;
-
- const Edge* Edg = ChooseEdge(CurNode->OutEdges, InLangs, CurNode->Name());
- if (Edg == 0)
- return 1;
-
- CurNode = getNode(Edg->ToolName());
- if (CurNode == 0)
- return 1;
-
- In = CurAction.OutFile();
- }
-
- return 0;
-}
-
-// Find the head of the toolchain corresponding to the given file.
-// Also, insert an input language into InLangs.
-const Node* CompilationGraph::
-FindToolChain(const sys::Path& In, const std::string* ForceLanguage,
- InputLanguagesSet& InLangs, const LanguageMap& LangMap) const {
-
- // Determine the input language.
- const std::string* InLang = (ForceLanguage ? ForceLanguage
- : LangMap.GetLanguage(In));
- if (InLang == 0)
- return 0;
- const std::string& InLanguage = *InLang;
-
- // Add the current input language to the input language set.
- InLangs.insert(InLanguage);
-
- // Find the toolchain for the input language.
- const tools_vector_type* pTV = getToolsVector(InLanguage);
- if (pTV == 0)
- return 0;
-
- const tools_vector_type& TV = *pTV;
- if (TV.empty()) {
- PrintError("No toolchain corresponding to language "
- + InLanguage + " found");
- return 0;
- }
-
- const Edge* Edg = ChooseEdge(TV, InLangs);
- if (Edg == 0)
- return 0;
-
- return getNode(Edg->ToolName());
-}
-
-// Helper function used by Build().
-// Traverses initial portions of the toolchains (up to the first Join node).
-// This function is also responsible for handling the -x option.
-int CompilationGraph::BuildInitial (InputLanguagesSet& InLangs,
- const sys::Path& TempDir,
- const LanguageMap& LangMap) {
- // This is related to -x option handling.
- cl::list<std::string>::const_iterator xIter = Languages.begin(),
- xBegin = xIter, xEnd = Languages.end();
- bool xEmpty = true;
- const std::string* xLanguage = 0;
- unsigned xPos = 0, xPosNext = 0, filePos = 0;
-
- if (xIter != xEnd) {
- xEmpty = false;
- xPos = Languages.getPosition(xIter - xBegin);
- cl::list<std::string>::const_iterator xNext = llvm::next(xIter);
- xPosNext = (xNext == xEnd) ? std::numeric_limits<unsigned>::max()
- : Languages.getPosition(xNext - xBegin);
- xLanguage = (*xIter == "none") ? 0 : &(*xIter);
- }
-
- // For each input file:
- for (cl::list<std::string>::const_iterator B = InputFilenames.begin(),
- CB = B, E = InputFilenames.end(); B != E; ++B) {
- sys::Path In = sys::Path(*B);
-
- // Code for handling the -x option.
- // Output: std::string* xLanguage (can be NULL).
- if (!xEmpty) {
- filePos = InputFilenames.getPosition(B - CB);
-
- if (xPos < filePos) {
- if (filePos < xPosNext) {
- xLanguage = (*xIter == "none") ? 0 : &(*xIter);
- }
- else { // filePos >= xPosNext
- // Skip xIters while filePos > xPosNext
- while (filePos > xPosNext) {
- ++xIter;
- xPos = xPosNext;
-
- cl::list<std::string>::const_iterator xNext = llvm::next(xIter);
- if (xNext == xEnd)
- xPosNext = std::numeric_limits<unsigned>::max();
- else
- xPosNext = Languages.getPosition(xNext - xBegin);
- xLanguage = (*xIter == "none") ? 0 : &(*xIter);
- }
- }
- }
- }
-
- // Find the toolchain corresponding to this file.
- const Node* N = FindToolChain(In, xLanguage, InLangs, LangMap);
- if (N == 0)
- return 1;
- // Pass file through the chain starting at head.
- if (int ret = PassThroughGraph(In, N, InLangs, TempDir, LangMap))
- return ret;
- }
-
- return 0;
-}
-
-// Sort the nodes in topological order.
-int CompilationGraph::TopologicalSort(std::vector<const Node*>& Out) {
- std::queue<const Node*> Q;
-
- Node* Root = getNode("root");
- if (Root == 0)
- return 1;
-
- Q.push(Root);
-
- while (!Q.empty()) {
- const Node* A = Q.front();
- Q.pop();
- Out.push_back(A);
- for (Node::const_iterator EB = A->EdgesBegin(), EE = A->EdgesEnd();
- EB != EE; ++EB) {
- Node* B = getNode((*EB)->ToolName());
- if (B == 0)
- return 1;
-
- B->DecrInEdges();
- if (B->HasNoInEdges())
- Q.push(B);
- }
- }
-
- return 0;
-}
-
-namespace {
- bool NotJoinNode(const Node* N) {
- return N->ToolPtr ? !N->ToolPtr->IsJoin() : true;
- }
-}
-
-// Call TopologicalSort and filter the resulting list to include
-// only Join nodes.
-int CompilationGraph::
-TopologicalSortFilterJoinNodes(std::vector<const Node*>& Out) {
- std::vector<const Node*> TopSorted;
- if (int ret = TopologicalSort(TopSorted))
- return ret;
- std::remove_copy_if(TopSorted.begin(), TopSorted.end(),
- std::back_inserter(Out), NotJoinNode);
-
- return 0;
-}
-
-int CompilationGraph::Build (const sys::Path& TempDir,
- const LanguageMap& LangMap) {
- InputLanguagesSet InLangs;
- bool WasSomeActionGenerated = !InputFilenames.empty();
-
- // Traverse initial parts of the toolchains and fill in InLangs.
- if (int ret = BuildInitial(InLangs, TempDir, LangMap))
- return ret;
-
- std::vector<const Node*> JTV;
- if (int ret = TopologicalSortFilterJoinNodes(JTV))
- return ret;
-
- // For all join nodes in topological order:
- for (std::vector<const Node*>::iterator B = JTV.begin(), E = JTV.end();
- B != E; ++B) {
-
- const Node* CurNode = *B;
- JoinTool* JT = &static_cast<JoinTool&>(*CurNode->ToolPtr.getPtr());
-
- // Are there any files in the join list?
- if (JT->JoinListEmpty() && !(JT->WorksOnEmpty() && InputFilenames.empty()))
- continue;
-
- WasSomeActionGenerated = true;
- Action CurAction;
- if (int ret = JT->GenerateAction(CurAction, CurNode->HasChildren(),
- TempDir, InLangs, LangMap)) {
- return ret;
- }
-
- if (int ret = CurAction.Execute())
- return ret;
-
- if (CurAction.StopCompilation())
- return 0;
-
- const Edge* Edg = ChooseEdge(CurNode->OutEdges, InLangs, CurNode->Name());
- if (Edg == 0)
- return 1;
-
- const Node* NextNode = getNode(Edg->ToolName());
- if (NextNode == 0)
- return 1;
-
- if (int ret = PassThroughGraph(sys::Path(CurAction.OutFile()), NextNode,
- InLangs, TempDir, LangMap)) {
- return ret;
- }
- }
-
- if (!WasSomeActionGenerated) {
- PrintError("no input files");
- return 1;
- }
-
- return 0;
-}
-
-int CompilationGraph::CheckLanguageNames() const {
- int ret = 0;
-
- // Check that names for output and input languages on all edges do match.
- for (const_nodes_iterator B = this->NodesMap.begin(),
- E = this->NodesMap.end(); B != E; ++B) {
-
- const Node & N1 = B->second;
- if (N1.ToolPtr) {
- for (Node::const_iterator EB = N1.EdgesBegin(), EE = N1.EdgesEnd();
- EB != EE; ++EB) {
- const Node* N2 = this->getNode((*EB)->ToolName());
- if (N2 == 0)
- return 1;
-
- if (!N2->ToolPtr) {
- ++ret;
- errs() << "Error: there is an edge from '" << N1.ToolPtr->Name()
- << "' back to the root!\n\n";
- continue;
- }
-
- const char** OutLangs = N1.ToolPtr->OutputLanguages();
- const char** InLangs = N2->ToolPtr->InputLanguages();
- bool eq = false;
- const char* OutLang = 0;
- for (;*OutLangs; ++OutLangs) {
- OutLang = *OutLangs;
- for (;*InLangs; ++InLangs) {
- if (std::strcmp(OutLang, *InLangs) == 0) {
- eq = true;
- break;
- }
- }
- }
-
- if (!eq) {
- ++ret;
- errs() << "Error: Output->input language mismatch in the edge '"
- << N1.ToolPtr->Name() << "' -> '" << N2->ToolPtr->Name()
- << "'!\n"
- << "Expected one of { ";
-
- InLangs = N2->ToolPtr->InputLanguages();
- for (;*InLangs; ++InLangs) {
- errs() << '\'' << *InLangs << (*(InLangs+1) ? "', " : "'");
- }
-
- errs() << " }, but got '" << OutLang << "'!\n\n";
- }
-
- }
- }
- }
-
- return ret;
-}
-
-int CompilationGraph::CheckMultipleDefaultEdges() const {
- int ret = 0;
- InputLanguagesSet Dummy;
-
- // For all nodes, just iterate over the outgoing edges and check if there is
- // more than one edge with maximum weight.
- for (const_nodes_iterator B = this->NodesMap.begin(),
- E = this->NodesMap.end(); B != E; ++B) {
- const Node& N = B->second;
- int MaxWeight = -1024;
-
- // Ignore the root node.
- if (!N.ToolPtr)
- continue;
-
- for (Node::const_iterator EB = N.EdgesBegin(), EE = N.EdgesEnd();
- EB != EE; ++EB) {
- int EdgeWeight = (*EB)->Weight(Dummy);
- if (EdgeWeight > MaxWeight) {
- MaxWeight = EdgeWeight;
- }
- else if (EdgeWeight == MaxWeight) {
- ++ret;
- errs() << "Error: there are multiple maximal edges stemming from the '"
- << N.ToolPtr->Name() << "' node!\n\n";
- break;
- }
- }
- }
-
- return ret;
-}
-
-int CompilationGraph::CheckCycles() {
- unsigned deleted = 0;
- std::queue<Node*> Q;
-
- Node* Root = getNode("root");
- if (Root == 0)
- return 1;
-
- Q.push(Root);
-
- // Try to delete all nodes that have no ingoing edges, starting from the
- // root. If there are any nodes left after this operation, then we have a
- // cycle. This relies on '--check-graph' not performing the topological sort.
- while (!Q.empty()) {
- Node* A = Q.front();
- Q.pop();
- ++deleted;
-
- for (Node::iterator EB = A->EdgesBegin(), EE = A->EdgesEnd();
- EB != EE; ++EB) {
- Node* B = getNode((*EB)->ToolName());
- if (B == 0)
- return 1;
-
- B->DecrInEdges();
- if (B->HasNoInEdges())
- Q.push(B);
- }
- }
-
- if (deleted != NodesMap.size()) {
- errs() << "Error: there are cycles in the compilation graph!\n"
- << "Try inspecting the diagram produced by "
- << "'llvmc --view-graph'.\n\n";
- return 1;
- }
-
- return 0;
-}
-
-int CompilationGraph::Check () {
- // We try to catch as many errors as we can in one go.
- int errs = 0;
- int ret = 0;
-
- // Check that output/input language names match.
- ret = this->CheckLanguageNames();
- if (ret < 0)
- return 1;
- errs += ret;
-
- // Check for multiple default edges.
- ret = this->CheckMultipleDefaultEdges();
- if (ret < 0)
- return 1;
- errs += ret;
-
- // Check for cycles.
- ret = this->CheckCycles();
- if (ret < 0)
- return 1;
- errs += ret;
-
- return errs;
-}
-
-// Code related to graph visualization.
-
-namespace {
-
-std::string SquashStrArray (const char** StrArr) {
- std::string ret;
-
- for (; *StrArr; ++StrArr) {
- if (*(StrArr + 1)) {
- ret += *StrArr;
- ret += ", ";
- }
- else {
- ret += *StrArr;
- }
- }
-
- return ret;
-}
-
-} // End anonymous namespace.
-
-namespace llvm {
- template <>
- struct DOTGraphTraits<llvmc::CompilationGraph*>
- : public DefaultDOTGraphTraits
- {
- DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
-
- template<typename GraphType>
- static std::string getNodeLabel(const Node* N, const GraphType&)
- {
- if (N->ToolPtr)
- if (N->ToolPtr->IsJoin())
- return N->Name() + "\n (join" +
- (N->HasChildren() ? ")"
- : std::string(": ") +
- SquashStrArray(N->ToolPtr->OutputLanguages()) + ')');
- else
- return N->Name();
- else
- return "root";
- }
-
- template<typename EdgeIter>
- static std::string getEdgeSourceLabel(const Node* N, EdgeIter I) {
- if (N->ToolPtr) {
- return SquashStrArray(N->ToolPtr->OutputLanguages());
- }
- else {
- return SquashStrArray(I->ToolPtr->InputLanguages());
- }
- }
- };
-
-} // End namespace llvm
-
-int CompilationGraph::writeGraph(const std::string& OutputFilename) {
- std::string ErrorInfo;
- raw_fd_ostream O(OutputFilename.c_str(), ErrorInfo);
-
- if (ErrorInfo.empty()) {
- errs() << "Writing '"<< OutputFilename << "' file...";
- llvm::WriteGraph(O, this);
- errs() << "done.\n";
- }
- else {
- PrintError("Error opening file '" + OutputFilename + "' for writing!");
- return 1;
- }
-
- return 0;
-}
-
-void CompilationGraph::viewGraph() {
- llvm::ViewGraph(this, "compilation-graph");
-}
diff --git a/lib/CompilerDriver/Main.cpp b/lib/CompilerDriver/Main.cpp
deleted file mode 100644
index 7120027..0000000
--- a/lib/CompilerDriver/Main.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-//===--- Main.cpp - The LLVM Compiler Driver --------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open
-// Source License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// llvmc::Main function - driver entry point.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CompilerDriver/AutoGenerated.h"
-#include "llvm/CompilerDriver/BuiltinOptions.h"
-#include "llvm/CompilerDriver/CompilationGraph.h"
-#include "llvm/CompilerDriver/Error.h"
-
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/Path.h"
-
-#include <sstream>
-#include <string>
-
-namespace cl = llvm::cl;
-namespace sys = llvm::sys;
-using namespace llvmc;
-
-namespace {
-
- std::stringstream* GlobalTimeLog;
-
- /// GetTempDir - Get the temporary directory location. Returns non-zero value
- /// on error.
- int GetTempDir(sys::Path& tempDir) {
- // The --temp-dir option.
- if (!TempDirname.empty()) {
- tempDir = TempDirname;
- }
- // GCC 4.5-style -save-temps handling.
- else if (SaveTemps == SaveTempsEnum::Unset) {
- tempDir = sys::Path::GetTemporaryDirectory();
- return 0;
- }
- else if (SaveTemps == SaveTempsEnum::Obj && !OutputFilename.empty()) {
- tempDir = sys::path::parent_path(OutputFilename);
- }
- else {
- // SaveTemps == Cwd --> use current dir (leave tempDir empty).
- return 0;
- }
-
- bool Exists;
- if (llvm::sys::fs::exists(tempDir.str(), Exists) || !Exists) {
- std::string ErrMsg;
- if (tempDir.createDirectoryOnDisk(true, &ErrMsg)) {
- PrintError(ErrMsg);
- return 1;
- }
- }
-
- return 0;
- }
-
- /// BuildTargets - A small wrapper for CompilationGraph::Build. Returns
- /// non-zero value in case of error.
- int BuildTargets(CompilationGraph& graph, const LanguageMap& langMap) {
- int ret;
- sys::Path tempDir;
- bool toDelete = (SaveTemps == SaveTempsEnum::Unset);
-
- if (int ret = GetTempDir(tempDir))
- return ret;
-
- ret = graph.Build(tempDir, langMap);
-
- if (toDelete)
- tempDir.eraseFromDisk(true);
-
- return ret;
- }
-}
-
-namespace llvmc {
-
-// Used to implement -time option. External linkage is intentional.
-void AppendToGlobalTimeLog(const std::string& cmd, double time) {
- *GlobalTimeLog << "# " << cmd << ' ' << time << '\n';
-}
-
-// Sometimes user code wants to access the argv[0] value.
-const char* ProgramName;
-
-int Main(int argc, char** argv) {
- int ret = 0;
- LanguageMap langMap;
- CompilationGraph graph;
-
- ProgramName = argv[0];
-
- cl::ParseCommandLineOptions
- (argc, argv,
- /* Overview = */ "LLVM Compiler Driver (Work In Progress)",
- /* ReadResponseFiles = */ false);
-
- if (int ret = autogenerated::RunInitialization(langMap, graph))
- return ret;
-
- if (CheckGraph) {
- ret = graph.Check();
- if (!ret)
- llvm::errs() << "check-graph: no errors found.\n";
-
- return ret;
- }
-
- if (ViewGraph) {
- graph.viewGraph();
- if (!WriteGraph)
- return 0;
- }
-
- if (WriteGraph) {
- const std::string& Out = (OutputFilename.empty()
- ? std::string("compilation-graph.dot")
- : OutputFilename);
- return graph.writeGraph(Out);
- }
-
- if (Time) {
- GlobalTimeLog = new std::stringstream;
- GlobalTimeLog->precision(2);
- }
-
- ret = BuildTargets(graph, langMap);
-
- if (Time) {
- llvm::errs() << GlobalTimeLog->str();
- delete GlobalTimeLog;
- }
-
- return ret;
-}
-
-} // end namespace llvmc
diff --git a/lib/CompilerDriver/Makefile b/lib/CompilerDriver/Makefile
deleted file mode 100644
index 10cfa4f..0000000
--- a/lib/CompilerDriver/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-##===- lib/CompilerDriver/Makefile -------------------------*- Makefile -*-===##
-#
-# The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open
-# Source License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../..
-
-# We don't want this library to appear in `llvm-config --libs` output, so its
-# name doesn't start with "LLVM" and NO_LLVM_CONFIG is set.
-
-LIBRARYNAME = CompilerDriver
-LINK_COMPONENTS = support
-NO_LLVM_CONFIG = 1
-
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/CompilerDriver/Tool.cpp b/lib/CompilerDriver/Tool.cpp
deleted file mode 100644
index 876759a..0000000
--- a/lib/CompilerDriver/Tool.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-//===--- Tool.cpp - The LLVM Compiler Driver --------------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open
-// Source License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Tool base class - implementation details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CompilerDriver/BuiltinOptions.h"
-#include "llvm/CompilerDriver/Tool.h"
-
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Path.h"
-
-#include <algorithm>
-
-using namespace llvm;
-using namespace llvmc;
-
-namespace {
- sys::Path MakeTempFile(const sys::Path& TempDir, const std::string& BaseName,
- const std::string& Suffix) {
- sys::Path Out;
-
- // Make sure we don't end up with path names like '/file.o' if the
- // TempDir is empty.
- if (TempDir.empty()) {
- Out.set(BaseName);
- }
- else {
- Out = TempDir;
- Out.appendComponent(BaseName);
- }
- Out.appendSuffix(Suffix);
- // NOTE: makeUnique always *creates* a unique temporary file,
- // which is good, since there will be no races. However, some
- // tools do not like it when the output file already exists, so
- // they need to be placated with -f or something like that.
- Out.makeUnique(true, NULL);
- return Out;
- }
-}
-
-sys::Path Tool::OutFilename(const sys::Path& In,
- const sys::Path& TempDir,
- bool StopCompilation,
- const char* OutputSuffix) const {
- sys::Path Out;
-
- if (StopCompilation) {
- if (!OutputFilename.empty()) {
- Out.set(OutputFilename);
- }
- else if (IsJoin()) {
- Out.set("a");
- Out.appendSuffix(OutputSuffix);
- }
- else {
- Out.set(sys::path::stem(In.str()));
- Out.appendSuffix(OutputSuffix);
- }
- }
- else {
- if (IsJoin())
- Out = MakeTempFile(TempDir, "tmp", OutputSuffix);
- else
- Out = MakeTempFile(TempDir, sys::path::stem(In.str()), OutputSuffix);
- }
- return Out;
-}
-
-namespace {
- template <class A, class B>
- bool CompareFirst (std::pair<A,B> p1, std::pair<A,B> p2) {
- return std::less<A>()(p1.first, p2.first);
- }
-}
-
-StrVector Tool::SortArgs(ArgsVector& Args) const {
- StrVector Out;
-
- // HACK: this won't be needed when we'll migrate away from CommandLine.
- std::stable_sort(Args.begin(), Args.end(),
- &CompareFirst<unsigned, std::string>);
- for (ArgsVector::iterator B = Args.begin(), E = Args.end(); B != E; ++B) {
- Out.push_back(B->second);
- }
-
- return Out;
-}
diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
new file mode 100644
index 0000000..fdffcb6
--- /dev/null
+++ b/lib/DebugInfo/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_llvm_library(LLVMDebugInfo
+ DIContext.cpp
+ DWARFAbbreviationDeclaration.cpp
+ DWARFCompileUnit.cpp
+ DWARFContext.cpp
+ DWARFDebugAbbrev.cpp
+ DWARFDebugArangeSet.cpp
+ DWARFDebugAranges.cpp
+ DWARFDebugInfoEntry.cpp
+ DWARFDebugLine.cpp
+ DWARFFormValue.cpp
+ )
+
+add_llvm_library_dependencies(LLVMDebugInfo
+ LLVMSupport
+ )
diff --git a/lib/DebugInfo/DIContext.cpp b/lib/DebugInfo/DIContext.cpp
new file mode 100644
index 0000000..e2fd55f
--- /dev/null
+++ b/lib/DebugInfo/DIContext.cpp
@@ -0,0 +1,24 @@
+//===-- DIContext.cpp -----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/DIContext.h"
+#include "DWARFContext.h"
+using namespace llvm;
+
+DIContext::~DIContext() {}
+
+DIContext *DIContext::getDWARFContext(bool isLittleEndian,
+ StringRef infoSection,
+ StringRef abbrevSection,
+ StringRef aRangeSection,
+ StringRef lineSection,
+ StringRef stringSection) {
+ return new DWARFContextInMemory(isLittleEndian, infoSection, abbrevSection,
+ aRangeSection, lineSection, stringSection);
+}
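
This factory is the only public entry point into the new DWARF parser. A minimal sketch of a caller, assuming the five section contents have already been extracted from an object file into StringRefs; the symbolize() wrapper and its parameters are illustrative, not part of the API:

    #include "llvm/DebugInfo/DIContext.h"
    using namespace llvm;

    // Illustrative wrapper: map an address to file:line using raw sections.
    DILineInfo symbolize(uint64_t Address, bool IsLittleEndian,
                         StringRef Info, StringRef Abbrev, StringRef ARanges,
                         StringRef Line, StringRef Str) {
      // DWARFContextInMemory keeps the StringRefs without copying, so the
      // backing buffers must outlive the context.
      DIContext *Ctx = DIContext::getDWARFContext(IsLittleEndian, Info, Abbrev,
                                                  ARanges, Line, Str);
      DILineInfo Result = Ctx->getLineInfoForAddress(Address);
      delete Ctx;
      return Result;
    }
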
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
new file mode 100644
index 0000000..0df692c
--- /dev/null
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
@@ -0,0 +1,83 @@
+//===-- DWARFAbbreviationDeclaration.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFAbbreviationDeclaration.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr){
+ return extract(data, offset_ptr, data.getULEB128(offset_ptr));
+}
+
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr,
+ uint32_t code) {
+ Code = code;
+ Attributes.clear();
+ if (Code) {
+ Tag = data.getULEB128(offset_ptr);
+ HasChildren = data.getU8(offset_ptr);
+
+ while (data.isValidOffset(*offset_ptr)) {
+ uint16_t attr = data.getULEB128(offset_ptr);
+ uint16_t form = data.getULEB128(offset_ptr);
+
+ if (attr && form)
+ Attributes.push_back(DWARFAttribute(attr, form));
+ else
+ break;
+ }
+
+ return Tag != 0;
+ } else {
+ Tag = 0;
+ HasChildren = false;
+ }
+
+ return false;
+}
+
+void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
+ const char *tagString = TagString(getTag());
+ OS << '[' << getCode() << "] ";
+ if (tagString)
+ OS << tagString;
+ else
+ OS << format("DW_TAG_Unknown_%x", getTag());
+ OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
+ for (unsigned i = 0, e = Attributes.size(); i != e; ++i) {
+ OS << '\t';
+ const char *attrString = AttributeString(Attributes[i].getAttribute());
+ if (attrString)
+ OS << attrString;
+ else
+ OS << format("DW_AT_Unknown_%x", Attributes[i].getAttribute());
+ OS << '\t';
+ const char *formString = FormEncodingString(Attributes[i].getForm());
+ if (formString)
+ OS << formString;
+ else
+ OS << format("DW_FORM_Unknown_%x", Attributes[i].getForm());
+ OS << '\n';
+ }
+ OS << '\n';
+}
+
+uint32_t
+DWARFAbbreviationDeclaration::findAttributeIndex(uint16_t attr) const {
+ for (uint32_t i = 0, e = Attributes.size(); i != e; ++i) {
+ if (Attributes[i].getAttribute() == attr)
+ return i;
+ }
+ return -1U;
+}
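
The extractor walks a ULEB128 stream: abbreviation code, tag, has-children byte, then (attribute, form) pairs terminated by a (0, 0) pair. A sketch that decodes a hand-assembled record (every value below fits in one ULEB128 byte, so the bytes can be written literally; the buffer is made up for illustration):

    #include "DWARFAbbreviationDeclaration.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void decodeOneAbbrev() {
      // Abbrev code 1: DW_TAG_compile_unit (0x11), DW_CHILDREN_yes (0x01),
      // one attribute: DW_AT_name (0x03) / DW_FORM_string (0x08), then (0,0).
      static const char Bytes[] = { 0x01, 0x11, 0x01, 0x03, 0x08, 0x00, 0x00 };
      DataExtractor Data(StringRef(Bytes, sizeof(Bytes)),
                         /*isLittleEndian=*/true, /*AddrSize=*/4);
      DWARFAbbreviationDeclaration Decl;
      uint32_t Offset = 0;
      if (Decl.extract(Data, &Offset))
        Decl.dump(outs()); // [1] DW_TAG_compile_unit DW_CHILDREN_yes ...
    }
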
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.h b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
new file mode 100644
index 0000000..2463a3c
--- /dev/null
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
@@ -0,0 +1,54 @@
+//===-- DWARFAbbreviationDeclaration.h --------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+#define LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+
+#include "DWARFAttribute.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFAbbreviationDeclaration {
+ uint32_t Code;
+ uint32_t Tag;
+ bool HasChildren;
+ SmallVector<DWARFAttribute, 8> Attributes;
+public:
+ enum { InvalidCode = 0 };
+ DWARFAbbreviationDeclaration()
+ : Code(InvalidCode), Tag(0), HasChildren(0) {}
+
+ uint32_t getCode() const { return Code; }
+ uint32_t getTag() const { return Tag; }
+ bool hasChildren() const { return HasChildren; }
+ uint32_t getNumAttributes() const { return Attributes.size(); }
+ uint16_t getAttrByIndex(uint32_t idx) const {
+ return Attributes.size() > idx ? Attributes[idx].getAttribute() : 0;
+ }
+ uint16_t getFormByIndex(uint32_t idx) const {
+ return Attributes.size() > idx ? Attributes[idx].getForm() : 0;
+ }
+
+ uint32_t findAttributeIndex(uint16_t attr) const;
+ bool extract(DataExtractor data, uint32_t* offset_ptr);
+ bool extract(DataExtractor data, uint32_t* offset_ptr, uint32_t code);
+ bool isValid() const { return Code != 0 && Tag != 0; }
+ void dump(raw_ostream &OS) const;
+ const SmallVectorImpl<DWARFAttribute> &getAttributes() const {
+ return Attributes;
+ }
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFAttribute.h b/lib/DebugInfo/DWARFAttribute.h
new file mode 100644
index 0000000..6f49b63
--- /dev/null
+++ b/lib/DebugInfo/DWARFAttribute.h
@@ -0,0 +1,30 @@
+//===-- DWARFAttribute.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFATTRIBUTE_H
+#define LLVM_DEBUGINFO_DWARFATTRIBUTE_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class DWARFAttribute {
+ uint16_t Attribute;
+ uint16_t Form;
+ public:
+ DWARFAttribute(uint16_t attr, uint16_t form)
+ : Attribute(attr), Form(form) {}
+
+ uint16_t getAttribute() const { return Attribute; }
+ uint16_t getForm() const { return Form; }
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
new file mode 100644
index 0000000..24bf97f
--- /dev/null
+++ b/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -0,0 +1,238 @@
+//===-- DWARFCompileUnit.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+DataExtractor DWARFCompileUnit::getDebugInfoExtractor() const {
+ return DataExtractor(Context.getInfoSection(),
+ Context.isLittleEndian(), getAddressByteSize());
+}
+
+bool DWARFCompileUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
+ clear();
+
+ Offset = *offset_ptr;
+
+ if (debug_info.isValidOffset(*offset_ptr)) {
+ uint64_t abbrOffset;
+ const DWARFDebugAbbrev *abbr = Context.getDebugAbbrev();
+ Length = debug_info.getU32(offset_ptr);
+ Version = debug_info.getU16(offset_ptr);
+ abbrOffset = debug_info.getU32(offset_ptr);
+ AddrSize = debug_info.getU8(offset_ptr);
+
+ bool lengthOK = debug_info.isValidOffset(getNextCompileUnitOffset()-1);
+ bool versionOK = DWARFContext::isSupportedVersion(Version);
+ bool abbrOffsetOK = Context.getAbbrevSection().size() > abbrOffset;
+ bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+ if (lengthOK && versionOK && addrSizeOK && abbrOffsetOK && abbr != NULL) {
+ Abbrevs = abbr->getAbbreviationDeclarationSet(abbrOffset);
+ return true;
+ }
+
+ // reset the offset to where we tried to parse from if anything went wrong
+ *offset_ptr = Offset;
+ }
+
+ return false;
+}
+
+uint32_t
+DWARFCompileUnit::extract(uint32_t offset, DataExtractor debug_info_data,
+ const DWARFAbbreviationDeclarationSet *abbrevs) {
+ clear();
+
+ Offset = offset;
+
+ if (debug_info_data.isValidOffset(offset)) {
+ Length = debug_info_data.getU32(&offset);
+ Version = debug_info_data.getU16(&offset);
+ bool abbrevsOK = debug_info_data.getU32(&offset) == abbrevs->getOffset();
+ Abbrevs = abbrevs;
+    AddrSize = debug_info_data.getU8(&offset);
+
+ bool versionOK = DWARFContext::isSupportedVersion(Version);
+ bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+ if (versionOK && addrSizeOK && abbrevsOK &&
+ debug_info_data.isValidOffset(offset))
+ return offset;
+ }
+ return 0;
+}
+
+void DWARFCompileUnit::clear() {
+ Offset = 0;
+ Length = 0;
+ Version = 0;
+ Abbrevs = 0;
+ AddrSize = 0;
+ BaseAddr = 0;
+ DieArray.clear();
+}
+
+void DWARFCompileUnit::dump(raw_ostream &OS) {
+ OS << format("0x%08x", Offset) << ": Compile Unit:"
+ << " length = " << format("0x%08x", Length)
+ << " version = " << format("0x%04x", Version)
+ << " abbr_offset = " << format("0x%04x", Abbrevs->getOffset())
+ << " addr_size = " << format("0x%02x", AddrSize)
+ << " (next CU at " << format("0x%08x", getNextCompileUnitOffset())
+ << ")\n";
+
+ getCompileUnitDIE(false)->dump(OS, this, -1U);
+}
+
+void DWARFCompileUnit::setDIERelations() {
+ if (DieArray.empty())
+ return;
+ DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
+ DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
+ DWARFDebugInfoEntryMinimal *curr_die;
+  // We purposely skip the last element of the array in the loop below
+  // so that we always have a valid next item.
+ for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
+ // Since our loop doesn't include the last element, we can always
+ // safely access the next die in the array.
+ DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
+
+ const DWARFAbbreviationDeclaration *curr_die_abbrev =
+ curr_die->getAbbreviationDeclarationPtr();
+
+ if (curr_die_abbrev) {
+ // Normal DIE
+ if (curr_die_abbrev->hasChildren())
+ next_die->setParent(curr_die);
+ else
+ curr_die->setSibling(next_die);
+ } else {
+ // NULL DIE that terminates a sibling chain
+ DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
+ if (parent)
+ parent->setSibling(next_die);
+ }
+ }
+
+ // Since we skipped the last element, we need to fix it up!
+ if (die_array_begin < die_array_end)
+ curr_die->setParent(die_array_begin);
+}
+
+size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) {
+ const size_t initial_die_array_size = DieArray.size();
+ if ((cu_die_only && initial_die_array_size > 0) ||
+ initial_die_array_size > 1)
+ return 0; // Already parsed
+
+ // Set the offset to that of the first DIE and calculate the start of the
+ // next compilation unit header.
+ uint32_t offset = getFirstDIEOffset();
+ uint32_t next_cu_offset = getNextCompileUnitOffset();
+
+ DWARFDebugInfoEntryMinimal die;
+  // Keep a flat array of the DIEs for binary lookup by DIE offset.
+ uint32_t depth = 0;
+ // We are in our compile unit, parse starting at the offset
+ // we were told to parse
+
+ const uint8_t *fixed_form_sizes =
+ DWARFFormValue::getFixedFormSizesForAddressSize(getAddressByteSize());
+
+ while (offset < next_cu_offset &&
+ die.extractFast(this, fixed_form_sizes, &offset)) {
+
+ if (depth == 0) {
+ uint64_t base_addr =
+ die.getAttributeValueAsUnsigned(this, DW_AT_low_pc, -1U);
+ if (base_addr == -1U)
+ base_addr = die.getAttributeValueAsUnsigned(this, DW_AT_entry_pc, 0);
+ setBaseAddress(base_addr);
+ }
+
+ if (cu_die_only) {
+ addDIE(die);
+ return 1;
+ }
+ else if (depth == 0 && initial_die_array_size == 1) {
+ // Don't append the CU die as we already did that
+ } else {
+      addDIE(die);
+ }
+
+ const DWARFAbbreviationDeclaration *abbrDecl =
+ die.getAbbreviationDeclarationPtr();
+ if (abbrDecl) {
+ // Normal DIE
+ if (abbrDecl->hasChildren())
+ ++depth;
+ } else {
+ // NULL DIE.
+ if (depth > 0)
+ --depth;
+ if (depth == 0)
+ break; // We are done with this compile unit!
+ }
+
+ }
+
+ // Give a little bit of info if we encounter corrupt DWARF (our offset
+ // should always terminate at or before the start of the next compilation
+ // unit header).
+ if (offset > next_cu_offset) {
+    fprintf(stderr, "warning: DWARF compile unit extends beyond its bounds cu 0x%8.8x at 0x%8.8x\n", getOffset(), offset);
+ }
+
+ setDIERelations();
+ return DieArray.size();
+}
+
+void DWARFCompileUnit::clearDIEs(bool keep_compile_unit_die) {
+ if (DieArray.size() > 1) {
+    // std::vector never gets any smaller when resized to a smaller size,
+    // or when clear() or erase() is called: size() will report the smaller
+    // value, but the memory allocated remains intact (call capacity() to
+    // see this). So we need to create a temporary vector and swap its
+    // contents, which swaps just the internal pointers, so that when
+    // "tmpArray" goes out of scope it will destroy the old
+    // contents.
+
+ // Save at least the compile unit DIE
+ std::vector<DWARFDebugInfoEntryMinimal> tmpArray;
+ DieArray.swap(tmpArray);
+ if (keep_compile_unit_die)
+ DieArray.push_back(tmpArray.front());
+ }
+}
+
+void
+DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+ bool clear_dies_if_already_not_parsed){
+  // This function is usually called when there is no .debug_aranges section
+  // in order to produce an accurate compile unit level set of address
+  // ranges. If the DIEs weren't already parsed, we don't want the DIEs for
+  // all compile units to stay loaded when they aren't needed, so we can
+  // end up parsing the DWARF and then throwing the DIEs away to keep
+  // memory usage down.
+ const bool clear_dies = extractDIEsIfNeeded(false) > 1;
+
+ DieArray[0].buildAddressRangeTable(this, debug_aranges);
+
+  // Keep memory usage down by clearing the DIEs if this function
+  // caused them to be parsed.
+ if (clear_dies)
+ clearDIEs(true);
+}
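
Once the DIEs are extracted and setDIERelations() has stitched up the pointers, the flat DieArray can be walked as a tree. A sketch of a depth-first visit built only on the accessors this code already uses (getFirstChild()/getSibling()); the visit() helper is illustrative:

    #include "DWARFCompileUnit.h"
    using namespace llvm;

    static void visit(const DWARFDebugInfoEntryMinimal *Die, unsigned Depth) {
      if (!Die)
        return;
      // ... inspect Die->getTag() at nesting level Depth here ...
      for (const DWARFDebugInfoEntryMinimal *Child = Die->getFirstChild();
           Child; Child = Child->getSibling())
        visit(Child, Depth + 1);
    }

    void walkUnit(DWARFCompileUnit &CU) {
      // Request a full parse (not just the compile unit DIE).
      visit(CU.getCompileUnitDIE(/*extract_cu_die_only=*/false), 0);
    }
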
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
new file mode 100644
index 0000000..d916729
--- /dev/null
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -0,0 +1,111 @@
+//===-- DWARFCompileUnit.h --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
+#define LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
+
+#include "DWARFDebugAbbrev.h"
+#include "DWARFDebugInfoEntry.h"
+#include <vector>
+
+namespace llvm {
+
+class DWARFContext;
+class raw_ostream;
+
+class DWARFCompileUnit {
+ DWARFContext &Context;
+
+ uint32_t Offset;
+ uint32_t Length;
+ uint16_t Version;
+ const DWARFAbbreviationDeclarationSet *Abbrevs;
+ uint8_t AddrSize;
+ uint64_t BaseAddr;
+  // The DIEs for this compile unit; element 0 is the compile unit DIE.
+ std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+public:
+ DWARFCompileUnit(DWARFContext &context) : Context(context) {
+ clear();
+ }
+
+ DWARFContext &getContext() const { return Context; }
+ DataExtractor getDebugInfoExtractor() const;
+
+ bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
+ uint32_t extract(uint32_t offset, DataExtractor debug_info_data,
+ const DWARFAbbreviationDeclarationSet *abbrevs);
+
+ /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
+ /// hasn't already been done.
+ size_t extractDIEsIfNeeded(bool cu_die_only);
+ void clear();
+ void dump(raw_ostream &OS);
+ uint32_t getOffset() const { return Offset; }
+ /// Size in bytes of the compile unit header.
+ uint32_t getSize() const { return 11; }
+ bool containsDIEOffset(uint32_t die_offset) const {
+ return die_offset >= getFirstDIEOffset() &&
+ die_offset < getNextCompileUnitOffset();
+ }
+ uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
+ uint32_t getNextCompileUnitOffset() const { return Offset + Length + 4; }
+ /// Size in bytes of the .debug_info data associated with this compile unit.
+ size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
+ uint32_t getLength() const { return Length; }
+ uint16_t getVersion() const { return Version; }
+ const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
+ return Abbrevs;
+ }
+ uint8_t getAddressByteSize() const { return AddrSize; }
+ uint64_t getBaseAddress() const { return BaseAddr; }
+
+ void setBaseAddress(uint64_t base_addr) {
+ BaseAddr = base_addr;
+ }
+
+ const DWARFDebugInfoEntryMinimal *
+ getCompileUnitDIE(bool extract_cu_die_only = true) {
+ extractDIEsIfNeeded(extract_cu_die_only);
+ if (DieArray.empty())
+ return NULL;
+ return &DieArray[0];
+ }
+
+ /// setDIERelations - We read in all of the DIE entries into our flat list
+ /// of DIE entries and now we need to go back through all of them and set the
+ /// parent, sibling and child pointers for quick DIE navigation.
+ void setDIERelations();
+
+ void addDIE(DWARFDebugInfoEntryMinimal &die) {
+    // The average size per DIE entry has been seen to be around
+    // 14-20 bytes, so let's pre-reserve the needed memory for
+ // our DIE entries accordingly. Search forward for "Compute
+ // average bytes per DIE" to see #if'ed out code that does
+ // that determination.
+
+    // Only reserve the memory when the array is still empty, i.e.
+    // when we are about to add the compile unit DIE. The compile
+    // unit DIE is always the first entry, so reserving at that
+    // point covers it and all of the child DIEs that follow
+    // it.
+ if (DieArray.empty())
+ DieArray.reserve(getDebugInfoSize() / 14);
+ DieArray.push_back(die);
+ }
+
+ void clearDIEs(bool keep_compile_unit_die);
+
+ void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+ bool clear_dies_if_already_not_parsed);
+};
+
+}
+
+#endif
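
The offset arithmetic above encodes the DWARF32 compile unit header layout: a 4-byte length (which does not count itself), a 2-byte version, a 4-byte abbreviations offset and a 1-byte address size, hence getSize() == 11. Worked through for a hypothetical unit at offset 0x0 whose length field reads 0x92:

    getFirstDIEOffset()        == 0x0  + 11       == 0x0b
    getNextCompileUnitOffset() == 0x0  + 0x92 + 4 == 0x96
    getDebugInfoSize()         == 0x92 + 4 - 11   == 0x8b
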
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
new file mode 100644
index 0000000..e1ac398
--- /dev/null
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -0,0 +1,167 @@
+//===-- DWARFContext.cpp --------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFContext.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+using namespace dwarf;
+
+void DWARFContext::dump(raw_ostream &OS) {
+ OS << ".debug_abbrev contents:\n";
+ getDebugAbbrev()->dump(OS);
+
+ OS << "\n.debug_info contents:\n";
+ for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i)
+ getCompileUnitAtIndex(i)->dump(OS);
+
+ OS << "\n.debug_aranges contents:\n";
+ DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
+ uint32_t offset = 0;
+ DWARFDebugArangeSet set;
+ while (set.extract(arangesData, &offset))
+ set.dump(OS);
+
+ OS << "\n.debug_lines contents:\n";
+ for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) {
+ DWARFCompileUnit *cu = getCompileUnitAtIndex(i);
+ unsigned stmtOffset =
+ cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
+ -1U);
+ if (stmtOffset != -1U) {
+ DataExtractor lineData(getLineSection(), isLittleEndian(),
+ cu->getAddressByteSize());
+ DWARFDebugLine::DumpingState state(OS);
+ DWARFDebugLine::parseStatementTable(lineData, &stmtOffset, state);
+ }
+ }
+
+ OS << "\n.debug_str contents:\n";
+ DataExtractor strData(getStringSection(), isLittleEndian(), 0);
+ offset = 0;
+ uint32_t lastOffset = 0;
+ while (const char *s = strData.getCStr(&offset)) {
+ OS << format("0x%8.8x: \"%s\"\n", lastOffset, s);
+ lastOffset = offset;
+ }
+}
+
+const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
+ if (Abbrev)
+ return Abbrev.get();
+
+ DataExtractor abbrData(getAbbrevSection(), isLittleEndian(), 0);
+
+ Abbrev.reset(new DWARFDebugAbbrev());
+ Abbrev->parse(abbrData);
+ return Abbrev.get();
+}
+
+const DWARFDebugAranges *DWARFContext::getDebugAranges() {
+ if (Aranges)
+ return Aranges.get();
+
+ DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
+
+ Aranges.reset(new DWARFDebugAranges());
+ Aranges->extract(arangesData);
+ if (Aranges->isEmpty()) // No aranges in file, generate them from the DIEs.
+ Aranges->generate(this);
+ return Aranges.get();
+}
+
+const DWARFDebugLine::LineTable *
+DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
+ if (!Line)
+ Line.reset(new DWARFDebugLine());
+
+ unsigned stmtOffset =
+ cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
+ -1U);
+ if (stmtOffset == -1U)
+ return 0; // No line table for this compile unit.
+
+ // See if the line table is cached.
+ if (const DWARFDebugLine::LineTable *lt = Line->getLineTable(stmtOffset))
+ return lt;
+
+ // We have to parse it first.
+ DataExtractor lineData(getLineSection(), isLittleEndian(),
+ cu->getAddressByteSize());
+ return Line->getOrParseLineTable(lineData, stmtOffset);
+}
+
+void DWARFContext::parseCompileUnits() {
+ uint32_t offset = 0;
+ const DataExtractor &debug_info_data = DataExtractor(getInfoSection(),
+ isLittleEndian(), 0);
+ while (debug_info_data.isValidOffset(offset)) {
+ CUs.push_back(DWARFCompileUnit(*this));
+ if (!CUs.back().extract(debug_info_data, &offset)) {
+ CUs.pop_back();
+ break;
+ }
+
+ offset = CUs.back().getNextCompileUnitOffset();
+ }
+}
+
+namespace {
+ struct OffsetComparator {
+ bool operator()(const DWARFCompileUnit &LHS,
+ const DWARFCompileUnit &RHS) const {
+ return LHS.getOffset() < RHS.getOffset();
+ }
+ bool operator()(const DWARFCompileUnit &LHS, uint32_t RHS) const {
+ return LHS.getOffset() < RHS;
+ }
+ bool operator()(uint32_t LHS, const DWARFCompileUnit &RHS) const {
+ return LHS < RHS.getOffset();
+ }
+ };
+}
+
+DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t offset) {
+ if (CUs.empty())
+ parseCompileUnits();
+
+ DWARFCompileUnit *i = std::lower_bound(CUs.begin(), CUs.end(), offset,
+ OffsetComparator());
+ if (i != CUs.end())
+ return &*i;
+ return 0;
+}
+
+DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address) {
+ // First, get the offset of the compile unit.
+ uint32_t cuOffset = getDebugAranges()->findAddress(address);
+ // Retrieve the compile unit.
+ DWARFCompileUnit *cu = getCompileUnitForOffset(cuOffset);
+ if (!cu)
+ return DILineInfo("<invalid>", 0, 0);
+ // Get the line table for this compile unit.
+ const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu);
+ if (!lineTable)
+ return DILineInfo("<invalid>", 0, 0);
+ // Get the index of the row we're looking for in the line table.
+ uint64_t hiPC =
+ cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_high_pc,
+ -1ULL);
+ uint32_t rowIndex = lineTable->lookupAddress(address, hiPC);
+ if (rowIndex == -1U)
+ return DILineInfo("<invalid>", 0, 0);
+
+  // From here, construct the DILineInfo.
+ const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex];
+ const std::string &fileName = lineTable->Prologue.FileNames[row.File-1].Name;
+
+ return DILineInfo(fileName.c_str(), row.Line, row.Column);
+}
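
getLineInfoForAddress() chains three lookups: .debug_aranges (address to CU offset), the CU list (offset to unit), and the unit's line table (address to row). A sketch of a caller; it assumes the DILineInfo accessors getFileName()/getLine() declared in DIContext.h, and treats line 0 as the not-found sentinel seen above:

    #include "DWARFContext.h" // in-tree use; external clients go via DIContext
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void printLocation(DWARFContext &Ctx, uint64_t PC) {
      DILineInfo Info = Ctx.getLineInfoForAddress(PC);
      // Failed lookups come back as ("<invalid>", 0, 0) rather than an error.
      if (Info.getLine() == 0)
        errs() << "no line info for address " << PC << '\n';
      else
        outs() << Info.getFileName() << ':' << Info.getLine() << '\n';
    }
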
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
new file mode 100644
index 0000000..746a463
--- /dev/null
+++ b/lib/DebugInfo/DWARFContext.h
@@ -0,0 +1,118 @@
+//===-- DWARFContext.h ------------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFCONTEXT_H
+#define LLVM_DEBUGINFO_DWARFCONTEXT_H
+
+#include "DWARFCompileUnit.h"
+#include "DWARFDebugAranges.h"
+#include "DWARFDebugLine.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+/// DWARFContext
+/// This data structure is the top-level entity that deals with DWARF debug
+/// information parsing. The actual data is supplied through pure virtual
+/// methods that a concrete implementation provides.
+class DWARFContext : public DIContext {
+ bool IsLittleEndian;
+
+ SmallVector<DWARFCompileUnit, 1> CUs;
+ OwningPtr<DWARFDebugAbbrev> Abbrev;
+ OwningPtr<DWARFDebugAranges> Aranges;
+ OwningPtr<DWARFDebugLine> Line;
+
+ DWARFContext(DWARFContext &); // = delete
+ DWARFContext &operator=(DWARFContext &); // = delete
+
+ /// Read compile units from the debug_info section and store them in CUs.
+ void parseCompileUnits();
+protected:
+ DWARFContext(bool isLittleEndian) : IsLittleEndian(isLittleEndian) {}
+public:
+ virtual void dump(raw_ostream &OS);
+ /// Get the number of compile units in this context.
+ unsigned getNumCompileUnits() {
+ if (CUs.empty())
+ parseCompileUnits();
+ return CUs.size();
+ }
+ /// Get the compile unit at the specified index for this compile unit.
+ DWARFCompileUnit *getCompileUnitAtIndex(unsigned index) {
+ if (CUs.empty())
+ parseCompileUnits();
+ return &CUs[index];
+ }
+
+ /// Return the compile unit that includes an offset (relative to .debug_info).
+ DWARFCompileUnit *getCompileUnitForOffset(uint32_t offset);
+
+ /// Get a pointer to the parsed DebugAbbrev object.
+ const DWARFDebugAbbrev *getDebugAbbrev();
+
+ /// Get a pointer to the parsed DebugAranges object.
+ const DWARFDebugAranges *getDebugAranges();
+
+ /// Get a pointer to a parsed line table corresponding to a compile unit.
+ const DWARFDebugLine::LineTable *
+ getLineTableForCompileUnit(DWARFCompileUnit *cu);
+
+ virtual DILineInfo getLineInfoForAddress(uint64_t address);
+
+ bool isLittleEndian() const { return IsLittleEndian; }
+
+ virtual StringRef getInfoSection() = 0;
+ virtual StringRef getAbbrevSection() = 0;
+ virtual StringRef getARangeSection() = 0;
+ virtual StringRef getLineSection() = 0;
+ virtual StringRef getStringSection() = 0;
+
+ static bool isSupportedVersion(unsigned version) {
+ return version == 2 || version == 3;
+ }
+};
+
+
+/// DWARFContextInMemory is the simplest possible implementation of a
+/// DWARFContext. It assumes all content is available in memory and stores
+/// pointers to it.
+class DWARFContextInMemory : public DWARFContext {
+ StringRef InfoSection;
+ StringRef AbbrevSection;
+ StringRef ARangeSection;
+ StringRef LineSection;
+ StringRef StringSection;
+public:
+ DWARFContextInMemory(bool isLittleEndian,
+ StringRef infoSection,
+ StringRef abbrevSection,
+ StringRef aRangeSection,
+ StringRef lineSection,
+ StringRef stringSection)
+ : DWARFContext(isLittleEndian),
+ InfoSection(infoSection),
+ AbbrevSection(abbrevSection),
+ ARangeSection(aRangeSection),
+ LineSection(lineSection),
+ StringSection(stringSection)
+ {}
+
+ virtual StringRef getInfoSection() { return InfoSection; }
+ virtual StringRef getAbbrevSection() { return AbbrevSection; }
+ virtual StringRef getARangeSection() { return ARangeSection; }
+ virtual StringRef getLineSection() { return LineSection; }
+ virtual StringRef getStringSection() { return StringSection; }
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARFDebugAbbrev.cpp
new file mode 100644
index 0000000..a11ae3f
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugAbbrev.cpp
@@ -0,0 +1,106 @@
+//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugAbbrev.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+bool DWARFAbbreviationDeclarationSet::extract(DataExtractor data,
+ uint32_t* offset_ptr) {
+ const uint32_t beginOffset = *offset_ptr;
+ Offset = beginOffset;
+ clear();
+ DWARFAbbreviationDeclaration abbrevDeclaration;
+  uint32_t prevAbbrCode = 0;
+ while (abbrevDeclaration.extract(data, offset_ptr)) {
+ Decls.push_back(abbrevDeclaration);
+ if (IdxOffset == 0) {
+ IdxOffset = abbrevDeclaration.getCode();
+ } else {
+      if (prevAbbrCode + 1 != abbrevDeclaration.getCode())
+        IdxOffset = UINT32_MAX; // Out-of-order indexes; can't do O(1) lookups.
+ }
+    prevAbbrCode = abbrevDeclaration.getCode();
+ }
+ return beginOffset != *offset_ptr;
+}
+
+void DWARFAbbreviationDeclarationSet::dump(raw_ostream &OS) const {
+ for (unsigned i = 0, e = Decls.size(); i != e; ++i)
+ Decls[i].dump(OS);
+}
+
+const DWARFAbbreviationDeclaration*
+DWARFAbbreviationDeclarationSet::getAbbreviationDeclaration(uint32_t abbrCode)
+ const {
+ if (IdxOffset == UINT32_MAX) {
+ DWARFAbbreviationDeclarationCollConstIter pos;
+ DWARFAbbreviationDeclarationCollConstIter end = Decls.end();
+ for (pos = Decls.begin(); pos != end; ++pos) {
+ if (pos->getCode() == abbrCode)
+ return &(*pos);
+ }
+ } else {
+ uint32_t idx = abbrCode - IdxOffset;
+ if (idx < Decls.size())
+ return &Decls[idx];
+ }
+ return NULL;
+}
+
+DWARFDebugAbbrev::DWARFDebugAbbrev() :
+ AbbrevCollMap(),
+ PrevAbbrOffsetPos(AbbrevCollMap.end()) {}
+
+
+void DWARFDebugAbbrev::parse(DataExtractor data) {
+ uint32_t offset = 0;
+
+ while (data.isValidOffset(offset)) {
+ uint32_t initial_cu_offset = offset;
+ DWARFAbbreviationDeclarationSet abbrevDeclSet;
+
+ if (abbrevDeclSet.extract(data, &offset))
+ AbbrevCollMap[initial_cu_offset] = abbrevDeclSet;
+ else
+ break;
+ }
+ PrevAbbrOffsetPos = AbbrevCollMap.end();
+}
+
+void DWARFDebugAbbrev::dump(raw_ostream &OS) const {
+ if (AbbrevCollMap.empty()) {
+ OS << "< EMPTY >\n";
+ return;
+ }
+
+ DWARFAbbreviationDeclarationCollMapConstIter pos;
+ for (pos = AbbrevCollMap.begin(); pos != AbbrevCollMap.end(); ++pos) {
+ OS << format("Abbrev table for offset: 0x%8.8x\n", pos->first);
+ pos->second.dump(OS);
+ }
+}
+
+const DWARFAbbreviationDeclarationSet*
+DWARFDebugAbbrev::getAbbreviationDeclarationSet(uint64_t cu_abbr_offset) const {
+ DWARFAbbreviationDeclarationCollMapConstIter end = AbbrevCollMap.end();
+ DWARFAbbreviationDeclarationCollMapConstIter pos;
+ if (PrevAbbrOffsetPos != end &&
+ PrevAbbrOffsetPos->first == cu_abbr_offset) {
+ return &(PrevAbbrOffsetPos->second);
+ } else {
+ pos = AbbrevCollMap.find(cu_abbr_offset);
+ PrevAbbrOffsetPos = pos;
+ }
+
+ if (pos != AbbrevCollMap.end())
+ return &(pos->second);
+ return NULL;
+}
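
PrevAbbrOffsetPos is a one-entry cache: consecutive compile units usually point at the same abbreviation table, so repeat lookups skip the map search. A sketch of the parse-then-lookup flow, assuming little-endian input already sliced into a StringRef; dumpOneAbbrev() is illustrative:

    #include "DWARFDebugAbbrev.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void dumpOneAbbrev(StringRef DebugAbbrevSection, uint64_t CUAbbrevOffset,
                       uint32_t AbbrCode) {
      DWARFDebugAbbrev Abbrev;
      // Address size is irrelevant here; .debug_abbrev holds no addresses.
      Abbrev.parse(DataExtractor(DebugAbbrevSection, /*isLittleEndian=*/true, 0));
      // The first lookup warms PrevAbbrOffsetPos; repeats at this offset hit it.
      if (const DWARFAbbreviationDeclarationSet *Set =
              Abbrev.getAbbreviationDeclarationSet(CUAbbrevOffset))
        if (const DWARFAbbreviationDeclaration *Decl =
                Set->getAbbreviationDeclaration(AbbrCode))
          Decl->dump(outs());
    }
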
diff --git a/lib/DebugInfo/DWARFDebugAbbrev.h b/lib/DebugInfo/DWARFDebugAbbrev.h
new file mode 100644
index 0000000..03189b1
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugAbbrev.h
@@ -0,0 +1,73 @@
+//===-- DWARFDebugAbbrev.h --------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
+#define LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
+
+#include "DWARFAbbreviationDeclaration.h"
+#include <list>
+#include <map>
+#include <vector>
+
+namespace llvm {
+
+typedef std::vector<DWARFAbbreviationDeclaration>
+ DWARFAbbreviationDeclarationColl;
+typedef DWARFAbbreviationDeclarationColl::iterator
+ DWARFAbbreviationDeclarationCollIter;
+typedef DWARFAbbreviationDeclarationColl::const_iterator
+ DWARFAbbreviationDeclarationCollConstIter;
+
+class DWARFAbbreviationDeclarationSet {
+ uint64_t Offset;
+ uint32_t IdxOffset;
+ std::vector<DWARFAbbreviationDeclaration> Decls;
+ public:
+ DWARFAbbreviationDeclarationSet()
+ : Offset(0), IdxOffset(0) {}
+
+ DWARFAbbreviationDeclarationSet(uint64_t offset, uint32_t idxOffset)
+ : Offset(offset), IdxOffset(idxOffset) {}
+
+ void clear() {
+ IdxOffset = 0;
+ Decls.clear();
+ }
+ uint64_t getOffset() const { return Offset; }
+ void dump(raw_ostream &OS) const;
+ bool extract(DataExtractor data, uint32_t* offset_ptr);
+
+ const DWARFAbbreviationDeclaration *
+ getAbbreviationDeclaration(uint32_t abbrCode) const;
+};
+
+class DWARFDebugAbbrev {
+public:
+ typedef std::map<uint64_t, DWARFAbbreviationDeclarationSet>
+ DWARFAbbreviationDeclarationCollMap;
+ typedef DWARFAbbreviationDeclarationCollMap::iterator
+ DWARFAbbreviationDeclarationCollMapIter;
+ typedef DWARFAbbreviationDeclarationCollMap::const_iterator
+ DWARFAbbreviationDeclarationCollMapConstIter;
+
+private:
+ DWARFAbbreviationDeclarationCollMap AbbrevCollMap;
+ mutable DWARFAbbreviationDeclarationCollMapConstIter PrevAbbrOffsetPos;
+
+public:
+ DWARFDebugAbbrev();
+ const DWARFAbbreviationDeclarationSet *
+ getAbbreviationDeclarationSet(uint64_t cu_abbr_offset) const;
+ void dump(raw_ostream &OS) const;
+ void parse(DataExtractor data);
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARFDebugArangeSet.cpp
new file mode 100644
index 0000000..b0c0354
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugArangeSet.cpp
@@ -0,0 +1,150 @@
+//===-- DWARFDebugArangeSet.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugArangeSet.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+using namespace llvm;
+
+void DWARFDebugArangeSet::clear() {
+ Offset = -1U;
+ std::memset(&Header, 0, sizeof(Header));
+ ArangeDescriptors.clear();
+}
+
+void DWARFDebugArangeSet::compact() {
+ if (ArangeDescriptors.empty())
+ return;
+
+ // Iterate through all arange descriptors and combine any ranges that
+ // overlap or have matching boundaries. The ArangeDescriptors are assumed
+ // to be in ascending order.
+ uint32_t i = 0;
+ while (i + 1 < ArangeDescriptors.size()) {
+ if (ArangeDescriptors[i].getEndAddress() >= ArangeDescriptors[i+1].Address){
+ // The current range ends at or exceeds the start of the next address
+ // range. Compute the max end address between the two and use that to
+ // make the new length.
+ const uint64_t max_end_addr =
+ std::max(ArangeDescriptors[i].getEndAddress(),
+ ArangeDescriptors[i+1].getEndAddress());
+ ArangeDescriptors[i].Length = max_end_addr - ArangeDescriptors[i].Address;
+ // Now remove the next entry as it was just combined with the previous one
+ ArangeDescriptors.erase(ArangeDescriptors.begin()+i+1);
+ } else {
+ // Discontiguous address range, just proceed to the next one.
+ ++i;
+ }
+ }
+}
+
+bool
+DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) {
+ if (data.isValidOffset(*offset_ptr)) {
+ ArangeDescriptors.clear();
+ Offset = *offset_ptr;
+
+ // 7.20 Address Range Table
+ //
+ // Each set of entries in the table of address ranges contained in
+ // the .debug_aranges section begins with a header consisting of: a
+ // 4-byte length containing the length of the set of entries for this
+ // compilation unit, not including the length field itself; a 2-byte
+ // version identifier containing the value 2 for DWARF Version 2; a
+    // 4-byte offset into the .debug_info section; a 1-byte unsigned integer
+ // containing the size in bytes of an address (or the offset portion of
+ // an address for segmented addressing) on the target system; and a
+ // 1-byte unsigned integer containing the size in bytes of a segment
+ // descriptor on the target system. This header is followed by a series
+ // of tuples. Each tuple consists of an address and a length, each in
+ // the size appropriate for an address on the target architecture.
+ Header.Length = data.getU32(offset_ptr);
+ Header.Version = data.getU16(offset_ptr);
+ Header.CuOffset = data.getU32(offset_ptr);
+ Header.AddrSize = data.getU8(offset_ptr);
+ Header.SegSize = data.getU8(offset_ptr);
+
+ // Perform basic validation of the header fields.
+ if (!data.isValidOffsetForDataOfSize(Offset, Header.Length) ||
+ (Header.AddrSize != 4 && Header.AddrSize != 8)) {
+ clear();
+ return false;
+ }
+
+ // The first tuple following the header in each set begins at an offset
+ // that is a multiple of the size of a single tuple (that is, twice the
+ // size of an address). The header is padded, if necessary, to the
+ // appropriate boundary.
+ const uint32_t header_size = *offset_ptr - Offset;
+ const uint32_t tuple_size = Header.AddrSize * 2;
+ uint32_t first_tuple_offset = 0;
+ while (first_tuple_offset < header_size)
+ first_tuple_offset += tuple_size;
+
+ *offset_ptr = Offset + first_tuple_offset;
+
+ Descriptor arangeDescriptor;
+
+ assert(sizeof(arangeDescriptor.Address) == sizeof(arangeDescriptor.Length));
+ assert(sizeof(arangeDescriptor.Address) >= Header.AddrSize);
+
+ while (data.isValidOffset(*offset_ptr)) {
+ arangeDescriptor.Address = data.getUnsigned(offset_ptr, Header.AddrSize);
+ arangeDescriptor.Length = data.getUnsigned(offset_ptr, Header.AddrSize);
+
+ // Each set of tuples is terminated by a 0 for the address and 0
+ // for the length.
+ if (arangeDescriptor.Address || arangeDescriptor.Length)
+ ArangeDescriptors.push_back(arangeDescriptor);
+ else
+ break; // We are done if we get a zero address and length
+ }
+
+ return !ArangeDescriptors.empty();
+ }
+ return false;
+}
+
+void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
+ OS << format("Address Range Header: length = 0x%8.8x, version = 0x%4.4x, ",
+ Header.Length, Header.Version)
+ << format("cu_offset = 0x%8.8x, addr_size = 0x%2.2x, seg_size = 0x%2.2x\n",
+ Header.CuOffset, Header.AddrSize, Header.SegSize);
+
+ const uint32_t hex_width = Header.AddrSize * 2;
+ for (DescriptorConstIter pos = ArangeDescriptors.begin(),
+ end = ArangeDescriptors.end(); pos != end; ++pos)
+ OS << format("[0x%*.*llx -", hex_width, hex_width, pos->Address)
+ << format(" 0x%*.*llx)\n", hex_width, hex_width, pos->getEndAddress());
+}
+
+
+namespace {
+ class DescriptorContainsAddress {
+ const uint64_t Address;
+ public:
+ DescriptorContainsAddress(uint64_t address) : Address(address) {}
+ bool operator()(const DWARFDebugArangeSet::Descriptor &desc) const {
+ return Address >= desc.Address && Address < (desc.Address + desc.Length);
+ }
+ };
+}
+
+uint32_t DWARFDebugArangeSet::findAddress(uint64_t address) const {
+ DescriptorConstIter end = ArangeDescriptors.end();
+ DescriptorConstIter pos =
+ std::find_if(ArangeDescriptors.begin(), end, // Range
+ DescriptorContainsAddress(address)); // Predicate
+ if (pos != end)
+ return Header.CuOffset;
+
+ return -1U;
+}
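
The padding loop in extract() just rounds the fixed 12-byte header (4 + 2 + 4 + 1 + 1) up to the next multiple of the tuple size, as the spec quote above requires. Worked through for the two accepted address sizes:

    AddrSize == 4: tuple_size = 8,  first_tuple_offset = 16
    AddrSize == 8: tuple_size = 16, first_tuple_offset = 16
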
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.h b/lib/DebugInfo/DWARFDebugArangeSet.h
new file mode 100644
index 0000000..9a2a6d0
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugArangeSet.h
@@ -0,0 +1,75 @@
+//===-- DWARFDebugArangeSet.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
+#define LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
+
+#include "llvm/Support/DataExtractor.h"
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugArangeSet {
+public:
+ struct Header {
+ // The total length of the entries for that set, not including the length
+ // field itself.
+ uint32_t Length;
+ // The offset from the beginning of the .debug_info section of the
+ // compilation unit entry referenced by the table.
+ uint32_t CuOffset;
+ // The DWARF version number.
+ uint16_t Version;
+ // The size in bytes of an address on the target architecture. For segmented
+ // addressing, this is the size of the offset portion of the address.
+ uint8_t AddrSize;
+ // The size in bytes of a segment descriptor on the target architecture.
+ // If the target system uses a flat address space, this value is 0.
+ uint8_t SegSize;
+ };
+
+ struct Descriptor {
+ uint64_t Address;
+ uint64_t Length;
+ uint64_t getEndAddress() const { return Address + Length; }
+ };
+
+private:
+ typedef std::vector<Descriptor> DescriptorColl;
+ typedef DescriptorColl::iterator DescriptorIter;
+ typedef DescriptorColl::const_iterator DescriptorConstIter;
+
+ uint32_t Offset;
+ Header Header;
+ DescriptorColl ArangeDescriptors;
+
+public:
+ DWARFDebugArangeSet() { clear(); }
+ void clear();
+ void compact();
+ bool extract(DataExtractor data, uint32_t *offset_ptr);
+ void dump(raw_ostream &OS) const;
+
+ uint32_t getCompileUnitDIEOffset() const { return Header.CuOffset; }
+ uint32_t getOffsetOfNextEntry() const { return Offset + Header.Length + 4; }
+ uint32_t findAddress(uint64_t address) const;
+ uint32_t getNumDescriptors() const { return ArangeDescriptors.size(); }
+ const struct Header &getHeader() const { return Header; }
+ const Descriptor *getDescriptor(uint32_t i) const {
+ if (i < ArangeDescriptors.size())
+ return &ArangeDescriptors[i];
+ return NULL;
+ }
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
new file mode 100644
index 0000000..576d37d
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -0,0 +1,223 @@
+//===-- DWARFDebugAranges.cpp -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugAranges.h"
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+using namespace llvm;
+
+// Comparison function for DWARFDebugAranges::Range structures.
+static bool RangeLessThan(const DWARFDebugAranges::Range &range1,
+ const DWARFDebugAranges::Range &range2) {
+ return range1.LoPC < range2.LoPC;
+}
+
+namespace {
+ class CountArangeDescriptors {
+ public:
+ CountArangeDescriptors(uint32_t &count_ref) : Count(count_ref) {}
+ void operator()(const DWARFDebugArangeSet &set) {
+ Count += set.getNumDescriptors();
+ }
+ uint32_t &Count;
+ };
+
+ class AddArangeDescriptors {
+ public:
+ AddArangeDescriptors(DWARFDebugAranges::RangeColl &ranges)
+ : RangeCollection(ranges) {}
+ void operator()(const DWARFDebugArangeSet& set) {
+ const DWARFDebugArangeSet::Descriptor* arange_desc_ptr;
+ DWARFDebugAranges::Range range;
+ range.Offset = set.getCompileUnitDIEOffset();
+
+ for (uint32_t i=0; (arange_desc_ptr = set.getDescriptor(i)) != NULL; ++i){
+ range.LoPC = arange_desc_ptr->Address;
+ range.Length = arange_desc_ptr->Length;
+
+ // Insert each item in increasing address order so binary searching
+ // can later be done!
+ DWARFDebugAranges::RangeColl::iterator insert_pos =
+ std::lower_bound(RangeCollection.begin(), RangeCollection.end(),
+ range, RangeLessThan);
+ RangeCollection.insert(insert_pos, range);
+ }
+ }
+ DWARFDebugAranges::RangeColl& RangeCollection;
+ };
+}
+
+bool DWARFDebugAranges::extract(DataExtractor debug_aranges_data) {
+ if (debug_aranges_data.isValidOffset(0)) {
+ uint32_t offset = 0;
+
+ typedef std::vector<DWARFDebugArangeSet> SetCollection;
+ typedef SetCollection::const_iterator SetCollectionIter;
+ SetCollection sets;
+
+ DWARFDebugArangeSet set;
+ Range range;
+ while (set.extract(debug_aranges_data, &offset))
+ sets.push_back(set);
+
+ uint32_t count = 0;
+
+ std::for_each(sets.begin(), sets.end(), CountArangeDescriptors(count));
+
+ if (count > 0) {
+ Aranges.reserve(count);
+ AddArangeDescriptors range_adder(Aranges);
+ std::for_each(sets.begin(), sets.end(), range_adder);
+ }
+ }
+ return false;
+}
+
+bool DWARFDebugAranges::generate(DWARFContext *ctx) {
+ clear();
+ if (ctx) {
+ const uint32_t num_compile_units = ctx->getNumCompileUnits();
+ for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) {
+ DWARFCompileUnit *cu = ctx->getCompileUnitAtIndex(cu_idx);
+ if (cu)
+ cu->buildAddressRangeTable(this, true);
+ }
+ }
+ return !isEmpty();
+}
+
+void DWARFDebugAranges::dump(raw_ostream &OS) const {
+ const uint32_t num_ranges = getNumRanges();
+ for (uint32_t i = 0; i < num_ranges; ++i) {
+ const Range &range = Aranges[i];
+ OS << format("0x%8.8x: [0x%8.8llx - 0x%8.8llx)\n", range.Offset,
+ (uint64_t)range.LoPC, (uint64_t)range.HiPC());
+ }
+}
+
+void DWARFDebugAranges::Range::dump(raw_ostream &OS) const {
+ OS << format("{0x%8.8x}: [0x%8.8llx - 0x%8.8llx)\n", Offset, LoPC, HiPC());
+}
+
+void DWARFDebugAranges::appendRange(uint32_t offset, uint64_t low_pc,
+ uint64_t high_pc) {
+ if (!Aranges.empty()) {
+ if (Aranges.back().Offset == offset && Aranges.back().HiPC() == low_pc) {
+ Aranges.back().setHiPC(high_pc);
+ return;
+ }
+ }
+ Aranges.push_back(Range(low_pc, high_pc, offset));
+}
+
+void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
+ const size_t orig_arange_size = Aranges.size();
+ // Size of one? If so, no sorting is needed
+ if (orig_arange_size <= 1)
+ return;
+ // Sort our address range entries
+ std::stable_sort(Aranges.begin(), Aranges.end(), RangeLessThan);
+
+ if (!minimize)
+ return;
+
+ // Most address ranges are contiguous from function to function
+  // so our new ranges will likely be smaller. We calculate the size
+  // of the new ranges up front because, although std::vector objects
+  // can be resized, they will never reduce their allocated block size
+  // or free any excess memory, so we might as well start a brand new
+  // collection so it is as small as possible.
+
+ // First calculate the size of the new minimal arange vector
+ // so we don't have to do a bunch of re-allocations as we
+ // copy the new minimal stuff over to the new collection.
+ size_t minimal_size = 1;
+ for (size_t i = 1; i < orig_arange_size; ++i) {
+ if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i], n))
+ ++minimal_size;
+ }
+
+ // If the sizes are the same, then no consecutive aranges can be
+ // combined, we are done.
+ if (minimal_size == orig_arange_size)
+ return;
+
+ // Else, make a new RangeColl that _only_ contains what we need.
+ RangeColl minimal_aranges;
+ minimal_aranges.resize(minimal_size);
+ uint32_t j = 0;
+ minimal_aranges[j] = Aranges[0];
+ for (size_t i = 1; i < orig_arange_size; ++i) {
+    if (Range::SortedOverlapCheck(minimal_aranges[j], Aranges[i], n)) {
+      minimal_aranges[j].setHiPC(Aranges[i].HiPC());
+ } else {
+ // Only increment j if we aren't merging
+ minimal_aranges[++j] = Aranges[i];
+ }
+ }
+  assert(j + 1 == minimal_size);
+
+ // Now swap our new minimal aranges into place. The local
+  // minimal_aranges will then contain the old big collection
+ // which will get freed.
+ minimal_aranges.swap(Aranges);
+}
+
+uint32_t DWARFDebugAranges::findAddress(uint64_t address) const {
+ if (!Aranges.empty()) {
+ Range range(address);
+ RangeCollIterator begin = Aranges.begin();
+ RangeCollIterator end = Aranges.end();
+ RangeCollIterator pos = lower_bound(begin, end, range, RangeLessThan);
+
+ if (pos != end && pos->LoPC <= address && address < pos->HiPC()) {
+ return pos->Offset;
+ } else if (pos != begin) {
+ --pos;
+ if (pos->LoPC <= address && address < pos->HiPC())
+ return (*pos).Offset;
+ }
+ }
+ return -1U;
+}
+
+bool
+DWARFDebugAranges::allRangesAreContiguous(uint64_t &LoPC, uint64_t &HiPC) const{
+ if (Aranges.empty())
+ return false;
+
+ uint64_t next_addr = 0;
+ RangeCollIterator begin = Aranges.begin();
+ for (RangeCollIterator pos = begin, end = Aranges.end(); pos != end;
+ ++pos) {
+ if (pos != begin && pos->LoPC != next_addr)
+ return false;
+ next_addr = pos->HiPC();
+ }
+ // We checked for empty at the start of function so front() will be valid.
+ LoPC = Aranges.front().LoPC;
+ // We checked for empty at the start of function so back() will be valid.
+ HiPC = Aranges.back().HiPC();
+ return true;
+}
+
+bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const {
+ if (Aranges.empty())
+ return false;
+ // We checked for empty at the start of function so front() will be valid.
+ LoPC = Aranges.front().LoPC;
+ // We checked for empty at the start of function so back() will be valid.
+ HiPC = Aranges.back().HiPC();
+ return true;
+}
+
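
buildAddressRangeTable() drives this class through the two-phase pattern the header comments describe: many appendRange() calls, then one sort(). A standalone sketch with made-up pc values and CU offset:

    #include "DWARFDebugAranges.h"
    using namespace llvm;

    void buildExample() {
      DWARFDebugAranges Aranges;
      // Phase 1: append; an append whose low_pc equals the previous range's
      // HiPC() for the same CU offset is merged immediately.
      Aranges.appendRange(/*cu_offset=*/0x0b, /*low_pc=*/0x1000,
                          /*high_pc=*/0x1020);
      Aranges.appendRange(0x0b, 0x1020, 0x1044); // merged into [0x1000, 0x1044)
      Aranges.appendRange(0x0b, 0x2000, 0x2010);
      // Phase 2: sort by LoPC and coalesce ranges separated by at most n bytes.
      Aranges.sort(/*minimize=*/true, /*n=*/0);
      uint32_t CUOffset = Aranges.findAddress(0x1030); // yields 0x0b
      (void)CUOffset;
    }
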
diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h
new file mode 100644
index 0000000..12afb60
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugAranges.h
@@ -0,0 +1,98 @@
+//===-- DWARFDebugAranges.h -------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
+#define LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
+
+#include "DWARFDebugArangeSet.h"
+#include <list>
+
+namespace llvm {
+
+class DWARFContext;
+
+class DWARFDebugAranges {
+public:
+ struct Range {
+ explicit Range(uint64_t lo = -1ULL, uint64_t hi = -1ULL,
+ uint32_t off = -1U)
+ : LoPC(lo), Length(hi-lo), Offset(off) {}
+
+ void clear() {
+ LoPC = -1ULL;
+ Length = 0;
+ Offset = -1U;
+ }
+
+ void setHiPC(uint64_t HiPC) {
+ if (HiPC == -1ULL || HiPC <= LoPC)
+ Length = 0;
+ else
+ Length = HiPC - LoPC;
+ }
+ uint64_t HiPC() const {
+ if (Length)
+ return LoPC + Length;
+ return -1ULL;
+ }
+ bool isValidRange() const { return Length > 0; }
+
+ static bool SortedOverlapCheck(const Range &curr_range,
+ const Range &next_range, uint32_t n) {
+ if (curr_range.Offset != next_range.Offset)
+ return false;
+ return curr_range.HiPC() + n >= next_range.LoPC;
+ }
+
+ bool contains(const Range &range) const {
+ return LoPC <= range.LoPC && range.HiPC() <= HiPC();
+ }
+
+ void dump(raw_ostream &OS) const;
+ uint64_t LoPC; // Start of address range
+    uint32_t Length; // Length of address range in bytes
+ uint32_t Offset; // Offset of the compile unit or die
+ };
+
+ void clear() { Aranges.clear(); }
+ bool allRangesAreContiguous(uint64_t& LoPC, uint64_t& HiPC) const;
+ bool getMaxRange(uint64_t& LoPC, uint64_t& HiPC) const;
+ bool extract(DataExtractor debug_aranges_data);
+ bool generate(DWARFContext *ctx);
+
+  // Use appendRange() multiple times and then call sort().
+ void appendRange(uint32_t cu_offset, uint64_t low_pc, uint64_t high_pc);
+ void sort(bool minimize, uint32_t n);
+
+ const Range *rangeAtIndex(uint32_t idx) const {
+ if (idx < Aranges.size())
+ return &Aranges[idx];
+ return NULL;
+ }
+ void dump(raw_ostream &OS) const;
+ uint32_t findAddress(uint64_t address) const;
+ bool isEmpty() const { return Aranges.empty(); }
+ uint32_t getNumRanges() const { return Aranges.size(); }
+
+ uint32_t offsetAtIndex(uint32_t idx) const {
+ if (idx < Aranges.size())
+ return Aranges[idx].Offset;
+ return -1U;
+ }
+
+ typedef std::vector<Range> RangeColl;
+ typedef RangeColl::const_iterator RangeCollIterator;
+
+private:
+ RangeColl Aranges;
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
new file mode 100644
index 0000000..1b089ad
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -0,0 +1,444 @@
+//===-- DWARFDebugInfoEntry.cpp --------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugInfoEntry.h"
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "DWARFDebugAbbrev.h"
+#include "DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+using namespace dwarf;
+
+void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
+ const DWARFCompileUnit *cu,
+ unsigned recurseDepth,
+ unsigned indent) const {
+ DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+ uint32_t offset = Offset;
+
+ if (debug_info_data.isValidOffset(offset)) {
+ uint64_t abbrCode = debug_info_data.getULEB128(&offset);
+
+ OS << format("\n0x%8.8x: ", Offset);
+ if (abbrCode) {
+ if (AbbrevDecl) {
+ const char *tagString = TagString(getTag());
+ if (tagString)
+ OS.indent(indent) << tagString;
+ else
+ OS.indent(indent) << format("DW_TAG_Unknown_%x", getTag());
+ OS << format(" [%u] %c\n", abbrCode,
+ AbbrevDecl->hasChildren() ? '*' : ' ');
+
+ // Dump all data in the .debug_info for the attributes
+ const uint32_t numAttributes = AbbrevDecl->getNumAttributes();
+ for (uint32_t i = 0; i != numAttributes; ++i) {
+ uint16_t attr = AbbrevDecl->getAttrByIndex(i);
+ uint16_t form = AbbrevDecl->getFormByIndex(i);
+ dumpAttribute(OS, cu, &offset, attr, form, indent);
+ }
+
+ const DWARFDebugInfoEntryMinimal *child = getFirstChild();
+ if (recurseDepth > 0 && child) {
+ while (child) {
+ child->dump(OS, cu, recurseDepth-1, indent+2);
+ child = child->getSibling();
+ }
+ }
+ } else {
+ OS << "Abbreviation code not found in 'debug_abbrev' class for code: "
+ << abbrCode << '\n';
+ }
+ } else {
+ OS.indent(indent) << "NULL\n";
+ }
+ }
+}
+
+void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
+ const DWARFCompileUnit *cu,
+ uint32_t* offset_ptr,
+ uint16_t attr,
+ uint16_t form,
+ unsigned indent) const {
+ OS << format("0x%8.8x: ", *offset_ptr);
+ OS.indent(indent+2);
+ const char *attrString = AttributeString(attr);
+ if (attrString)
+ OS << attrString;
+ else
+ OS << format("DW_AT_Unknown_%x", attr);
+ const char *formString = FormEncodingString(form);
+ if (formString)
+ OS << " [" << formString << ']';
+ else
+ OS << format(" [DW_FORM_Unknown_%x]", form);
+
+ DWARFFormValue formValue(form);
+
+ if (!formValue.extractValue(cu->getDebugInfoExtractor(), offset_ptr, cu))
+ return;
+
+ OS << "\t(";
+ formValue.dump(OS, cu);
+ OS << ")\n";
+}
+
+bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
+ const uint8_t *fixed_form_sizes,
+ uint32_t *offset_ptr) {
+ Offset = *offset_ptr;
+
+ DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+ uint64_t abbrCode = debug_info_data.getULEB128(offset_ptr);
+
+ assert (fixed_form_sizes); // For best performance this should be specified!
+
+ if (abbrCode) {
+ uint32_t offset = *offset_ptr;
+
+ AbbrevDecl = cu->getAbbreviations()->getAbbreviationDeclaration(abbrCode);
+
+ // Skip all data in the .debug_info for the attributes
+ const uint32_t numAttributes = AbbrevDecl->getNumAttributes();
+ uint32_t i;
+ uint16_t form;
+ for (i=0; i<numAttributes; ++i) {
+ form = AbbrevDecl->getFormByIndex(i);
+
+ const uint8_t fixed_skip_size = fixed_form_sizes[form];
+ if (fixed_skip_size)
+ offset += fixed_skip_size;
+ else {
+ bool form_is_indirect = false;
+ do {
+ form_is_indirect = false;
+ uint32_t form_size = 0;
+ switch (form) {
+          // Blocks of inlined data that have a length field and the data bytes
+          // inlined in the .debug_info.
+ case DW_FORM_block:
+ form_size = debug_info_data.getULEB128(&offset);
+ break;
+ case DW_FORM_block1:
+ form_size = debug_info_data.getU8(&offset);
+ break;
+ case DW_FORM_block2:
+ form_size = debug_info_data.getU16(&offset);
+ break;
+ case DW_FORM_block4:
+ form_size = debug_info_data.getU32(&offset);
+ break;
+
+ // Inlined NULL terminated C-strings
+ case DW_FORM_string:
+ debug_info_data.getCStr(&offset);
+ break;
+
+ // Compile unit address sized values
+ case DW_FORM_addr:
+ case DW_FORM_ref_addr:
+ form_size = cu->getAddressByteSize();
+ break;
+
+ // 1 byte values
+ case DW_FORM_data1:
+ case DW_FORM_flag:
+ case DW_FORM_ref1:
+ form_size = 1;
+ break;
+
+ // 2 byte values
+ case DW_FORM_data2:
+ case DW_FORM_ref2:
+ form_size = 2;
+ break;
+
+ // 4 byte values
+ case DW_FORM_strp:
+ case DW_FORM_data4:
+ case DW_FORM_ref4:
+ form_size = 4;
+ break;
+
+ // 8 byte values
+ case DW_FORM_data8:
+ case DW_FORM_ref8:
+ form_size = 8;
+ break;
+
+ // signed or unsigned LEB 128 values
+ case DW_FORM_sdata:
+ case DW_FORM_udata:
+ case DW_FORM_ref_udata:
+ debug_info_data.getULEB128(&offset);
+ break;
+
+ case DW_FORM_indirect:
+ form_is_indirect = true;
+ form = debug_info_data.getULEB128(&offset);
+ break;
+
+ default:
+ *offset_ptr = Offset;
+ return false;
+ }
+ offset += form_size;
+
+ } while (form_is_indirect);
+ }
+ }
+ *offset_ptr = offset;
+ return true;
+ } else {
+ AbbrevDecl = NULL;
+ return true; // NULL debug tag entry
+ }
+
+ return false;
+}
+
+bool
+DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
+ uint32_t *offset_ptr) {
+ DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+ const uint32_t cu_end_offset = cu->getNextCompileUnitOffset();
+ const uint8_t cu_addr_size = cu->getAddressByteSize();
+ uint32_t offset = *offset_ptr;
+ if ((offset < cu_end_offset) && debug_info_data.isValidOffset(offset)) {
+ Offset = offset;
+
+ uint64_t abbrCode = debug_info_data.getULEB128(&offset);
+
+ if (abbrCode) {
+ AbbrevDecl = cu->getAbbreviations()->getAbbreviationDeclaration(abbrCode);
+
+ if (AbbrevDecl) {
+ uint16_t tag = AbbrevDecl->getTag();
+
+ bool isCompileUnitTag = tag == DW_TAG_compile_unit;
+      if (cu && isCompileUnitTag)
+ const_cast<DWARFCompileUnit*>(cu)->setBaseAddress(0);
+
+ // Skip all data in the .debug_info for the attributes
+ const uint32_t numAttributes = AbbrevDecl->getNumAttributes();
+ for (uint32_t i = 0; i != numAttributes; ++i) {
+ uint16_t attr = AbbrevDecl->getAttrByIndex(i);
+ uint16_t form = AbbrevDecl->getFormByIndex(i);
+
+ if (isCompileUnitTag &&
+ ((attr == DW_AT_entry_pc) || (attr == DW_AT_low_pc))) {
+ DWARFFormValue form_value(form);
+ if (form_value.extractValue(debug_info_data, &offset, cu)) {
+ if (attr == DW_AT_low_pc || attr == DW_AT_entry_pc)
+ const_cast<DWARFCompileUnit*>(cu)
+ ->setBaseAddress(form_value.getUnsigned());
+ }
+ } else {
+ bool form_is_indirect = false;
+ do {
+ form_is_indirect = false;
+            uint32_t form_size = 0;
+            switch (form) {
+            // Blocks of inlined data that have a length field and the data
+            // bytes inlined in the .debug_info
+ case DW_FORM_block:
+ form_size = debug_info_data.getULEB128(&offset);
+ break;
+ case DW_FORM_block1:
+ form_size = debug_info_data.getU8(&offset);
+ break;
+ case DW_FORM_block2:
+ form_size = debug_info_data.getU16(&offset);
+ break;
+ case DW_FORM_block4:
+ form_size = debug_info_data.getU32(&offset);
+ break;
+
+ // Inlined NULL terminated C-strings
+ case DW_FORM_string:
+ debug_info_data.getCStr(&offset);
+ break;
+
+ // Compile unit address sized values
+ case DW_FORM_addr:
+ case DW_FORM_ref_addr:
+ form_size = cu_addr_size;
+ break;
+
+ // 1 byte values
+ case DW_FORM_data1:
+ case DW_FORM_flag:
+ case DW_FORM_ref1:
+ form_size = 1;
+ break;
+
+ // 2 byte values
+ case DW_FORM_data2:
+ case DW_FORM_ref2:
+ form_size = 2;
+ break;
+
+ // 4 byte values
+ case DW_FORM_strp:
+ form_size = 4;
+ break;
+
+ case DW_FORM_data4:
+ case DW_FORM_ref4:
+ form_size = 4;
+ break;
+
+ // 8 byte values
+ case DW_FORM_data8:
+ case DW_FORM_ref8:
+ form_size = 8;
+ break;
+
+ // signed or unsigned LEB 128 values
+ case DW_FORM_sdata:
+ case DW_FORM_udata:
+ case DW_FORM_ref_udata:
+ debug_info_data.getULEB128(&offset);
+ break;
+
+ case DW_FORM_indirect:
+ form = debug_info_data.getULEB128(&offset);
+ form_is_indirect = true;
+ break;
+
+ default:
+ *offset_ptr = offset;
+ return false;
+ }
+
+ offset += form_size;
+ } while (form_is_indirect);
+ }
+ }
+ *offset_ptr = offset;
+ return true;
+ }
+ } else {
+ AbbrevDecl = NULL;
+ *offset_ptr = offset;
+ return true; // NULL debug tag entry
+ }
+ }
+
+ return false;
+}
+
+uint32_t
+DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
+ const uint16_t attr,
+ DWARFFormValue &form_value,
+ uint32_t *end_attr_offset_ptr)
+ const {
+ if (AbbrevDecl) {
+ uint32_t attr_idx = AbbrevDecl->findAttributeIndex(attr);
+
+ if (attr_idx != -1U) {
+ uint32_t offset = getOffset();
+
+ DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+
+ // Skip the abbreviation code so we are at the data for the attributes
+ debug_info_data.getULEB128(&offset);
+
+ uint32_t idx = 0;
+ while (idx < attr_idx)
+ DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(idx++),
+ debug_info_data, &offset, cu);
+
+ const uint32_t attr_offset = offset;
+ form_value = DWARFFormValue(AbbrevDecl->getFormByIndex(idx));
+ if (form_value.extractValue(debug_info_data, &offset, cu)) {
+ if (end_attr_offset_ptr)
+ *end_attr_offset_ptr = offset;
+ return attr_offset;
+ }
+ }
+ }
+
+ return 0;
+}
+
+const char*
+DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ const char* fail_value) const {
+ DWARFFormValue form_value;
+ if (getAttributeValue(cu, attr, form_value)) {
+ DataExtractor stringExtractor(cu->getContext().getStringSection(),
+ false, 0);
+ return form_value.getAsCString(&stringExtractor);
+ }
+ return fail_value;
+}
+
+uint64_t
+DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsigned(
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ uint64_t fail_value) const {
+ DWARFFormValue form_value;
+ if (getAttributeValue(cu, attr, form_value))
+ return form_value.getUnsigned();
+ return fail_value;
+}
+
+int64_t
+DWARFDebugInfoEntryMinimal::getAttributeValueAsSigned(
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ int64_t fail_value) const {
+ DWARFFormValue form_value;
+ if (getAttributeValue(cu, attr, form_value))
+ return form_value.getSigned();
+ return fail_value;
+}
+
+uint64_t
+DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ uint64_t fail_value) const {
+ DWARFFormValue form_value;
+ if (getAttributeValue(cu, attr, form_value))
+ return form_value.getReference(cu);
+ return fail_value;
+}
+
+void
+DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *cu,
+ DWARFDebugAranges *debug_aranges)
+ const {
+ if (AbbrevDecl) {
+ uint16_t tag = AbbrevDecl->getTag();
+ if (tag == DW_TAG_subprogram) {
+ uint64_t hi_pc = -1ULL;
+ uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL);
+ if (lo_pc != -1ULL)
+ hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL);
+ if (hi_pc != -1ULL)
+ debug_aranges->appendRange(cu->getOffset(), lo_pc, hi_pc);
+ }
+
+ const DWARFDebugInfoEntryMinimal *child = getFirstChild();
+ while (child) {
+ child->buildAddressRangeTable(cu, debug_aranges);
+ child = child->getSibling();
+ }
+ }
+}
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
new file mode 100644
index 0000000..aff2e85
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -0,0 +1,135 @@
+//===-- DWARFDebugInfoEntry.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+#define LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+
+#include "DWARFAbbreviationDeclaration.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class DWARFDebugAranges;
+class DWARFCompileUnit;
+class DWARFContext;
+class DWARFFormValue;
+
+/// DWARFDebugInfoEntryMinimal - A DIE with only the minimum required data.
+class DWARFDebugInfoEntryMinimal {
+ /// Offset within the .debug_info of the start of this entry.
+ uint64_t Offset;
+
+ /// How many to subtract from "this" to get the parent.
+ /// If zero this die has no parent.
+ uint32_t ParentIdx;
+
+ /// How many to add to "this" to get the sibling.
+ uint32_t SiblingIdx;
+
+ const DWARFAbbreviationDeclaration *AbbrevDecl;
+public:
+ DWARFDebugInfoEntryMinimal()
+ : Offset(0), ParentIdx(0), SiblingIdx(0), AbbrevDecl(0) {}
+
+ void dump(raw_ostream &OS, const DWARFCompileUnit *cu,
+ unsigned recurseDepth, unsigned indent = 0) const;
+ void dumpAttribute(raw_ostream &OS, const DWARFCompileUnit *cu,
+ uint32_t *offset_ptr, uint16_t attr, uint16_t form,
+ unsigned indent = 0) const;
+
+ bool extractFast(const DWARFCompileUnit *cu, const uint8_t *fixed_form_sizes,
+ uint32_t *offset_ptr);
+
+ /// Extract a debug info entry for a given compile unit from the
+ /// .debug_info and .debug_abbrev data starting at the given offset.
+ bool extract(const DWARFCompileUnit *cu, uint32_t *offset_ptr);
+
+ uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
+ bool isNULL() const { return AbbrevDecl == 0; }
+ uint64_t getOffset() const { return Offset; }
+ uint32_t getNumAttributes() const {
+ return !isNULL() ? AbbrevDecl->getNumAttributes() : 0;
+ }
+ bool hasChildren() const { return !isNULL() && AbbrevDecl->hasChildren(); }
+
+ // We know we are kept in a vector of contiguous entries, so we know
+ // our parent will be some index behind "this".
+ DWARFDebugInfoEntryMinimal *getParent() {
+ return ParentIdx > 0 ? this - ParentIdx : 0;
+ }
+ const DWARFDebugInfoEntryMinimal *getParent() const {
+ return ParentIdx > 0 ? this - ParentIdx : 0;
+ }
+ // We know we are kept in a vector of contiguous entries, so we know
+ // our sibling will be some index after "this".
+ DWARFDebugInfoEntryMinimal *getSibling() {
+ return SiblingIdx > 0 ? this + SiblingIdx : 0;
+ }
+ const DWARFDebugInfoEntryMinimal *getSibling() const {
+ return SiblingIdx > 0 ? this + SiblingIdx : 0;
+ }
+ // We know we are kept in a vector of contiguous entries, so we know
+ // we don't need to store our child pointer, if we have a child it will
+ // be the next entry in the list...
+ DWARFDebugInfoEntryMinimal *getFirstChild() {
+ return hasChildren() ? this + 1 : 0;
+ }
+ const DWARFDebugInfoEntryMinimal *getFirstChild() const {
+ return hasChildren() ? this + 1 : 0;
+ }
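+  // An illustrative flattened layout (tags and indexes assumed for the
+  // example):
+  //   [0] DW_TAG_compile_unit  ParentIdx=0 SiblingIdx=0  (child is [1])
+  //   [1] DW_TAG_base_type     ParentIdx=1 SiblingIdx=1  (sibling is [2])
+  //   [2] DW_TAG_subprogram    ParentIdx=2 SiblingIdx=0
+  //   [3] NULL                 (ends the children of [0])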
+
+ void setParent(DWARFDebugInfoEntryMinimal *parent) {
+ if (parent) {
+ // We know we are kept in a vector of contiguous entries, so we know
+ // our parent will be some index behind "this".
+ ParentIdx = this - parent;
+ } else
+ ParentIdx = 0;
+ }
+ void setSibling(DWARFDebugInfoEntryMinimal *sibling) {
+ if (sibling) {
+ // We know we are kept in a vector of contiguous entries, so we know
+ // our sibling will be some index after "this".
+ SiblingIdx = sibling - this;
+ sibling->setParent(getParent());
+ } else
+ SiblingIdx = 0;
+ }
+
+ const DWARFAbbreviationDeclaration *getAbbreviationDeclarationPtr() const {
+ return AbbrevDecl;
+ }
+
+ uint32_t getAttributeValue(const DWARFCompileUnit *cu,
+ const uint16_t attr, DWARFFormValue &formValue,
+ uint32_t *end_attr_offset_ptr = 0) const;
+
+ const char* getAttributeValueAsString(const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ const char *fail_value) const;
+
+ uint64_t getAttributeValueAsUnsigned(const DWARFCompileUnit *cu,
+ const uint16_t attr,
+ uint64_t fail_value) const;
+
+ uint64_t getAttributeValueAsReference(const DWARFCompileUnit *cu,
+ const uint16_t attr,
+ uint64_t fail_value) const;
+
+ int64_t getAttributeValueAsSigned(const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ int64_t fail_value) const;
+
+ void buildAddressRangeTable(const DWARFCompileUnit *cu,
+ DWARFDebugAranges *debug_aranges) const;
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
new file mode 100644
index 0000000..fe1ef78
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -0,0 +1,475 @@
+//===-- DWARFDebugLine.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugLine.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+using namespace dwarf;
+
+void DWARFDebugLine::Prologue::dump(raw_ostream &OS) const {
+ OS << "Line table prologue:\n"
+ << format(" total_length: 0x%8.8x\n", TotalLength)
+ << format(" version: %u\n", Version)
+ << format("prologue_length: 0x%8.8x\n", PrologueLength)
+ << format("min_inst_length: %u\n", MinInstLength)
+ << format("default_is_stmt: %u\n", DefaultIsStmt)
+ << format(" line_base: %i\n", LineBase)
+ << format(" line_range: %u\n", LineRange)
+ << format(" opcode_base: %u\n", OpcodeBase);
+
+ for (uint32_t i = 0; i < StandardOpcodeLengths.size(); ++i)
+ OS << format("standard_opcode_lengths[%s] = %u\n", LNStandardString(i+1),
+ StandardOpcodeLengths[i]);
+
+ if (!IncludeDirectories.empty())
+ for (uint32_t i = 0; i < IncludeDirectories.size(); ++i)
+ OS << format("include_directories[%3u] = '", i+1)
+ << IncludeDirectories[i] << "'\n";
+
+ if (!FileNames.empty()) {
+ OS << " Dir Mod Time File Len File Name\n"
+ << " ---- ---------- ---------- -----------"
+ "----------------\n";
+ for (uint32_t i = 0; i < FileNames.size(); ++i) {
+ const FileNameEntry& fileEntry = FileNames[i];
+ OS << format("file_names[%3u] %4u ", i+1, fileEntry.DirIdx)
+ << format("0x%8.8x 0x%8.8x ", fileEntry.ModTime, fileEntry.Length)
+ << fileEntry.Name << '\n';
+ }
+ }
+}
+
+void DWARFDebugLine::Row::postAppend() {
+ BasicBlock = false;
+ PrologueEnd = false;
+ EpilogueBegin = false;
+}
+
+void DWARFDebugLine::Row::reset(bool default_is_stmt) {
+ Address = 0;
+ Line = 1;
+ Column = 0;
+ File = 1;
+ Isa = 0;
+ IsStmt = default_is_stmt;
+ BasicBlock = false;
+ EndSequence = false;
+ PrologueEnd = false;
+ EpilogueBegin = false;
+}
+
+void DWARFDebugLine::Row::dump(raw_ostream &OS) const {
+ OS << format("0x%16.16llx %6u %6u", Address, Line, Column)
+ << format(" %6u %3u ", File, Isa)
+ << (IsStmt ? " is_stmt" : "")
+ << (BasicBlock ? " basic_block" : "")
+ << (PrologueEnd ? " prologue_end" : "")
+ << (EpilogueBegin ? " epilogue_begin" : "")
+ << (EndSequence ? " end_sequence" : "")
+ << '\n';
+}
+
+void DWARFDebugLine::LineTable::dump(raw_ostream &OS) const {
+ Prologue.dump(OS);
+ OS << '\n';
+
+ if (!Rows.empty()) {
+ OS << "Address Line Column File ISA Flags\n"
+ << "------------------ ------ ------ ------ --- -------------\n";
+ for (std::vector<Row>::const_iterator pos = Rows.begin(),
+ end = Rows.end(); pos != end; ++pos)
+ pos->dump(OS);
+ }
+}
+
+DWARFDebugLine::State::~State() {}
+
+void DWARFDebugLine::State::appendRowToMatrix(uint32_t offset) {
+ ++row; // Increase the row number.
+ LineTable::appendRow(*this);
+ Row::postAppend();
+}
+
+DWARFDebugLine::DumpingState::~DumpingState() {}
+
+void DWARFDebugLine::DumpingState::finalize(uint32_t offset) {
+ LineTable::dump(OS);
+}
+
+const DWARFDebugLine::LineTable *
+DWARFDebugLine::getLineTable(uint32_t offset) const {
+ LineTableConstIter pos = LineTableMap.find(offset);
+ if (pos != LineTableMap.end())
+ return &pos->second;
+ return 0;
+}
+
+const DWARFDebugLine::LineTable *
+DWARFDebugLine::getOrParseLineTable(DataExtractor debug_line_data,
+ uint32_t offset) {
+ std::pair<LineTableIter, bool> pos =
+ LineTableMap.insert(LineTableMapTy::value_type(offset, LineTable()));
+ if (pos.second) {
+    // Parse and cache the line table at this offset.
+ State state;
+ if (!parseStatementTable(debug_line_data, &offset, state))
+ return 0;
+ pos.first->second = state;
+ }
+ return &pos.first->second;
+}
+
+bool
+DWARFDebugLine::parsePrologue(DataExtractor debug_line_data,
+ uint32_t *offset_ptr, Prologue *prologue) {
+ const uint32_t prologue_offset = *offset_ptr;
+
+ prologue->clear();
+ prologue->TotalLength = debug_line_data.getU32(offset_ptr);
+ prologue->Version = debug_line_data.getU16(offset_ptr);
+ if (prologue->Version != 2)
+ return false;
+
+ prologue->PrologueLength = debug_line_data.getU32(offset_ptr);
+ const uint32_t end_prologue_offset = prologue->PrologueLength + *offset_ptr;
+ prologue->MinInstLength = debug_line_data.getU8(offset_ptr);
+ prologue->DefaultIsStmt = debug_line_data.getU8(offset_ptr);
+ prologue->LineBase = debug_line_data.getU8(offset_ptr);
+ prologue->LineRange = debug_line_data.getU8(offset_ptr);
+ prologue->OpcodeBase = debug_line_data.getU8(offset_ptr);
+
+ prologue->StandardOpcodeLengths.reserve(prologue->OpcodeBase-1);
+ for (uint32_t i = 1; i < prologue->OpcodeBase; ++i) {
+ uint8_t op_len = debug_line_data.getU8(offset_ptr);
+ prologue->StandardOpcodeLengths.push_back(op_len);
+ }
+
+ while (*offset_ptr < end_prologue_offset) {
+ const char *s = debug_line_data.getCStr(offset_ptr);
+ if (s && s[0])
+ prologue->IncludeDirectories.push_back(s);
+ else
+ break;
+ }
+
+ while (*offset_ptr < end_prologue_offset) {
+ const char *name = debug_line_data.getCStr(offset_ptr);
+ if (name && name[0]) {
+ FileNameEntry fileEntry;
+ fileEntry.Name = name;
+ fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
+ fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
+ fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
+ prologue->FileNames.push_back(fileEntry);
+ } else {
+ break;
+ }
+ }
+
+ if (*offset_ptr != end_prologue_offset) {
+    fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
+            " have ended at 0x%8.8x but it ended at 0x%8.8x\n",
+ prologue_offset, end_prologue_offset, *offset_ptr);
+ }
+  return true;
+}
+
+bool
+DWARFDebugLine::parseStatementTable(DataExtractor debug_line_data,
+ uint32_t *offset_ptr, State &state) {
+ const uint32_t debug_line_offset = *offset_ptr;
+
+ Prologue *prologue = &state.Prologue;
+
+ if (!parsePrologue(debug_line_data, offset_ptr, prologue)) {
+ // Restore our offset and return false to indicate failure!
+ *offset_ptr = debug_line_offset;
+ return false;
+ }
+
+ const uint32_t end_offset = debug_line_offset + prologue->TotalLength +
+ sizeof(prologue->TotalLength);
+
+ state.reset();
+
+ while (*offset_ptr < end_offset) {
+ uint8_t opcode = debug_line_data.getU8(offset_ptr);
+
+ if (opcode == 0) {
+ // Extended Opcodes always start with a zero opcode followed by
+ // a uleb128 length so you can skip ones you don't know about
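+      // For example (bytes assumed for illustration), DW_LNE_set_address
+      // on a target with 8-byte addresses is encoded as:
+      //   0x00              extended opcode marker
+      //   0x09              ULEB128 length (sub-opcode plus 8 operand bytes)
+      //   0x02              DW_LNE_set_address
+      //   <8 address bytes>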
+      uint64_t len = debug_line_data.getULEB128(offset_ptr);
+      uint32_t arg_size = len - 1; // len includes the sub-opcode byte.
+
+ uint8_t sub_opcode = debug_line_data.getU8(offset_ptr);
+ switch (sub_opcode) {
+ case DW_LNE_end_sequence:
+ // Set the end_sequence register of the state machine to true and
+ // append a row to the matrix using the current values of the
+ // state-machine registers. Then reset the registers to the initial
+ // values specified above. Every statement program sequence must end
+ // with a DW_LNE_end_sequence instruction which creates a row whose
+ // address is that of the byte after the last target machine instruction
+ // of the sequence.
+ state.EndSequence = true;
+ state.appendRowToMatrix(*offset_ptr);
+ state.reset();
+ break;
+
+ case DW_LNE_set_address:
+ // Takes a single relocatable address as an operand. The size of the
+ // operand is the size appropriate to hold an address on the target
+ // machine. Set the address register to the value given by the
+ // relocatable address. All of the other statement program opcodes
+ // that affect the address register add a delta to it. This instruction
+ // stores a relocatable value into it instead.
+ state.Address = debug_line_data.getAddress(offset_ptr);
+ break;
+
+ case DW_LNE_define_file:
+ // Takes 4 arguments. The first is a null terminated string containing
+ // a source file name. The second is an unsigned LEB128 number
+ // representing the directory index of the directory in which the file
+ // was found. The third is an unsigned LEB128 number representing the
+ // time of last modification of the file. The fourth is an unsigned
+ // LEB128 number representing the length in bytes of the file. The time
+ // and length fields may contain LEB128(0) if the information is not
+ // available.
+ //
+ // The directory index represents an entry in the include_directories
+ // section of the statement program prologue. The index is LEB128(0)
+ // if the file was found in the current directory of the compilation,
+ // LEB128(1) if it was found in the first directory in the
+ // include_directories section, and so on. The directory index is
+ // ignored for file names that represent full path names.
+ //
+ // The files are numbered, starting at 1, in the order in which they
+ // appear; the names in the prologue come before names defined by
+      // the DW_LNE_define_file instruction. These numbers are used in
+ // the file register of the state machine.
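+      // Encoding sketch (operand bytes assumed): 0x00 <uleb len> 0x03
+      // "a.c\0" <uleb dir index> <uleb mod time> <uleb file length>,
+      // where 0x03 is DW_LNE_define_file.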
+ {
+ FileNameEntry fileEntry;
+ fileEntry.Name = debug_line_data.getCStr(offset_ptr);
+ fileEntry.DirIdx = debug_line_data.getULEB128(offset_ptr);
+ fileEntry.ModTime = debug_line_data.getULEB128(offset_ptr);
+ fileEntry.Length = debug_line_data.getULEB128(offset_ptr);
+ prologue->FileNames.push_back(fileEntry);
+ }
+ break;
+
+ default:
+      // The length doesn't include the zero opcode byte or the length
+      // field itself, but it does include the sub_opcode byte, which
+      // arg_size has already excluded.
+ (*offset_ptr) += arg_size;
+ break;
+ }
+ } else if (opcode < prologue->OpcodeBase) {
+ switch (opcode) {
+ // Standard Opcodes
+ case DW_LNS_copy:
+ // Takes no arguments. Append a row to the matrix using the
+ // current values of the state-machine registers. Then set
+ // the basic_block register to false.
+ state.appendRowToMatrix(*offset_ptr);
+ break;
+
+ case DW_LNS_advance_pc:
+ // Takes a single unsigned LEB128 operand, multiplies it by the
+ // min_inst_length field of the prologue, and adds the
+ // result to the address register of the state machine.
+ state.Address += debug_line_data.getULEB128(offset_ptr) *
+ prologue->MinInstLength;
+ break;
+
+ case DW_LNS_advance_line:
+ // Takes a single signed LEB128 operand and adds that value to
+ // the line register of the state machine.
+ state.Line += debug_line_data.getSLEB128(offset_ptr);
+ break;
+
+ case DW_LNS_set_file:
+ // Takes a single unsigned LEB128 operand and stores it in the file
+ // register of the state machine.
+ state.File = debug_line_data.getULEB128(offset_ptr);
+ break;
+
+ case DW_LNS_set_column:
+ // Takes a single unsigned LEB128 operand and stores it in the
+ // column register of the state machine.
+ state.Column = debug_line_data.getULEB128(offset_ptr);
+ break;
+
+ case DW_LNS_negate_stmt:
+ // Takes no arguments. Set the is_stmt register of the state
+ // machine to the logical negation of its current value.
+ state.IsStmt = !state.IsStmt;
+ break;
+
+ case DW_LNS_set_basic_block:
+ // Takes no arguments. Set the basic_block register of the
+ // state machine to true
+ state.BasicBlock = true;
+ break;
+
+ case DW_LNS_const_add_pc:
+ // Takes no arguments. Add to the address register of the state
+ // machine the address increment value corresponding to special
+ // opcode 255. The motivation for DW_LNS_const_add_pc is this:
+ // when the statement program needs to advance the address by a
+ // small amount, it can use a single special opcode, which occupies
+ // a single byte. When it needs to advance the address by up to
+ // twice the range of the last special opcode, it can use
+ // DW_LNS_const_add_pc followed by a special opcode, for a total
+ // of two bytes. Only if it needs to advance the address by more
+ // than twice that range will it need to use both DW_LNS_advance_pc
+ // and a special opcode, requiring three or more bytes.
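+      // Worked example, assuming opcode_base = 13, line_range = 14 and
+      // min_inst_length = 1: adjust_opcode = 255 - 13 = 242, so the
+      // address register advances by (242 / 14) * 1 = 17 bytes and the
+      // line register is unchanged.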
+ {
+ uint8_t adjust_opcode = 255 - prologue->OpcodeBase;
+ uint64_t addr_offset = (adjust_opcode / prologue->LineRange) *
+ prologue->MinInstLength;
+ state.Address += addr_offset;
+ }
+ break;
+
+ case DW_LNS_fixed_advance_pc:
+ // Takes a single uhalf operand. Add to the address register of
+ // the state machine the value of the (unencoded) operand. This
+      // is the only standard opcode that takes an argument that is not
+ // a variable length number. The motivation for DW_LNS_fixed_advance_pc
+ // is this: existing assemblers cannot emit DW_LNS_advance_pc or
+ // special opcodes because they cannot encode LEB128 numbers or
+ // judge when the computation of a special opcode overflows and
+ // requires the use of DW_LNS_advance_pc. Such assemblers, however,
+ // can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
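+      // Encoding sketch (operand bytes assumed): 0x09 0x34 0x12 advances
+      // the address register by 0x1234 on a little-endian target.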
+ state.Address += debug_line_data.getU16(offset_ptr);
+ break;
+
+ case DW_LNS_set_prologue_end:
+ // Takes no arguments. Set the prologue_end register of the
+ // state machine to true
+ state.PrologueEnd = true;
+ break;
+
+ case DW_LNS_set_epilogue_begin:
+      // Takes no arguments. Set the epilogue_begin register of the
+ // state machine to true
+ state.EpilogueBegin = true;
+ break;
+
+ case DW_LNS_set_isa:
+ // Takes a single unsigned LEB128 operand and stores it in the
+      // isa register of the state machine.
+ state.Isa = debug_line_data.getULEB128(offset_ptr);
+ break;
+
+ default:
+ // Handle any unknown standard opcodes here. We know the lengths
+ // of such opcodes because they are specified in the prologue
+ // as a multiple of LEB128 operands for each opcode.
+ {
+ assert(opcode - 1U < prologue->StandardOpcodeLengths.size());
+ uint8_t opcode_length = prologue->StandardOpcodeLengths[opcode - 1];
+ for (uint8_t i=0; i<opcode_length; ++i)
+ debug_line_data.getULEB128(offset_ptr);
+ }
+ break;
+ }
+ } else {
+ // Special Opcodes
+
+ // A special opcode value is chosen based on the amount that needs
+ // to be added to the line and address registers. The maximum line
+ // increment for a special opcode is the value of the line_base
+ // field in the header, plus the value of the line_range field,
+ // minus 1 (line base + line range - 1). If the desired line
+ // increment is greater than the maximum line increment, a standard
+ // opcode must be used instead of a special opcode. The "address
+ // advance" is calculated by dividing the desired address increment
+ // by the minimum_instruction_length field from the header. The
+ // special opcode is then calculated using the following formula:
+ //
+ // opcode = (desired line increment - line_base) +
+ // (line_range * address advance) + opcode_base
+ //
+ // If the resulting opcode is greater than 255, a standard opcode
+ // must be used instead.
+ //
+ // To decode a special opcode, subtract the opcode_base from the
+ // opcode itself to give the adjusted opcode. The amount to
+ // increment the address register is the result of the adjusted
+ // opcode divided by the line_range multiplied by the
+ // minimum_instruction_length field from the header. That is:
+ //
+ // address increment = (adjusted opcode / line_range) *
+ // minimum_instruction_length
+ //
+ // The amount to increment the line register is the line_base plus
+ // the result of the adjusted opcode modulo the line_range. That is:
+ //
+ // line increment = line_base + (adjusted opcode % line_range)
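+      // Worked example, assuming opcode_base = 13, line_base = -5,
+      // line_range = 14 and min_inst_length = 1: special opcode 20 gives
+      // adjusted opcode 20 - 13 = 7, so the address advances by
+      // (7 / 14) * 1 = 0 bytes and the line advances by -5 + (7 % 14) = 2.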
+
+ uint8_t adjust_opcode = opcode - prologue->OpcodeBase;
+ uint64_t addr_offset = (adjust_opcode / prologue->LineRange) *
+ prologue->MinInstLength;
+ int32_t line_offset = prologue->LineBase +
+ (adjust_opcode % prologue->LineRange);
+ state.Line += line_offset;
+ state.Address += addr_offset;
+ state.appendRowToMatrix(*offset_ptr);
+ }
+ }
+
+ state.finalize(*offset_ptr);
+
+  return true;
+}
+
+static bool findMatchingAddress(const DWARFDebugLine::Row& row1,
+ const DWARFDebugLine::Row& row2) {
+ return row1.Address < row2.Address;
+}
+
+uint32_t
+DWARFDebugLine::LineTable::lookupAddress(uint64_t address,
+ uint64_t cu_high_pc) const {
+ uint32_t index = UINT32_MAX;
+ if (!Rows.empty()) {
+ // Use the lower_bound algorithm to perform a binary search since we know
+ // that our line table data is ordered by address.
+ DWARFDebugLine::Row row;
+ row.Address = address;
+ typedef std::vector<Row>::const_iterator iterator;
+ iterator begin_pos = Rows.begin();
+ iterator end_pos = Rows.end();
+ iterator pos = std::lower_bound(begin_pos, end_pos, row,
+ findMatchingAddress);
+ if (pos == end_pos) {
+ if (address < cu_high_pc)
+ return Rows.size()-1;
+ } else {
+      // Rely on the fact that we are using a std::vector and can do
+      // pointer arithmetic to find the row index; it will be one less
+      // than what lower_bound found, since lower_bound returns the first
+      // position after the current address, and std::vector iterators
+      // are just pointers into the container.
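+      // For example (row addresses assumed): with rows at 0x100, 0x104 and
+      // 0x108, looking up 0x106 leaves lower_bound at 0x108 (index 2);
+      // since 0x108 > 0x106 the index is decremented to 1, the row at
+      // 0x104 that covers the queried address.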
+ index = pos - begin_pos;
+ if (pos->Address > address) {
+ if (index > 0)
+ --index;
+ else
+ index = UINT32_MAX;
+ }
+ }
+ }
+  return index; // UINT32_MAX if the address was not found.
+}
diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h
new file mode 100644
index 0000000..bc6a70b
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugLine.h
@@ -0,0 +1,190 @@
+//===-- DWARFDebugLine.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGLINE_H
+#define LLVM_DEBUGINFO_DWARFDEBUGLINE_H
+
+#include "llvm/Support/DataExtractor.h"
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugLine {
+public:
+ struct FileNameEntry {
+ FileNameEntry() : DirIdx(0), ModTime(0), Length(0) {}
+
+ std::string Name;
+ uint64_t DirIdx;
+ uint64_t ModTime;
+ uint64_t Length;
+ };
+
+ struct Prologue {
+ Prologue()
+ : TotalLength(0), Version(0), PrologueLength(0), MinInstLength(0),
+ DefaultIsStmt(0), LineBase(0), LineRange(0), OpcodeBase(0) {}
+
+ // The size in bytes of the statement information for this compilation unit
+ // (not including the total_length field itself).
+ uint32_t TotalLength;
+ // Version identifier for the statement information format.
+ uint16_t Version;
+ // The number of bytes following the prologue_length field to the beginning
+ // of the first byte of the statement program itself.
+ uint32_t PrologueLength;
+ // The size in bytes of the smallest target machine instruction. Statement
+ // program opcodes that alter the address register first multiply their
+ // operands by this value.
+ uint8_t MinInstLength;
+    // The initial value of the is_stmt register.
+ uint8_t DefaultIsStmt;
+ // This parameter affects the meaning of the special opcodes. See below.
+ int8_t LineBase;
+ // This parameter affects the meaning of the special opcodes. See below.
+ uint8_t LineRange;
+ // The number assigned to the first special opcode.
+ uint8_t OpcodeBase;
+ std::vector<uint8_t> StandardOpcodeLengths;
+ std::vector<std::string> IncludeDirectories;
+ std::vector<FileNameEntry> FileNames;
+
+ // Length of the prologue in bytes.
+ uint32_t getLength() const {
+ return PrologueLength + sizeof(TotalLength) + sizeof(Version) +
+ sizeof(PrologueLength);
+ }
+ // Length of the line table data in bytes (not including the prologue).
+ uint32_t getStatementTableLength() const {
+ return TotalLength + sizeof(TotalLength) - getLength();
+ }
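+    // For example (field values assumed): with TotalLength = 0x40 and
+    // PrologueLength = 0x20, getLength() is 0x20 + 4 + 2 + 4 = 0x2a and
+    // getStatementTableLength() is 0x40 + 4 - 0x2a = 0x1a.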
+ int32_t getMaxLineIncrementForSpecialOpcode() const {
+ return LineBase + (int8_t)LineRange - 1;
+ }
+ void dump(raw_ostream &OS) const;
+ void clear() {
+ TotalLength = Version = PrologueLength = 0;
+ MinInstLength = LineBase = LineRange = OpcodeBase = 0;
+ StandardOpcodeLengths.clear();
+ IncludeDirectories.clear();
+ FileNames.clear();
+ }
+ };
+
+ // Standard .debug_line state machine structure.
+ struct Row {
+ Row(bool default_is_stmt = false) { reset(default_is_stmt); }
+ /// Called after a row is appended to the matrix.
+ void postAppend();
+ void reset(bool default_is_stmt);
+ void dump(raw_ostream &OS) const;
+
+ // The program-counter value corresponding to a machine instruction
+ // generated by the compiler.
+ uint64_t Address;
+ // An unsigned integer indicating a source line number. Lines are numbered
+ // beginning at 1. The compiler may emit the value 0 in cases where an
+ // instruction cannot be attributed to any source line.
+ uint32_t Line;
+ // An unsigned integer indicating a column number within a source line.
+ // Columns are numbered beginning at 1. The value 0 is reserved to indicate
+ // that a statement begins at the 'left edge' of the line.
+ uint16_t Column;
+ // An unsigned integer indicating the identity of the source file
+ // corresponding to a machine instruction.
+ uint16_t File;
+ // An unsigned integer whose value encodes the applicable instruction set
+ // architecture for the current instruction.
+ uint8_t Isa;
+ // A boolean indicating that the current instruction is the beginning of a
+ // statement.
+ uint8_t IsStmt:1,
+ // A boolean indicating that the current instruction is the
+ // beginning of a basic block.
+ BasicBlock:1,
+ // A boolean indicating that the current address is that of the
+ // first byte after the end of a sequence of target machine
+ // instructions.
+ EndSequence:1,
+ // A boolean indicating that the current address is one (of possibly
+ // many) where execution should be suspended for an entry breakpoint
+ // of a function.
+ PrologueEnd:1,
+ // A boolean indicating that the current address is one (of possibly
+ // many) where execution should be suspended for an exit breakpoint
+ // of a function.
+ EpilogueBegin:1;
+ };
+
+ struct LineTable {
+ void appendRow(const DWARFDebugLine::Row &state) { Rows.push_back(state); }
+ void clear() {
+ Prologue.clear();
+ Rows.clear();
+ }
+
+ uint32_t lookupAddress(uint64_t address, uint64_t cu_high_pc) const;
+ void dump(raw_ostream &OS) const;
+
+ struct Prologue Prologue;
+ std::vector<Row> Rows;
+ };
+
+ struct State : public Row, public LineTable {
+ // Special row codes.
+ enum {
+ StartParsingLineTable = 0,
+ DoneParsingLineTable = -1
+ };
+
+ State() : row(StartParsingLineTable) {}
+ virtual ~State();
+
+ virtual void appendRowToMatrix(uint32_t offset);
+ virtual void finalize(uint32_t offset) { row = DoneParsingLineTable; }
+ virtual void reset() { Row::reset(Prologue.DefaultIsStmt); }
+
+ // The row number that starts at zero for the prologue, and increases for
+ // each row added to the matrix.
+ unsigned row;
+ };
+
+ struct DumpingState : public State {
+ DumpingState(raw_ostream &OS) : OS(OS) {}
+ virtual ~DumpingState();
+ virtual void finalize(uint32_t offset);
+ private:
+ raw_ostream &OS;
+ };
+
+ static bool parsePrologue(DataExtractor debug_line_data, uint32_t *offset_ptr,
+ Prologue *prologue);
+ /// Parse a single line table (prologue and all rows).
+ static bool parseStatementTable(DataExtractor debug_line_data,
+ uint32_t *offset_ptr, State &state);
+
+ const LineTable *getLineTable(uint32_t offset) const;
+ const LineTable *getOrParseLineTable(DataExtractor debug_line_data,
+ uint32_t offset);
+
+private:
+ typedef std::map<uint32_t, LineTable> LineTableMapTy;
+ typedef LineTableMapTy::iterator LineTableIter;
+ typedef LineTableMapTy::const_iterator LineTableConstIter;
+
+ LineTableMapTy LineTableMap;
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
new file mode 100644
index 0000000..705efe5
--- /dev/null
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -0,0 +1,427 @@
+//===-- DWARFFormValue.cpp ------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFFormValue.h"
+#include "DWARFCompileUnit.h"
+#include "DWARFContext.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+using namespace llvm;
+using namespace dwarf;
+
+static const uint8_t form_sizes_addr4[] = {
+ 0, // 0x00 unused
+ 4, // 0x01 DW_FORM_addr
+ 0, // 0x02 unused
+ 0, // 0x03 DW_FORM_block2
+ 0, // 0x04 DW_FORM_block4
+ 2, // 0x05 DW_FORM_data2
+ 4, // 0x06 DW_FORM_data4
+ 8, // 0x07 DW_FORM_data8
+ 0, // 0x08 DW_FORM_string
+ 0, // 0x09 DW_FORM_block
+ 0, // 0x0a DW_FORM_block1
+ 1, // 0x0b DW_FORM_data1
+ 1, // 0x0c DW_FORM_flag
+ 0, // 0x0d DW_FORM_sdata
+ 4, // 0x0e DW_FORM_strp
+ 0, // 0x0f DW_FORM_udata
+ 4, // 0x10 DW_FORM_ref_addr
+ 1, // 0x11 DW_FORM_ref1
+ 2, // 0x12 DW_FORM_ref2
+ 4, // 0x13 DW_FORM_ref4
+ 8, // 0x14 DW_FORM_ref8
+ 0, // 0x15 DW_FORM_ref_udata
+ 0, // 0x16 DW_FORM_indirect
+};
+
+static const uint8_t form_sizes_addr8[] = {
+ 0, // 0x00 unused
+ 8, // 0x01 DW_FORM_addr
+ 0, // 0x02 unused
+ 0, // 0x03 DW_FORM_block2
+ 0, // 0x04 DW_FORM_block4
+ 2, // 0x05 DW_FORM_data2
+ 4, // 0x06 DW_FORM_data4
+ 8, // 0x07 DW_FORM_data8
+ 0, // 0x08 DW_FORM_string
+ 0, // 0x09 DW_FORM_block
+ 0, // 0x0a DW_FORM_block1
+ 1, // 0x0b DW_FORM_data1
+ 1, // 0x0c DW_FORM_flag
+ 0, // 0x0d DW_FORM_sdata
+ 4, // 0x0e DW_FORM_strp
+ 0, // 0x0f DW_FORM_udata
+ 8, // 0x10 DW_FORM_ref_addr
+ 1, // 0x11 DW_FORM_ref1
+ 2, // 0x12 DW_FORM_ref2
+ 4, // 0x13 DW_FORM_ref4
+ 8, // 0x14 DW_FORM_ref8
+ 0, // 0x15 DW_FORM_ref_udata
+ 0, // 0x16 DW_FORM_indirect
+};
+
+const uint8_t *
+DWARFFormValue::getFixedFormSizesForAddressSize(uint8_t addr_size) {
+ switch (addr_size) {
+ case 4: return form_sizes_addr4;
+ case 8: return form_sizes_addr8;
+ }
+ return NULL;
+}
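+// Usage sketch (the address size and form are illustrative):
+//   const uint8_t *sizes = DWARFFormValue::getFixedFormSizesForAddressSize(8);
+//   uint8_t strp_size = sizes ? sizes[DW_FORM_strp] : 0; // 4 in both tables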
+
+bool
+DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
+ const DWARFCompileUnit *cu) {
+ bool indirect = false;
+ bool is_block = false;
+ Value.data = NULL;
+  // Read the value for the form into value and follow any DW_FORM_indirect
+  // instances we run into.
+ do {
+ indirect = false;
+ switch (Form) {
+ case DW_FORM_addr:
+ case DW_FORM_ref_addr:
+ Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize());
+ break;
+ case DW_FORM_block:
+ Value.uval = data.getULEB128(offset_ptr);
+ is_block = true;
+ break;
+ case DW_FORM_block1:
+ Value.uval = data.getU8(offset_ptr);
+ is_block = true;
+ break;
+ case DW_FORM_block2:
+ Value.uval = data.getU16(offset_ptr);
+ is_block = true;
+ break;
+ case DW_FORM_block4:
+ Value.uval = data.getU32(offset_ptr);
+ is_block = true;
+ break;
+ case DW_FORM_data1:
+ case DW_FORM_ref1:
+ case DW_FORM_flag:
+ Value.uval = data.getU8(offset_ptr);
+ break;
+ case DW_FORM_data2:
+ case DW_FORM_ref2:
+ Value.uval = data.getU16(offset_ptr);
+ break;
+ case DW_FORM_data4:
+ case DW_FORM_ref4:
+ Value.uval = data.getU32(offset_ptr);
+ break;
+ case DW_FORM_data8:
+ case DW_FORM_ref8:
+ Value.uval = data.getU64(offset_ptr);
+ break;
+ case DW_FORM_sdata:
+ Value.sval = data.getSLEB128(offset_ptr);
+ break;
+ case DW_FORM_strp:
+ Value.uval = data.getU32(offset_ptr);
+ break;
+ case DW_FORM_udata:
+ case DW_FORM_ref_udata:
+ Value.uval = data.getULEB128(offset_ptr);
+ break;
+ case DW_FORM_string:
+ Value.cstr = data.getCStr(offset_ptr);
+ // Set the string value to also be the data for inlined cstr form
+      // values only so we can tell the difference between DW_FORM_string
+ // and DW_FORM_strp form values
+ Value.data = (uint8_t*)Value.cstr;
+ break;
+ case DW_FORM_indirect:
+ Form = data.getULEB128(offset_ptr);
+ indirect = true;
+ break;
+ default:
+ return false;
+ }
+ } while (indirect);
+
+ if (is_block) {
+ StringRef str = data.getData().substr(*offset_ptr, Value.uval);
+ Value.data = NULL;
+ if (!str.empty()) {
+ Value.data = reinterpret_cast<const uint8_t *>(str.data());
+ *offset_ptr += Value.uval;
+ }
+ }
+
+ return true;
+}
+
+bool
+DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr,
+ const DWARFCompileUnit *cu) const {
+ return DWARFFormValue::skipValue(Form, debug_info_data, offset_ptr, cu);
+}
+
+bool
+DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
+ uint32_t *offset_ptr, const DWARFCompileUnit *cu) {
+ bool indirect = false;
+ do {
+ indirect = false;
+ switch (form) {
+    // Blocks of inlined data that have a length field and the data bytes
+    // inlined in the .debug_info
+ case DW_FORM_block: {
+ uint64_t size = debug_info_data.getULEB128(offset_ptr);
+ *offset_ptr += size;
+ return true;
+ }
+ case DW_FORM_block1: {
+ uint8_t size = debug_info_data.getU8(offset_ptr);
+ *offset_ptr += size;
+ return true;
+ }
+ case DW_FORM_block2: {
+ uint16_t size = debug_info_data.getU16(offset_ptr);
+ *offset_ptr += size;
+ return true;
+ }
+ case DW_FORM_block4: {
+ uint32_t size = debug_info_data.getU32(offset_ptr);
+ *offset_ptr += size;
+ return true;
+ }
+
+ // Inlined NULL terminated C-strings
+ case DW_FORM_string:
+ debug_info_data.getCStr(offset_ptr);
+ return true;
+
+ // Compile unit address sized values
+ case DW_FORM_addr:
+ case DW_FORM_ref_addr:
+ *offset_ptr += cu->getAddressByteSize();
+ return true;
+
+ // 1 byte values
+ case DW_FORM_data1:
+ case DW_FORM_flag:
+ case DW_FORM_ref1:
+ *offset_ptr += 1;
+ return true;
+
+ // 2 byte values
+ case DW_FORM_data2:
+ case DW_FORM_ref2:
+ *offset_ptr += 2;
+ return true;
+
+ // 4 byte values
+ case DW_FORM_strp:
+ case DW_FORM_data4:
+ case DW_FORM_ref4:
+ *offset_ptr += 4;
+ return true;
+
+ // 8 byte values
+ case DW_FORM_data8:
+ case DW_FORM_ref8:
+ *offset_ptr += 8;
+ return true;
+
+ // signed or unsigned LEB 128 values
+ // case DW_FORM_APPLE_db_str:
+ case DW_FORM_sdata:
+ case DW_FORM_udata:
+ case DW_FORM_ref_udata:
+ debug_info_data.getULEB128(offset_ptr);
+ return true;
+
+ case DW_FORM_indirect:
+ indirect = true;
+ form = debug_info_data.getULEB128(offset_ptr);
+ break;
+ default:
+ return false;
+ }
+ } while (indirect);
+ return true;
+}
+
+void
+DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
+ DataExtractor debug_str_data(cu->getContext().getStringSection(), true, 0);
+ uint64_t uvalue = getUnsigned();
+ bool cu_relative_offset = false;
+
+ switch (Form) {
+  case DW_FORM_addr: OS << format("0x%016llx", (unsigned long long)uvalue); break;
+ case DW_FORM_flag:
+ case DW_FORM_data1: OS << format("0x%02x", uvalue); break;
+ case DW_FORM_data2: OS << format("0x%04x", uvalue); break;
+ case DW_FORM_data4: OS << format("0x%08x", uvalue); break;
+  case DW_FORM_data8: OS << format("0x%016llx", (unsigned long long)uvalue); break;
+ case DW_FORM_string:
+ OS << '"';
+ OS.write_escaped(getAsCString(NULL));
+ OS << '"';
+ break;
+ case DW_FORM_block:
+ case DW_FORM_block1:
+ case DW_FORM_block2:
+ case DW_FORM_block4:
+ if (uvalue > 0) {
+ switch (Form) {
+ case DW_FORM_block: OS << format("<0x%llx> ", uvalue); break;
+ case DW_FORM_block1: OS << format("<0x%2.2x> ", (uint8_t)uvalue); break;
+ case DW_FORM_block2: OS << format("<0x%4.4x> ", (uint16_t)uvalue); break;
+ case DW_FORM_block4: OS << format("<0x%8.8x> ", (uint32_t)uvalue); break;
+ default: break;
+ }
+
+ const uint8_t* data_ptr = Value.data;
+ if (data_ptr) {
+ // uvalue contains size of block
+ const uint8_t* end_data_ptr = data_ptr + uvalue;
+ while (data_ptr < end_data_ptr) {
+ OS << format("%2.2x ", *data_ptr);
+ ++data_ptr;
+ }
+ }
+ else
+ OS << "NULL";
+ }
+ break;
+
+ case DW_FORM_sdata: OS << getSigned(); break;
+ case DW_FORM_udata: OS << getUnsigned(); break;
+ case DW_FORM_strp: {
+ OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
+ const char* dbg_str = getAsCString(&debug_str_data);
+ if (dbg_str) {
+ OS << '"';
+ OS.write_escaped(dbg_str);
+ OS << '"';
+ }
+ break;
+ }
+ case DW_FORM_ref_addr:
+ OS << format("0x%016x", uvalue);
+ break;
+ case DW_FORM_ref1:
+ cu_relative_offset = true;
+ OS << format("cu + 0x%2.2x", (uint8_t)uvalue);
+ break;
+ case DW_FORM_ref2:
+ cu_relative_offset = true;
+ OS << format("cu + 0x%4.4x", (uint16_t)uvalue);
+ break;
+ case DW_FORM_ref4:
+ cu_relative_offset = true;
+ OS << format("cu + 0x%4.4x", (uint32_t)uvalue);
+ break;
+ case DW_FORM_ref8:
+ cu_relative_offset = true;
+ OS << format("cu + 0x%8.8llx", uvalue);
+ break;
+ case DW_FORM_ref_udata:
+ cu_relative_offset = true;
+ OS << format("cu + 0x%llx", uvalue);
+ break;
+
+ // All DW_FORM_indirect attributes should be resolved prior to calling
+ // this function
+ case DW_FORM_indirect:
+ OS << "DW_FORM_indirect";
+ break;
+ default:
+ OS << format("DW_FORM(0x%4.4x)", Form);
+ break;
+ }
+
+ if (cu_relative_offset)
+ OS << format(" => {0x%8.8x}", (uvalue + (cu ? cu->getOffset() : 0)));
+}
+
+const char*
+DWARFFormValue::getAsCString(const DataExtractor *debug_str_data_ptr) const {
+ if (isInlinedCStr()) {
+ return Value.cstr;
+ } else if (debug_str_data_ptr) {
+ uint32_t offset = Value.uval;
+ return debug_str_data_ptr->getCStr(&offset);
+ }
+ return NULL;
+}
+
+uint64_t DWARFFormValue::getReference(const DWARFCompileUnit *cu) const {
+ uint64_t die_offset = Value.uval;
+ switch (Form) {
+ case DW_FORM_ref1:
+ case DW_FORM_ref2:
+ case DW_FORM_ref4:
+ case DW_FORM_ref8:
+ case DW_FORM_ref_udata:
+ die_offset += (cu ? cu->getOffset() : 0);
+ break;
+ default:
+ break;
+ }
+
+ return die_offset;
+}
+
+bool
+DWARFFormValue::resolveCompileUnitReferences(const DWARFCompileUnit *cu) {
+ switch (Form) {
+ case DW_FORM_ref1:
+ case DW_FORM_ref2:
+ case DW_FORM_ref4:
+ case DW_FORM_ref8:
+ case DW_FORM_ref_udata:
+ Value.uval += cu->getOffset();
+ Form = DW_FORM_ref_addr;
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+const uint8_t *DWARFFormValue::BlockData() const {
+ if (!isInlinedCStr())
+ return Value.data;
+ return NULL;
+}
+
+bool DWARFFormValue::isBlockForm(uint16_t form) {
+ switch (form) {
+ case DW_FORM_block:
+ case DW_FORM_block1:
+ case DW_FORM_block2:
+ case DW_FORM_block4:
+ return true;
+ }
+ return false;
+}
+
+bool DWARFFormValue::isDataForm(uint16_t form) {
+ switch (form) {
+ case DW_FORM_sdata:
+ case DW_FORM_udata:
+ case DW_FORM_data1:
+ case DW_FORM_data2:
+ case DW_FORM_data4:
+ case DW_FORM_data8:
+ return true;
+ }
+ return false;
+}
diff --git a/lib/DebugInfo/DWARFFormValue.h b/lib/DebugInfo/DWARFFormValue.h
new file mode 100644
index 0000000..22ac011
--- /dev/null
+++ b/lib/DebugInfo/DWARFFormValue.h
@@ -0,0 +1,78 @@
+//===-- DWARFFormValue.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFFORMVALUE_H
+#define LLVM_DEBUGINFO_DWARFFORMVALUE_H
+
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+
+class DWARFCompileUnit;
+class raw_ostream;
+
+class DWARFFormValue {
+public:
+ struct ValueType {
+ ValueType() : data(NULL) {
+ uval = 0;
+ }
+
+ union {
+ uint64_t uval;
+ int64_t sval;
+ const char* cstr;
+ };
+ const uint8_t* data;
+ };
+
+ enum {
+ eValueTypeInvalid = 0,
+ eValueTypeUnsigned,
+ eValueTypeSigned,
+ eValueTypeCStr,
+ eValueTypeBlock
+ };
+
+private:
+ uint16_t Form; // Form for this value.
+ ValueType Value; // Contains all data for the form.
+
+public:
+ DWARFFormValue(uint16_t form = 0) : Form(form) {}
+ uint16_t getForm() const { return Form; }
+ const ValueType& value() const { return Value; }
+ void dump(raw_ostream &OS, const DWARFCompileUnit* cu) const;
+ bool extractValue(DataExtractor data, uint32_t *offset_ptr,
+ const DWARFCompileUnit *cu);
+ bool isInlinedCStr() const {
+ return Value.data != NULL && Value.data == (uint8_t*)Value.cstr;
+ }
+ const uint8_t *BlockData() const;
+ uint64_t getReference(const DWARFCompileUnit* cu) const;
+
+ /// Resolve any compile unit specific references so that we don't need
+ /// the compile unit at a later time in order to work with the form
+ /// value.
+ bool resolveCompileUnitReferences(const DWARFCompileUnit* cu);
+ uint64_t getUnsigned() const { return Value.uval; }
+ int64_t getSigned() const { return Value.sval; }
+ const char *getAsCString(const DataExtractor *debug_str_data_ptr) const;
+ bool skipValue(DataExtractor debug_info_data, uint32_t *offset_ptr,
+ const DWARFCompileUnit *cu) const;
+ static bool skipValue(uint16_t form, DataExtractor debug_info_data,
+ uint32_t *offset_ptr, const DWARFCompileUnit *cu);
+ static bool isBlockForm(uint16_t form);
+ static bool isDataForm(uint16_t form);
+ static const uint8_t *getFixedFormSizesForAddressSize(uint8_t addr_size);
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/Makefile b/lib/DebugInfo/Makefile
new file mode 100644
index 0000000..1292b57
--- /dev/null
+++ b/lib/DebugInfo/Makefile
@@ -0,0 +1,14 @@
+##===- lib/DebugInfo/Makefile ------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMDebugInfo
+BUILD_ARCHIVE := 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt
index 58caae8..fb14d41 100644
--- a/lib/ExecutionEngine/CMakeLists.txt
+++ b/lib/ExecutionEngine/CMakeLists.txt
@@ -4,6 +4,13 @@ add_llvm_library(LLVMExecutionEngine
TargetSelect.cpp
)
+add_llvm_library_dependencies(LLVMExecutionEngine
+ LLVMCore
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(Interpreter)
add_subdirectory(JIT)
add_subdirectory(MCJIT)
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index 7652090..525877b 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -93,7 +93,7 @@ public:
/// \brief Returns the address the GlobalVariable should be written into. The
/// GVMemoryBlock object prefixes that.
static char *Create(const GlobalVariable *GV, const TargetData& TD) {
- const Type *ElTy = GV->getType()->getElementType();
+ Type *ElTy = GV->getType()->getElementType();
size_t GVSize = (size_t)TD.getTypeAllocSize(ElTy);
void *RawMemory = ::operator new(
TargetData::RoundUpAlignment(sizeof(GVMemoryBlock),
@@ -272,7 +272,7 @@ void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE,
Array = new char[(InputArgv.size()+1)*PtrSize];
DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array << "\n");
- const Type *SBytePtr = Type::getInt8PtrTy(C);
+ Type *SBytePtr = Type::getInt8PtrTy(C);
for (unsigned i = 0; i != InputArgv.size(); ++i) {
unsigned Size = InputArgv[i].size()+1;
@@ -361,8 +361,8 @@ int ExecutionEngine::runFunctionAsMain(Function *Fn,
// Check main() type
unsigned NumArgs = Fn->getFunctionType()->getNumParams();
- const FunctionType *FTy = Fn->getFunctionType();
- const Type* PPInt8Ty = Type::getInt8PtrTy(Fn->getContext())->getPointerTo();
+ FunctionType *FTy = Fn->getFunctionType();
+ Type* PPInt8Ty = Type::getInt8PtrTy(Fn->getContext())->getPointerTo();
// Check the argument types.
if (NumArgs > 3)
@@ -422,6 +422,7 @@ ExecutionEngine *ExecutionEngine::createJIT(Module *M,
JITMemoryManager *JMM,
CodeGenOpt::Level OptLevel,
bool GVsWithCode,
+ Reloc::Model RM,
CodeModel::Model CMM) {
if (ExecutionEngine::JITCtor == 0) {
if (ErrorStr)
@@ -436,9 +437,8 @@ ExecutionEngine *ExecutionEngine::createJIT(Module *M,
SmallVector<std::string, 1> MAttrs;
TargetMachine *TM =
- EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr);
+ EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, RM, CMM, ErrorStr);
if (!TM || (ErrorStr && ErrorStr->length() > 0)) return 0;
- TM->setCodeModel(CMM);
return ExecutionEngine::JITCtor(M, ErrorStr, JMM, OptLevel, GVsWithCode, TM);
}
@@ -465,10 +465,9 @@ ExecutionEngine *EngineBuilder::create() {
// Unless the interpreter was explicitly selected or the JIT is not linked,
// try making a JIT.
if (WhichEngine & EngineKind::JIT) {
- if (TargetMachine *TM =
- EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs, ErrorStr)) {
- TM->setCodeModel(CMModel);
-
+ if (TargetMachine *TM = EngineBuilder::selectTarget(M, MArch, MCPU, MAttrs,
+ RelocModel, CMModel,
+ ErrorStr)) {
if (UseMCJIT && ExecutionEngine::MCJITCtor) {
ExecutionEngine *EE =
ExecutionEngine::MCJITCtor(M, ErrorStr, JMM, OptLevel,
@@ -548,8 +547,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
// Compute the index
GenericValue Result = getConstantValue(Op0);
SmallVector<Value*, 8> Indices(CE->op_begin()+1, CE->op_end());
- uint64_t Offset =
- TD->getIndexedOffset(Op0->getType(), &Indices[0], Indices.size());
+ uint64_t Offset = TD->getIndexedOffset(Op0->getType(), Indices);
char* tmp = (char*) Result.PointerVal;
Result = PTOGV(tmp + Offset);
@@ -651,7 +649,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
}
case Instruction::BitCast: {
GenericValue GV = getConstantValue(Op0);
- const Type* DestTy = CE->getType();
+ Type* DestTy = CE->getType();
switch (Op0->getType()->getTypeID()) {
default: llvm_unreachable("Invalid bitcast operand");
case Type::IntegerTyID:
@@ -847,7 +845,7 @@ static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
}
void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
- GenericValue *Ptr, const Type *Ty) {
+ GenericValue *Ptr, Type *Ty) {
const unsigned StoreBytes = getTargetData()->getTypeStoreSize(Ty);
switch (Ty->getTypeID()) {
@@ -909,7 +907,7 @@ static void LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) {
///
void ExecutionEngine::LoadValueFromMemory(GenericValue &Result,
GenericValue *Ptr,
- const Type *Ty) {
+ Type *Ty) {
const unsigned LoadBytes = getTargetData()->getTypeStoreSize(Ty);
switch (Ty->getTypeID()) {
@@ -932,7 +930,7 @@ void ExecutionEngine::LoadValueFromMemory(GenericValue &Result,
// FIXME: Will not trap if loading a signaling NaN.
uint64_t y[2];
memcpy(y, Ptr, 10);
- Result.IntVal = APInt(80, 2, y);
+ Result.IntVal = APInt(80, y);
break;
}
default:
@@ -986,7 +984,7 @@ void ExecutionEngine::emitGlobals() {
// Loop over all of the global variables in the program, allocating the memory
// to hold them. If there is more than one module, do a prepass over globals
// to figure out how the different modules should link together.
- std::map<std::pair<std::string, const Type*>,
+ std::map<std::pair<std::string, Type*>,
const GlobalValue*> LinkedGlobalsMap;
if (Modules.size() != 1) {
@@ -1101,7 +1099,7 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
if (!GV->isThreadLocal())
InitializeMemory(GV->getInitializer(), GA);
- const Type *ElTy = GV->getType()->getElementType();
+ Type *ElTy = GV->getType()->getElementType();
size_t GVSize = (size_t)getTargetData()->getTypeAllocSize(ElTy);
NumInitBytes += (unsigned)GVSize;
++NumGlobals;
diff --git a/lib/ExecutionEngine/Interpreter/CMakeLists.txt b/lib/ExecutionEngine/Interpreter/CMakeLists.txt
index d331f83..4fb58c2 100644
--- a/lib/ExecutionEngine/Interpreter/CMakeLists.txt
+++ b/lib/ExecutionEngine/Interpreter/CMakeLists.txt
@@ -12,6 +12,14 @@ add_llvm_library(LLVMInterpreter
Interpreter.cpp
)
+add_llvm_library_dependencies(LLVMInterpreter
+ LLVMCodeGen
+ LLVMCore
+ LLVMExecutionEngine
+ LLVMSupport
+ LLVMTarget
+ )
+
if( LLVM_ENABLE_FFI )
target_link_libraries( LLVMInterpreter ${FFI_LIBRARY_PATH} )
endif()
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
index 498063b..27917da 100644
--- a/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -51,7 +51,7 @@ static void SetValue(Value *V, GenericValue Val, ExecutionContext &SF) {
break
static void executeFAddInst(GenericValue &Dest, GenericValue Src1,
- GenericValue Src2, const Type *Ty) {
+ GenericValue Src2, Type *Ty) {
switch (Ty->getTypeID()) {
IMPLEMENT_BINARY_OPERATOR(+, Float);
IMPLEMENT_BINARY_OPERATOR(+, Double);
@@ -62,7 +62,7 @@ static void executeFAddInst(GenericValue &Dest, GenericValue Src1,
}
static void executeFSubInst(GenericValue &Dest, GenericValue Src1,
- GenericValue Src2, const Type *Ty) {
+ GenericValue Src2, Type *Ty) {
switch (Ty->getTypeID()) {
IMPLEMENT_BINARY_OPERATOR(-, Float);
IMPLEMENT_BINARY_OPERATOR(-, Double);
@@ -73,7 +73,7 @@ static void executeFSubInst(GenericValue &Dest, GenericValue Src1,
}
static void executeFMulInst(GenericValue &Dest, GenericValue Src1,
- GenericValue Src2, const Type *Ty) {
+ GenericValue Src2, Type *Ty) {
switch (Ty->getTypeID()) {
IMPLEMENT_BINARY_OPERATOR(*, Float);
IMPLEMENT_BINARY_OPERATOR(*, Double);
@@ -84,7 +84,7 @@ static void executeFMulInst(GenericValue &Dest, GenericValue Src1,
}
static void executeFDivInst(GenericValue &Dest, GenericValue Src1,
- GenericValue Src2, const Type *Ty) {
+ GenericValue Src2, Type *Ty) {
switch (Ty->getTypeID()) {
IMPLEMENT_BINARY_OPERATOR(/, Float);
IMPLEMENT_BINARY_OPERATOR(/, Double);
@@ -95,7 +95,7 @@ static void executeFDivInst(GenericValue &Dest, GenericValue Src1,
}
static void executeFRemInst(GenericValue &Dest, GenericValue Src1,
- GenericValue Src2, const Type *Ty) {
+ GenericValue Src2, Type *Ty) {
switch (Ty->getTypeID()) {
case Type::FloatTyID:
Dest.FloatVal = fmod(Src1.FloatVal, Src2.FloatVal);
@@ -125,7 +125,7 @@ static void executeFRemInst(GenericValue &Dest, GenericValue Src1,
break;
static GenericValue executeICMP_EQ(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(eq,Ty);
@@ -138,7 +138,7 @@ static GenericValue executeICMP_EQ(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_NE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(ne,Ty);
@@ -151,7 +151,7 @@ static GenericValue executeICMP_NE(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_ULT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(ult,Ty);
@@ -164,7 +164,7 @@ static GenericValue executeICMP_ULT(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_SLT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(slt,Ty);
@@ -177,7 +177,7 @@ static GenericValue executeICMP_SLT(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_UGT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(ugt,Ty);
@@ -190,7 +190,7 @@ static GenericValue executeICMP_UGT(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_SGT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(sgt,Ty);
@@ -203,7 +203,7 @@ static GenericValue executeICMP_SGT(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_ULE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(ule,Ty);
@@ -216,7 +216,7 @@ static GenericValue executeICMP_ULE(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_SLE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(sle,Ty);
@@ -229,7 +229,7 @@ static GenericValue executeICMP_SLE(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_UGE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(uge,Ty);
@@ -242,7 +242,7 @@ static GenericValue executeICMP_UGE(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeICMP_SGE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_INTEGER_ICMP(sge,Ty);
@@ -256,7 +256,7 @@ static GenericValue executeICMP_SGE(GenericValue Src1, GenericValue Src2,
void Interpreter::visitICmpInst(ICmpInst &I) {
ExecutionContext &SF = ECStack.back();
- const Type *Ty = I.getOperand(0)->getType();
+ Type *Ty = I.getOperand(0)->getType();
GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
GenericValue R; // Result
@@ -286,7 +286,7 @@ void Interpreter::visitICmpInst(ICmpInst &I) {
break
static GenericValue executeFCMP_OEQ(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_FCMP(==, Float);
@@ -299,7 +299,7 @@ static GenericValue executeFCMP_OEQ(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeFCMP_ONE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_FCMP(!=, Float);
@@ -313,7 +313,7 @@ static GenericValue executeFCMP_ONE(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeFCMP_OLE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_FCMP(<=, Float);
@@ -326,7 +326,7 @@ static GenericValue executeFCMP_OLE(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeFCMP_OGE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_FCMP(>=, Float);
@@ -339,7 +339,7 @@ static GenericValue executeFCMP_OGE(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeFCMP_OLT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_FCMP(<, Float);
@@ -352,7 +352,7 @@ static GenericValue executeFCMP_OLT(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeFCMP_OGT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
switch (Ty->getTypeID()) {
IMPLEMENT_FCMP(>, Float);
@@ -377,49 +377,49 @@ static GenericValue executeFCMP_OGT(GenericValue Src1, GenericValue Src2,
static GenericValue executeFCMP_UEQ(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
IMPLEMENT_UNORDERED(Ty, Src1, Src2)
return executeFCMP_OEQ(Src1, Src2, Ty);
}
static GenericValue executeFCMP_UNE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
IMPLEMENT_UNORDERED(Ty, Src1, Src2)
return executeFCMP_ONE(Src1, Src2, Ty);
}
static GenericValue executeFCMP_ULE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
IMPLEMENT_UNORDERED(Ty, Src1, Src2)
return executeFCMP_OLE(Src1, Src2, Ty);
}
static GenericValue executeFCMP_UGE(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
IMPLEMENT_UNORDERED(Ty, Src1, Src2)
return executeFCMP_OGE(Src1, Src2, Ty);
}
static GenericValue executeFCMP_ULT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
IMPLEMENT_UNORDERED(Ty, Src1, Src2)
return executeFCMP_OLT(Src1, Src2, Ty);
}
static GenericValue executeFCMP_UGT(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
IMPLEMENT_UNORDERED(Ty, Src1, Src2)
return executeFCMP_OGT(Src1, Src2, Ty);
}
static GenericValue executeFCMP_ORD(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
if (Ty->isFloatTy())
Dest.IntVal = APInt(1,(Src1.FloatVal == Src1.FloatVal &&
@@ -431,7 +431,7 @@ static GenericValue executeFCMP_ORD(GenericValue Src1, GenericValue Src2,
}
static GenericValue executeFCMP_UNO(GenericValue Src1, GenericValue Src2,
- const Type *Ty) {
+ Type *Ty) {
GenericValue Dest;
if (Ty->isFloatTy())
Dest.IntVal = APInt(1,(Src1.FloatVal != Src1.FloatVal ||
@@ -444,7 +444,7 @@ static GenericValue executeFCMP_UNO(GenericValue Src1, GenericValue Src2,
void Interpreter::visitFCmpInst(FCmpInst &I) {
ExecutionContext &SF = ECStack.back();
- const Type *Ty = I.getOperand(0)->getType();
+ Type *Ty = I.getOperand(0)->getType();
GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
GenericValue R; // Result
@@ -475,7 +475,7 @@ void Interpreter::visitFCmpInst(FCmpInst &I) {
}
static GenericValue executeCmpInst(unsigned predicate, GenericValue Src1,
- GenericValue Src2, const Type *Ty) {
+ GenericValue Src2, Type *Ty) {
GenericValue Result;
switch (predicate) {
case ICmpInst::ICMP_EQ: return executeICMP_EQ(Src1, Src2, Ty);
@@ -520,7 +520,7 @@ static GenericValue executeCmpInst(unsigned predicate, GenericValue Src1,
void Interpreter::visitBinaryOperator(BinaryOperator &I) {
ExecutionContext &SF = ECStack.back();
- const Type *Ty = I.getOperand(0)->getType();
+ Type *Ty = I.getOperand(0)->getType();
GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
GenericValue R; // Result
@@ -585,7 +585,7 @@ void Interpreter::exitCalled(GenericValue GV) {
/// care of switching to the normal destination BB, if we are returning
/// from an invoke.
///
-void Interpreter::popStackAndReturnValueToCaller(const Type *RetTy,
+void Interpreter::popStackAndReturnValueToCaller(Type *RetTy,
GenericValue Result) {
// Pop the current stack frame.
ECStack.pop_back();
@@ -613,7 +613,7 @@ void Interpreter::popStackAndReturnValueToCaller(const Type *RetTy,
void Interpreter::visitReturnInst(ReturnInst &I) {
ExecutionContext &SF = ECStack.back();
- const Type *RetTy = Type::getVoidTy(I.getContext());
+ Type *RetTy = Type::getVoidTy(I.getContext());
GenericValue Result;
// Save away the return value... (if we are not 'ret void')
@@ -662,18 +662,21 @@ void Interpreter::visitBranchInst(BranchInst &I) {
void Interpreter::visitSwitchInst(SwitchInst &I) {
ExecutionContext &SF = ECStack.back();
- GenericValue CondVal = getOperandValue(I.getOperand(0), SF);
- const Type *ElTy = I.getOperand(0)->getType();
+ Value* Cond = I.getCondition();
+ Type *ElTy = Cond->getType();
+ GenericValue CondVal = getOperandValue(Cond, SF);
// Check to see if any of the cases match...
BasicBlock *Dest = 0;
- for (unsigned i = 2, e = I.getNumOperands(); i != e; i += 2)
- if (executeICMP_EQ(CondVal, getOperandValue(I.getOperand(i), SF), ElTy)
- .IntVal != 0) {
- Dest = cast<BasicBlock>(I.getOperand(i+1));
+ unsigned NumCases = I.getNumCases();
+ // Skip the first item since that's the default case.
+ for (unsigned i = 1; i < NumCases; ++i) {
+ GenericValue CaseVal = getOperandValue(I.getCaseValue(i), SF);
+ if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
+ Dest = cast<BasicBlock>(I.getSuccessor(i));
break;
}
-
+ }
if (!Dest) Dest = I.getDefaultDest(); // No cases matched: use default
SwitchToNewBasicBlock(Dest, SF);
}
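The rewritten visitSwitchInst goes through the SwitchInst accessors instead of raw operand indices; in this release case 0 is the default, so value cases start at index 1. A sketch of that access pattern, assuming the same 3.0 headers:

    // Walk a 3.0-era SwitchInst via its accessors.
    static void dumpSwitchTargets(SwitchInst &SI) {
      for (unsigned i = 1, e = SI.getNumCases(); i != e; ++i) {
        ConstantInt *CaseVal = SI.getCaseValue(i);  // value tested by case i
        BasicBlock *Dest = SI.getSuccessor(i);      // block taken on a match
        errs() << "case " << CaseVal->getValue() << " -> "
               << Dest->getName() << "\n";
      }
      errs() << "default -> " << SI.getDefaultDest()->getName() << "\n";
    }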
@@ -730,7 +733,7 @@ void Interpreter::SwitchToNewBasicBlock(BasicBlock *Dest, ExecutionContext &SF){
void Interpreter::visitAllocaInst(AllocaInst &I) {
ExecutionContext &SF = ECStack.back();
- const Type *Ty = I.getType()->getElementType(); // Type to be allocated
+ Type *Ty = I.getType()->getElementType(); // Type to be allocated
// Get the number of elements being allocated by the array...
unsigned NumElements =
@@ -767,7 +770,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I,
uint64_t Total = 0;
for (; I != E; ++I) {
- if (const StructType *STy = dyn_cast<StructType>(*I)) {
+ if (StructType *STy = dyn_cast<StructType>(*I)) {
const StructLayout *SLO = TD.getStructLayout(STy);
const ConstantInt *CPU = cast<ConstantInt>(I.getOperand());
@@ -775,7 +778,7 @@ GenericValue Interpreter::executeGEPOperation(Value *Ptr, gep_type_iterator I,
Total += SLO->getElementOffset(Index);
} else {
- const SequentialType *ST = cast<SequentialType>(*I);
+ SequentialType *ST = cast<SequentialType>(*I);
// Get the index number for the array... which must be long type...
GenericValue IdxGV = getOperandValue(I.getOperand(), SF);
@@ -929,34 +932,34 @@ void Interpreter::visitAShr(BinaryOperator &I) {
SetValue(&I, Dest, SF);
}
-GenericValue Interpreter::executeTruncInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeTruncInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- const IntegerType *DITy = cast<IntegerType>(DstTy);
+ IntegerType *DITy = cast<IntegerType>(DstTy);
unsigned DBitWidth = DITy->getBitWidth();
Dest.IntVal = Src.IntVal.trunc(DBitWidth);
return Dest;
}
-GenericValue Interpreter::executeSExtInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- const IntegerType *DITy = cast<IntegerType>(DstTy);
+ IntegerType *DITy = cast<IntegerType>(DstTy);
unsigned DBitWidth = DITy->getBitWidth();
Dest.IntVal = Src.IntVal.sext(DBitWidth);
return Dest;
}
-GenericValue Interpreter::executeZExtInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeZExtInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- const IntegerType *DITy = cast<IntegerType>(DstTy);
+ IntegerType *DITy = cast<IntegerType>(DstTy);
unsigned DBitWidth = DITy->getBitWidth();
Dest.IntVal = Src.IntVal.zext(DBitWidth);
return Dest;
}
-GenericValue Interpreter::executeFPTruncInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeFPTruncInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
assert(SrcVal->getType()->isDoubleTy() && DstTy->isFloatTy() &&
@@ -965,7 +968,7 @@ GenericValue Interpreter::executeFPTruncInst(Value *SrcVal, const Type *DstTy,
return Dest;
}
-GenericValue Interpreter::executeFPExtInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeFPExtInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
assert(SrcVal->getType()->isFloatTy() && DstTy->isDoubleTy() &&
@@ -974,9 +977,9 @@ GenericValue Interpreter::executeFPExtInst(Value *SrcVal, const Type *DstTy,
return Dest;
}
-GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
- const Type *SrcTy = SrcVal->getType();
+ Type *SrcTy = SrcVal->getType();
uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
assert(SrcTy->isFloatingPointTy() && "Invalid FPToUI instruction");
@@ -988,9 +991,9 @@ GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, const Type *DstTy,
return Dest;
}
-GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
- const Type *SrcTy = SrcVal->getType();
+ Type *SrcTy = SrcVal->getType();
uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
assert(SrcTy->isFloatingPointTy() && "Invalid FPToSI instruction");
@@ -1002,7 +1005,7 @@ GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, const Type *DstTy,
return Dest;
}
-GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
assert(DstTy->isFloatingPointTy() && "Invalid UIToFP instruction");
@@ -1014,7 +1017,7 @@ GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, const Type *DstTy,
return Dest;
}
-GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
assert(DstTy->isFloatingPointTy() && "Invalid SIToFP instruction");
@@ -1027,7 +1030,7 @@ GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, const Type *DstTy,
}
-GenericValue Interpreter::executePtrToIntInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executePtrToIntInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
@@ -1037,7 +1040,7 @@ GenericValue Interpreter::executePtrToIntInst(Value *SrcVal, const Type *DstTy,
return Dest;
}
-GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
assert(DstTy->isPointerTy() && "Invalid PtrToInt instruction");
@@ -1050,10 +1053,10 @@ GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, const Type *DstTy,
return Dest;
}
-GenericValue Interpreter::executeBitCastInst(Value *SrcVal, const Type *DstTy,
+GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
- const Type *SrcTy = SrcVal->getType();
+ Type *SrcTy = SrcVal->getType();
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
if (DstTy->isPointerTy()) {
assert(SrcTy->isPointerTy() && "Invalid BitCast");
@@ -1155,7 +1158,7 @@ void Interpreter::visitVAArgInst(VAArgInst &I) {
GenericValue Dest;
GenericValue Src = ECStack[VAList.UIntPairVal.first]
.VarArgs[VAList.UIntPairVal.second];
- const Type *Ty = I.getType();
+ Type *Ty = I.getType();
switch (Ty->getTypeID()) {
case Type::IntegerTyID: Dest.IntVal = Src.IntVal;
IMPLEMENT_VAARG(Pointer);
@@ -1222,7 +1225,7 @@ GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
GenericValue Op0 = getOperandValue(CE->getOperand(0), SF);
GenericValue Op1 = getOperandValue(CE->getOperand(1), SF);
GenericValue Dest;
- const Type * Ty = CE->getOperand(0)->getType();
+ Type * Ty = CE->getOperand(0)->getType();
switch (CE->getOpcode()) {
case Instruction::Add: Dest.IntVal = Op0.IntVal + Op1.IntVal; break;
case Instruction::Sub: Dest.IntVal = Op0.IntVal - Op1.IntVal; break;
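Every signature change in this file is the same mechanical edit: the 3.0 type-system rewrite dropped const from Type* because types are uniqued and immutable by construction. The resulting idiom, sketched with the interpreter's GenericValue (an illustrative helper, not part of the commit):

    // Build a zero of the given first-class type.
    static GenericValue makeZero(Type *Ty) {
      GenericValue V;
      switch (Ty->getTypeID()) {
      case Type::IntegerTyID:
        V.IntVal = APInt(cast<IntegerType>(Ty)->getBitWidth(), 0);
        break;
      case Type::FloatTyID:  V.FloatVal  = 0.0f; break;
      case Type::DoubleTyID: V.DoubleVal = 0.0;  break;
      default: llvm_unreachable("makeZero: unhandled type");
      }
      return V;
    }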
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index f7e2a4d..055875c 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -48,7 +48,7 @@ using namespace llvm;
static ManagedStatic<sys::Mutex> FunctionsLock;
-typedef GenericValue (*ExFunc)(const FunctionType *,
+typedef GenericValue (*ExFunc)(FunctionType *,
const std::vector<GenericValue> &);
static ManagedStatic<std::map<const Function *, ExFunc> > ExportedFunctions;
static std::map<std::string, ExFunc> FuncNames;
@@ -60,7 +60,7 @@ static ManagedStatic<std::map<const Function *, RawFunc> > RawFunctions;
static Interpreter *TheInterpreter;
-static char getTypeID(const Type *Ty) {
+static char getTypeID(Type *Ty) {
switch (Ty->getTypeID()) {
case Type::VoidTyID: return 'V';
case Type::IntegerTyID:
@@ -91,7 +91,7 @@ static ExFunc lookupFunction(const Function *F) {
// Function not found, look it up... start by figuring out what the
// composite function name should be.
std::string ExtName = "lle_";
- const FunctionType *FT = F->getFunctionType();
+ FunctionType *FT = F->getFunctionType();
for (unsigned i = 0, e = FT->getNumContainedTypes(); i != e; ++i)
ExtName += getTypeID(FT->getContainedType(i));
ExtName + "_" + F->getNameStr();
@@ -109,7 +109,7 @@ static ExFunc lookupFunction(const Function *F) {
}
#ifdef USE_LIBFFI
-static ffi_type *ffiTypeFor(const Type *Ty) {
+static ffi_type *ffiTypeFor(Type *Ty) {
switch (Ty->getTypeID()) {
case Type::VoidTyID: return &ffi_type_void;
case Type::IntegerTyID:
@@ -129,7 +129,7 @@ static ffi_type *ffiTypeFor(const Type *Ty) {
return NULL;
}
-static void *ffiValueFor(const Type *Ty, const GenericValue &AV,
+static void *ffiValueFor(Type *Ty, const GenericValue &AV,
void *ArgDataPtr) {
switch (Ty->getTypeID()) {
case Type::IntegerTyID:
@@ -181,7 +181,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F,
const std::vector<GenericValue> &ArgVals,
const TargetData *TD, GenericValue &Result) {
ffi_cif cif;
- const FunctionType *FTy = F->getFunctionType();
+ FunctionType *FTy = F->getFunctionType();
const unsigned NumArgs = F->arg_size();
// TODO: We don't have type information about the remaining arguments, because
@@ -197,7 +197,7 @@ static bool ffiInvoke(RawFunc Fn, Function *F,
for (Function::const_arg_iterator A = F->arg_begin(), E = F->arg_end();
A != E; ++A) {
const unsigned ArgNo = A->getArgNo();
- const Type *ArgTy = FTy->getParamType(ArgNo);
+ Type *ArgTy = FTy->getParamType(ArgNo);
args[ArgNo] = ffiTypeFor(ArgTy);
ArgBytes += TD->getTypeStoreSize(ArgTy);
}
@@ -209,12 +209,12 @@ static bool ffiInvoke(RawFunc Fn, Function *F,
for (Function::const_arg_iterator A = F->arg_begin(), E = F->arg_end();
A != E; ++A) {
const unsigned ArgNo = A->getArgNo();
- const Type *ArgTy = FTy->getParamType(ArgNo);
+ Type *ArgTy = FTy->getParamType(ArgNo);
values[ArgNo] = ffiValueFor(ArgTy, ArgVals[ArgNo], ArgDataPtr);
ArgDataPtr += TD->getTypeStoreSize(ArgTy);
}
- const Type *RetTy = FTy->getReturnType();
+ Type *RetTy = FTy->getReturnType();
ffi_type *rtype = ffiTypeFor(RetTy);
if (ffi_prep_cif(&cif, FFI_DEFAULT_ABI, NumArgs, rtype, &args[0]) == FFI_OK) {
@@ -304,7 +304,7 @@ GenericValue Interpreter::callExternalFunction(Function *F,
extern "C" { // Don't add C++ manglings to llvm mangling :)
// void atexit(Function*)
-GenericValue lle_X_atexit(const FunctionType *FT,
+GenericValue lle_X_atexit(FunctionType *FT,
const std::vector<GenericValue> &Args) {
assert(Args.size() == 1);
TheInterpreter->addAtExitHandler((Function*)GVTOP(Args[0]));
@@ -314,14 +314,14 @@ GenericValue lle_X_atexit(const FunctionType *FT,
}
// void exit(int)
-GenericValue lle_X_exit(const FunctionType *FT,
+GenericValue lle_X_exit(FunctionType *FT,
const std::vector<GenericValue> &Args) {
TheInterpreter->exitCalled(Args[0]);
return GenericValue();
}
// void abort(void)
-GenericValue lle_X_abort(const FunctionType *FT,
+GenericValue lle_X_abort(FunctionType *FT,
const std::vector<GenericValue> &Args) {
//FIXME: should we report or raise here?
//report_fatal_error("Interpreted program raised SIGABRT");
@@ -331,7 +331,7 @@ GenericValue lle_X_abort(const FunctionType *FT,
// int sprintf(char *, const char *, ...) - a very rough implementation to make
// output useful.
-GenericValue lle_X_sprintf(const FunctionType *FT,
+GenericValue lle_X_sprintf(FunctionType *FT,
const std::vector<GenericValue> &Args) {
char *OutputBuffer = (char *)GVTOP(Args[0]);
const char *FmtStr = (const char *)GVTOP(Args[1]);
@@ -413,7 +413,7 @@ GenericValue lle_X_sprintf(const FunctionType *FT,
// int printf(const char *, ...) - a very rough implementation to make output
// useful.
-GenericValue lle_X_printf(const FunctionType *FT,
+GenericValue lle_X_printf(FunctionType *FT,
const std::vector<GenericValue> &Args) {
char Buffer[10000];
std::vector<GenericValue> NewArgs;
@@ -425,7 +425,7 @@ GenericValue lle_X_printf(const FunctionType *FT,
}
// int sscanf(const char *format, ...);
-GenericValue lle_X_sscanf(const FunctionType *FT,
+GenericValue lle_X_sscanf(FunctionType *FT,
const std::vector<GenericValue> &args) {
assert(args.size() < 10 && "Only handle up to 10 args to sscanf right now!");
@@ -440,7 +440,7 @@ GenericValue lle_X_sscanf(const FunctionType *FT,
}
// int scanf(const char *format, ...);
-GenericValue lle_X_scanf(const FunctionType *FT,
+GenericValue lle_X_scanf(FunctionType *FT,
const std::vector<GenericValue> &args) {
assert(args.size() < 10 && "Only handle up to 10 args to scanf right now!");
@@ -456,7 +456,7 @@ GenericValue lle_X_scanf(const FunctionType *FT,
// int fprintf(FILE *, const char *, ...) - a very rough implementation to make
// output useful.
-GenericValue lle_X_fprintf(const FunctionType *FT,
+GenericValue lle_X_fprintf(FunctionType *FT,
const std::vector<GenericValue> &Args) {
assert(Args.size() >= 2);
char Buffer[10000];
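All of the lle_X_ handlers in this file share the ExFunc shape declared near the top, and live inside the extern "C" block so the by-name lookup finds them. A hypothetical handler in the same mold (puts is illustrative; <cstdio> is assumed to be available):

    // int puts(const char *)
    GenericValue lle_X_puts(FunctionType *FT,
                            const std::vector<GenericValue> &Args) {
      assert(Args.size() == 1);
      const char *Str = (const char *)GVTOP(Args[0]); // GenericValue -> pointer
      GenericValue GV;
      GV.IntVal = APInt(32, puts(Str));
      return GV;
    }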
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h
index bfebe3d..ee2b459 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -174,7 +174,7 @@ public:
void visitVAArgInst(VAArgInst &I);
void visitInstruction(Instruction &I) {
- errs() << I;
+ errs() << I << "\n";
llvm_unreachable("Instruction not interpretable yet!");
}
@@ -207,33 +207,33 @@ private: // Helper functions
void initializeExternalFunctions();
GenericValue getConstantExprValue(ConstantExpr *CE, ExecutionContext &SF);
GenericValue getOperandValue(Value *V, ExecutionContext &SF);
- GenericValue executeTruncInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeTruncInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeSExtInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeSExtInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeZExtInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeZExtInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeFPTruncInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeFPTruncInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeFPExtInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeFPExtInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeFPToUIInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeFPToUIInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeFPToSIInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeFPToSIInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeUIToFPInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeUIToFPInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeSIToFPInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeSIToFPInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executePtrToIntInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executePtrToIntInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeIntToPtrInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeIntToPtrInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
- GenericValue executeBitCastInst(Value *SrcVal, const Type *DstTy,
+ GenericValue executeBitCastInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF);
GenericValue executeCastOperation(Instruction::CastOps opcode, Value *SrcVal,
- const Type *Ty, ExecutionContext &SF);
- void popStackAndReturnValueToCaller(const Type *RetTy, GenericValue Result);
+ Type *Ty, ExecutionContext &SF);
+ void popStackAndReturnValueToCaller(Type *RetTy, GenericValue Result);
};
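A side effect worth noting in this header: llvm::cast<> mirrors the qualifiers of its argument, so once the Type* parameters lose const, the derived-type results do too, and the .cpp bodies need no const_cast. A minimal sketch:

    // An unqualified Type* yields an unqualified IntegerType*.
    static unsigned destBitWidth(Type *DstTy) {
      return cast<IntegerType>(DstTy)->getBitWidth();
    }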
diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt
index cefb0ae..598e50e 100644
--- a/lib/ExecutionEngine/JIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/JIT/CMakeLists.txt
@@ -10,3 +10,11 @@ add_llvm_library(LLVMJIT
JITMemoryManager.cpp
OProfileJITEventListener.cpp
)
+
+add_llvm_library_dependencies(LLVMJIT
+ LLVMCore
+ LLVMExecutionEngine
+ LLVMRuntimeDyld
+ LLVMSupport
+ LLVMTarget
+ )
diff --git a/lib/ExecutionEngine/JIT/Intercept.cpp b/lib/ExecutionEngine/JIT/Intercept.cpp
index fa8bee4..2251a8e 100644
--- a/lib/ExecutionEngine/JIT/Intercept.cpp
+++ b/lib/ExecutionEngine/JIT/Intercept.cpp
@@ -52,6 +52,7 @@ static void runAtExitHandlers() {
#include <sys/stat.h>
#endif
#include <fcntl.h>
+#include <unistd.h>
/* stat functions are redirecting to __xstat with a version number. On x86-64
* linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
* available as an exported symbol, so we have to add it explicitly.
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 445d2d0..d773009 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -390,8 +390,8 @@ GenericValue JIT::runFunction(Function *F,
void *FPtr = getPointerToFunction(F);
assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
- const FunctionType *FTy = F->getFunctionType();
- const Type *RetTy = FTy->getReturnType();
+ FunctionType *FTy = F->getFunctionType();
+ Type *RetTy = FTy->getReturnType();
assert((FTy->getNumParams() == ArgValues.size() ||
(FTy->isVarArg() && FTy->getNumParams() <= ArgValues.size())) &&
@@ -500,7 +500,7 @@ GenericValue JIT::runFunction(Function *F,
SmallVector<Value*, 8> Args;
for (unsigned i = 0, e = ArgValues.size(); i != e; ++i) {
Constant *C = 0;
- const Type *ArgTy = FTy->getParamType(i);
+ Type *ArgTy = FTy->getParamType(i);
const GenericValue &AV = ArgValues[i];
switch (ArgTy->getTypeID()) {
default: llvm_unreachable("Unknown argument type for function call!");
@@ -788,7 +788,7 @@ char* JIT::getMemoryForGV(const GlobalVariable* GV) {
// be allocated into the same buffer, but in general globals are allocated
// through the memory manager which puts them near the code but not in the
// same buffer.
- const Type *GlobalType = GV->getType()->getElementType();
+ Type *GlobalType = GV->getType()->getElementType();
size_t S = getTargetData()->getTypeAllocSize(GlobalType);
size_t A = getTargetData()->getPreferredAlignment(GV);
if (GV->isThreadLocal()) {
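getMemoryForGV sizes the buffer from the pointee type: a GlobalVariable's own type is always a pointer to its contents. A sketch of the size/alignment computation, with the rounding step added for illustration:

    // Footprint of one global in the GV buffer (3.0-era TargetData).
    static size_t globalFootprint(const GlobalVariable *GV, const TargetData *TD) {
      Type *ElTy = GV->getType()->getElementType(); // pointee, not the pointer
      size_t Size  = (size_t)TD->getTypeAllocSize(ElTy);
      size_t Align = TD->getPreferredAlignment(GV);
      return (Size + Align - 1) & ~(Align - 1);     // keep the next global aligned
    }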
diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h
index b879fc3..92dcb0e 100644
--- a/lib/ExecutionEngine/JIT/JIT.h
+++ b/lib/ExecutionEngine/JIT/JIT.h
@@ -100,9 +100,10 @@ public:
CodeGenOpt::Level OptLevel =
CodeGenOpt::Default,
bool GVsWithCode = true,
- CodeModel::Model CMM = CodeModel::Default) {
+ Reloc::Model RM = Reloc::Default,
+ CodeModel::Model CMM = CodeModel::JITDefault) {
return ExecutionEngine::createJIT(M, Err, JMM, OptLevel, GVsWithCode,
- CMM);
+ RM, CMM);
}
virtual void addModule(Module *M);
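createJIT gains a Reloc::Model parameter ahead of the code model, and the JIT's default code model becomes CodeModel::JITDefault. A call-site sketch under those assumptions (most clients reach this through EngineBuilder instead):

    std::string Err;
    ExecutionEngine *EE =
        JIT::createJIT(M, &Err, /*JMM=*/0, CodeGenOpt::Default,
                       /*GVsWithCode=*/true, Reloc::Default,
                       CodeModel::JITDefault);
    if (!EE)
      errs() << "JIT creation failed: " << Err << "\n";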
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
index ddb0d54..8f84ac7 100644
--- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
@@ -18,12 +18,12 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/JITCodeEmitter.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MachineLocation.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -45,7 +45,7 @@ unsigned char* JITDwarfEmitter::EmitDwarfTable(MachineFunction& F,
TD = TM.getTargetData();
stackGrowthDirection = TM.getFrameLowering()->getStackGrowthDirection();
RI = TM.getRegisterInfo();
- TFI = TM.getFrameLowering();
+ MAI = TM.getMCAsmInfo();
JCE = &jce;
unsigned char* ExceptionTable = EmitExceptionTable(&F, StartFunction,
@@ -523,9 +523,7 @@ JITDwarfEmitter::EmitCommonEHFrame(const Function* Personality) const {
JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
}
- std::vector<MachineMove> Moves;
- TFI->getInitialFrameState(Moves);
- EmitFrameMoves(0, Moves);
+ EmitFrameMoves(0, MAI->getInitialFrameState());
JCE->emitAlignmentWithFill(PointerSize, dwarf::DW_CFA_nop);
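With MachineLocation moved into MC, the initial frame moves come from MCAsmInfo rather than TargetFrameLowering, which also removes the temporary vector. A sketch, assuming the 3.0 accessor returns the move list by const reference:

    const MCAsmInfo *MAI = TM.getMCAsmInfo();
    const std::vector<MachineMove> &Moves = MAI->getInitialFrameState();
    EmitFrameMoves(0, Moves);   // same call as above, no local copy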
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
index e1d0045..8dc99ab 100644
--- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
+++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
@@ -22,8 +22,8 @@ class JITCodeEmitter;
class MachineFunction;
class MachineModuleInfo;
class MachineMove;
+class MCAsmInfo;
class TargetData;
-class TargetFrameLowering;
class TargetMachine;
class TargetRegisterInfo;
@@ -31,7 +31,7 @@ class JITDwarfEmitter {
const TargetData* TD;
JITCodeEmitter* JCE;
const TargetRegisterInfo* RI;
- const TargetFrameLowering *TFI;
+ const MCAsmInfo *MAI;
MachineModuleInfo* MMI;
JIT& Jit;
bool stackGrowthDirection;
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index d046b8a..24020ee 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -668,6 +668,7 @@ void *JITResolver::JITCompilerFn(void *Stub) {
DEBUG(dbgs() << "JIT: Lazily resolving function '" << F->getName()
<< "' In stub ptr = " << Stub << " actual ptr = "
<< ActualPtr << "\n");
+ (void)ActualPtr;
Result = JR->TheJIT->getPointerToFunction(F);
}
@@ -770,7 +771,7 @@ static unsigned GetConstantPoolSizeInBytes(MachineConstantPool *MCP,
MachineConstantPoolEntry CPE = Constants[i];
unsigned AlignMask = CPE.getAlignment() - 1;
Size = (Size + AlignMask) & ~AlignMask;
- const Type *Ty = CPE.getType();
+ Type *Ty = CPE.getType();
Size += TD->getTypeAllocSize(Ty);
}
return Size;
@@ -1098,7 +1099,7 @@ void JITEmitter::emitConstantPool(MachineConstantPool *MCP) {
DEBUG(dbgs() << "JIT: CP" << i << " at [0x";
dbgs().write_hex(CAddr) << "]\n");
- const Type *Ty = CPE.Val.ConstVal->getType();
+ Type *Ty = CPE.Val.ConstVal->getType();
Offset += TheJIT->getTargetData()->getTypeAllocSize(Ty);
}
}
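The added (void)ActualPtr silences -Wunused-variable in NDEBUG builds, where DEBUG(...) expands to nothing and no other code reads the variable. The general shape (computeStubTarget is a hypothetical stand-in):

    void *ActualPtr = computeStubTarget();
    DEBUG(dbgs() << "actual ptr = " << ActualPtr << "\n");
    (void)ActualPtr; // references the value even when DEBUG compiles away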
diff --git a/lib/ExecutionEngine/MCJIT/CMakeLists.txt b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
index 38fdffa..aae8a1b 100644
--- a/lib/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -2,3 +2,11 @@ add_llvm_library(LLVMMCJIT
MCJIT.cpp
Intercept.cpp
)
+
+add_llvm_library_dependencies(LLVMMCJIT
+ LLVMCore
+ LLVMExecutionEngine
+ LLVMRuntimeDyld
+ LLVMSupport
+ LLVMTarget
+ )
diff --git a/lib/ExecutionEngine/MCJIT/Intercept.cpp b/lib/ExecutionEngine/MCJIT/Intercept.cpp
index e431c84..f83f428 100644
--- a/lib/ExecutionEngine/MCJIT/Intercept.cpp
+++ b/lib/ExecutionEngine/MCJIT/Intercept.cpp
@@ -52,6 +52,7 @@ static void runAtExitHandlers() {
#include <sys/stat.h>
#endif
#include <fcntl.h>
+#include <unistd.h>
/* stat functions are redirecting to __xstat with a version number. On x86-64
* linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
* available as an exported symbol, so we have to add it explicitly.
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 4475f4d..7c8a740 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -59,6 +59,7 @@ MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji,
bool AllocateGVsWithCode)
: ExecutionEngine(m), TM(tm), MemMgr(MM), M(m), OS(Buffer), Dyld(MM) {
+ setTargetData(TM->getTargetData());
PM.add(new TargetData(*TM->getTargetData()));
// Turn the machine code intermediate representation into bytes in memory
@@ -124,8 +125,8 @@ GenericValue MCJIT::runFunction(Function *F,
void *FPtr = getPointerToFunction(F);
assert(FPtr && "Pointer to fn's code was null after getPointerToFunction");
- const FunctionType *FTy = F->getFunctionType();
- const Type *RetTy = FTy->getReturnType();
+ FunctionType *FTy = F->getFunctionType();
+ Type *RetTy = FTy->getReturnType();
assert((FTy->getNumParams() == ArgValues.size() ||
(FTy->isVarArg() && FTy->getNumParams() <= ArgValues.size())) &&
@@ -216,6 +217,6 @@ GenericValue MCJIT::runFunction(Function *F,
}
}
- assert("Full-featured argument passing not supported yet!");
+ assert(0 && "Full-featured argument passing not supported yet!");
return GenericValue();
}
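The one-character MCJIT fix corrects a classic pitfall: a string literal decays to a non-null pointer, so assert("msg") is always true and never fires. The 0 && form keeps the message while making the condition false:

    #include <cassert>

    void unsupportedPath() {
      // assert("not supported");   // BUG: literal is non-null, always passes
      assert(0 && "not supported"); // false condition; message shows on abort
    }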
diff --git a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
index 59bdfee..c236d1d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
+++ b/lib/ExecutionEngine/RuntimeDyld/CMakeLists.txt
@@ -2,3 +2,8 @@ add_llvm_library(LLVMRuntimeDyld
RuntimeDyld.cpp
RuntimeDyldMachO.cpp
)
+
+add_llvm_library_dependencies(LLVMRuntimeDyld
+ LLVMObject
+ LLVMSupport
+ )
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index bcdfb04..7190a3c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -143,7 +143,7 @@ public:
bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const {
return isKnownFormat(InputBuffer);
- };
+ }
};
} // end namespace llvm
diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index f51aff3..004b865 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -17,11 +17,11 @@
#include "llvm/Module.h"
#include "llvm/ADT/Triple.h"
#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Host.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
/// selectTarget - Pick a target either via -march or by guessing the native
@@ -30,6 +30,8 @@ TargetMachine *EngineBuilder::selectTarget(Module *Mod,
StringRef MArch,
StringRef MCPU,
const SmallVectorImpl<std::string>& MAttrs,
+ Reloc::Model RM,
+ CodeModel::Model CM,
std::string *ErrorStr) {
Triple TheTriple(Mod->getTargetTriple());
if (TheTriple.getTriple().empty())
@@ -83,8 +85,9 @@ TargetMachine *EngineBuilder::selectTarget(Module *Mod,
}
// Allocate a target...
- TargetMachine *Target =
- TheTarget->createTargetMachine(TheTriple.getTriple(), MCPU, FeaturesStr);
+ TargetMachine *Target = TheTarget->createTargetMachine(TheTriple.getTriple(),
+ MCPU, FeaturesStr,
+ RM, CM);
assert(Target && "Could not allocate target machine!");
return Target;
}
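selectTarget now threads Reloc::Model and CodeModel::Model through to createTargetMachine, whose 3.0 signature takes them after the feature string. The lookup-then-create flow, trimmed to a sketch:

    std::string Error;
    const Target *TheTarget =
        TargetRegistry::lookupTarget(TheTriple.getTriple(), Error);
    if (!TheTarget)
      report_fatal_error(Error);
    TargetMachine *Target =
        TheTarget->createTargetMachine(TheTriple.getTriple(), MCPU, FeaturesStr,
                                       RM, CM);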
diff --git a/lib/Linker/CMakeLists.txt b/lib/Linker/CMakeLists.txt
index 0b6d2f4..4d8824b 100644
--- a/lib/Linker/CMakeLists.txt
+++ b/lib/Linker/CMakeLists.txt
@@ -4,3 +4,11 @@ add_llvm_library(LLVMLinker
LinkModules.cpp
Linker.cpp
)
+
+add_llvm_library_dependencies(LLVMLinker
+ LLVMArchive
+ LLVMBitReader
+ LLVMCore
+ LLVMSupport
+ LLVMTransformUtils
+ )
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index 55aa9bf..03a962e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -14,9 +14,12 @@
#include "llvm/Linker.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
#include "llvm/Module.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Path.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
@@ -139,7 +142,7 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
return false;
} else if (StructType *DSTy = dyn_cast<StructType>(DstTy)) {
StructType *SSTy = cast<StructType>(SrcTy);
- if (DSTy->isAnonymous() != SSTy->isAnonymous() ||
+ if (DSTy->isLiteral() != SSTy->isLiteral() ||
DSTy->isPacked() != SSTy->isPacked())
return false;
} else if (ArrayType *DATy = dyn_cast<ArrayType>(DstTy)) {
@@ -223,7 +226,7 @@ Type *TypeMapTy::getImpl(Type *Ty) {
// If this is not a named struct type, then just map all of the elements and
// then rebuild the type from inside out.
- if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isAnonymous()) {
+ if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isLiteral()) {
// If there are no element types to map, then the type is itself. This is
// true for the anonymous {} struct, things like 'float', integers, etc.
if (Ty->getNumContainedTypes() == 0)
@@ -261,7 +264,7 @@ Type *TypeMapTy::getImpl(Type *Ty) {
cast<PointerType>(Ty)->getAddressSpace());
case Type::FunctionTyID:
return *Entry = FunctionType::get(ElementTypes[0],
- ArrayRef<Type*>(ElementTypes).slice(1),
+ makeArrayRef(ElementTypes).slice(1),
cast<FunctionType>(Ty)->isVarArg());
case Type::StructTyID:
// Note that this is only reached for anonymous structs.
@@ -302,7 +305,7 @@ Type *TypeMapTy::getImpl(Type *Ty) {
// Otherwise we create a new type and resolve its body later. This will be
// resolved by the top level of get().
DefinitionsToResolve.push_back(STy);
- return *Entry = StructType::createNamed(STy->getContext(), "");
+ return *Entry = StructType::create(STy->getContext());
}
@@ -333,10 +336,16 @@ namespace {
std::vector<AppendingVarInfo> AppendingVars;
+ unsigned Mode; // Mode to treat source module.
+
+ // Set of items not to link in from source.
+ SmallPtrSet<const Value*, 16> DoNotLinkFromSource;
+
public:
std::string ErrorMsg;
- ModuleLinker(Module *dstM, Module *srcM) : DstM(dstM), SrcM(srcM) { }
+ ModuleLinker(Module *dstM, Module *srcM, unsigned mode)
+ : DstM(dstM), SrcM(srcM), Mode(mode) { }
bool run();
@@ -596,9 +605,9 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
DstGV->replaceAllUsesWith(ConstantExpr::getBitCast(NG, DstGV->getType()));
DstGV->eraseFromParent();
- // Zap the initializer in the source variable so we don't try to link it.
- SrcGV->setInitializer(0);
- SrcGV->setLinkage(GlobalValue::ExternalLinkage);
+ // Track the source variable so we don't try to link it.
+ DoNotLinkFromSource.insert(SrcGV);
+
return false;
}
@@ -633,11 +642,10 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
// Make sure to remember this mapping.
ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
- // Destroy the source global's initializer (and convert it to a prototype)
- // so that we don't attempt to copy it over when processing global
- // initializers.
- SGV->setInitializer(0);
- SGV->setLinkage(GlobalValue::ExternalLinkage);
+ // Track the source global so that we don't attempt to copy it over when
+ // processing global initializers.
+ DoNotLinkFromSource.insert(SGV);
+
return false;
}
}
@@ -682,8 +690,10 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
// Make sure to remember this mapping.
ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType()));
- // Remove the body from the source module so we don't attempt to remap it.
- SF->deleteBody();
+ // Track the function from the source module so we don't attempt to remap
+ // it.
+ DoNotLinkFromSource.insert(SF);
+
return false;
}
}
@@ -722,8 +732,9 @@ bool ModuleLinker::linkAliasProto(GlobalAlias *SGA) {
// Make sure to remember this mapping.
ValueMap[SGA] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGA->getType()));
- // Remove the body from the source module so we don't attempt to remap it.
- SGA->setAliasee(0);
+ // Track the alias from the source module so we don't attempt to remap it.
+ DoNotLinkFromSource.insert(SGA);
+
return false;
}
}
@@ -779,7 +790,9 @@ void ModuleLinker::linkGlobalInits() {
// Loop over all of the globals in the src module, mapping them over as we go
for (Module::const_global_iterator I = SrcM->global_begin(),
E = SrcM->global_end(); I != E; ++I) {
- if (!I->hasInitializer()) continue; // Only process initialized GV's.
+
+ // Only process initialized GV's or ones not already in dest.
+ if (!I->hasInitializer() || DoNotLinkFromSource.count(I)) continue;
// Grab destination global variable.
GlobalVariable *DGV = cast<GlobalVariable>(ValueMap[I]);
@@ -805,31 +818,42 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) {
ValueMap[I] = DI;
}
- // Splice the body of the source function into the dest function.
- Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList());
-
- // At this point, all of the instructions and values of the function are now
- // copied over. The only problem is that they are still referencing values in
- // the Source function as operands. Loop through all of the operands of the
- // functions and patch them up to point to the local versions.
- for (Function::iterator BB = Dst->begin(), BE = Dst->end(); BB != BE; ++BB)
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
- RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries, &TypeMap);
-
+ if (Mode == Linker::DestroySource) {
+ // Splice the body of the source function into the dest function.
+ Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList());
+
+ // At this point, all of the instructions and values of the function are now
+ // copied over. The only problem is that they are still referencing values in
+ // the Source function as operands. Loop through all of the operands of the
+ // functions and patch them up to point to the local versions.
+ for (Function::iterator BB = Dst->begin(), BE = Dst->end(); BB != BE; ++BB)
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries, &TypeMap);
+
+ } else {
+ // Clone the body of the function into the dest function.
+ SmallVector<ReturnInst*, 8> Returns; // Ignore returns.
+ CloneFunctionInto(Dst, Src, ValueMap, false, Returns);
+ }
+
// There is no need to map the arguments anymore.
for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end();
I != E; ++I)
ValueMap.erase(I);
+
}
void ModuleLinker::linkAliasBodies() {
for (Module::alias_iterator I = SrcM->alias_begin(), E = SrcM->alias_end();
- I != E; ++I)
+ I != E; ++I) {
+ if (DoNotLinkFromSource.count(I))
+ continue;
if (Constant *Aliasee = I->getAliasee()) {
GlobalAlias *DA = cast<GlobalAlias>(ValueMap[I]);
DA->setAliasee(MapValue(Aliasee, ValueMap, RF_None, &TypeMap));
}
+ }
}
/// linkNamedMDNodes - Insert all of the named mdnodes in Src into the Dest
@@ -891,16 +915,10 @@ bool ModuleLinker::run() {
StringRef ModuleId = SrcM->getModuleIdentifier();
if (!ModuleId.empty())
DstM->removeLibrary(sys::path::stem(ModuleId));
-
// Loop over all of the linked values to compute type mappings.
computeTypeMapping();
- // Remap all of the named mdnoes in Src into the DstM module. We do this
- // after linking GlobalValues so that MDNodes that reference GlobalValues
- // are properly remapped.
- linkNamedMDNodes();
-
// Insert all of the globals in src into the DstM module... without linking
// initializers (which could refer to functions not yet mapped over).
for (Module::global_iterator I = SrcM->global_begin(),
@@ -933,7 +951,17 @@ bool ModuleLinker::run() {
// Link in the function bodies that are defined in the source module into
// DstM.
for (Module::iterator SF = SrcM->begin(), E = SrcM->end(); SF != E; ++SF) {
- if (SF->isDeclaration()) continue; // No body if function is external.
+
+ // Skip if not linking from source.
+ if (DoNotLinkFromSource.count(SF)) continue;
+
+ // Skip if no body (function is external) or materialize.
+ if (SF->isDeclaration()) {
+ if (!SF->isMaterializable())
+ continue;
+ if (SF->Materialize(&ErrorMsg))
+ return true;
+ }
linkFunctionBody(cast<Function>(ValueMap[SF]), SF);
}
@@ -941,6 +969,11 @@ bool ModuleLinker::run() {
// Resolve all uses of aliases with aliasees.
linkAliasBodies();
+ // Remap all of the named mdnodes in Src into the DstM module. We do this
+ // after linking GlobalValues so that MDNodes that reference GlobalValues
+ // are properly remapped.
+ linkNamedMDNodes();
+
// Now that all of the types from the source are used, resolve any structs
// copied over to the dest that didn't exist there.
TypeMap.linkDefinedTypeBodies();
@@ -957,8 +990,9 @@ bool ModuleLinker::run() {
// error occurs, true is returned and ErrorMsg (if not null) is set to indicate
// the problem. Upon failure, the Dest module could be in a modified state, and
// shouldn't be relied on to be consistent.
-bool Linker::LinkModules(Module *Dest, Module *Src, std::string *ErrorMsg) {
- ModuleLinker TheLinker(Dest, Src);
+bool Linker::LinkModules(Module *Dest, Module *Src, unsigned Mode,
+ std::string *ErrorMsg) {
+ ModuleLinker TheLinker(Dest, Src, Mode);
if (TheLinker.run()) {
if (ErrorMsg) *ErrorMsg = TheLinker.ErrorMsg;
return true;
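LinkModules now takes a mode choosing between the old destructive splice and the new CloneFunctionInto copy that leaves Src intact. A caller-side sketch; only Linker::DestroySource appears in this hunk, and PreserveSource is the assumed counterpart:

    std::string ErrorMsg;
    if (Linker::LinkModules(Dest, Src, Linker::DestroySource, &ErrorMsg)) {
      errs() << "link failed: " << ErrorMsg << "\n";
      // Dest may be left partially modified on failure, per the comment above.
    }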
diff --git a/lib/Linker/Linker.cpp b/lib/Linker/Linker.cpp
index fba91da..59fbceb 100644
--- a/lib/Linker/Linker.cpp
+++ b/lib/Linker/Linker.cpp
@@ -141,6 +141,14 @@ static inline sys::Path IsLibrary(StringRef Name,
if (FullPath.isBitcodeFile()) // .so file containing bitcode?
return FullPath;
+ // Try libX form, to make it possible to add dependency on the
+ // specific version of .so, like liblzma.so.1.0.0
+ FullPath.eraseSuffix();
+ if (FullPath.isDynamicLibrary()) // Native shared library?
+ return FullPath;
+ if (FullPath.isBitcodeFile()) // .so file containing bitcode?
+ return FullPath;
+
// Not found .. fall through
// Indicate that the library was not found in the directory.
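The added block retries both checks after sys::Path::eraseSuffix(), so a dependency spelled with an explicit version resolves once the extension appended by the first probe is stripped again. The probe order, sketched for a hypothetical -llzma.so.1.0.0:

    sys::Path FullPath(Directory);
    FullPath.appendComponent("lib" + Name.str());
    FullPath.appendSuffix("so");               // -> liblzma.so.1.0.0.so
    if (FullPath.isDynamicLibrary() || FullPath.isBitcodeFile())
      return FullPath;
    FullPath.eraseSuffix();                    // -> liblzma.so.1.0.0
    if (FullPath.isDynamicLibrary() || FullPath.isBitcodeFile())
      return FullPath;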
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 22afa7e..a4ac1bf 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -1,25 +1,31 @@
add_llvm_library(LLVMMC
ELFObjectWriter.cpp
+ MCAsmBackend.cpp
MCAsmInfo.cpp
MCAsmInfoCOFF.cpp
MCAsmInfoDarwin.cpp
MCAsmStreamer.cpp
MCAssembler.cpp
+ MCAtom.cpp
MCCodeEmitter.cpp
+ MCCodeGenInfo.cpp
MCContext.cpp
MCDisassembler.cpp
+ MCDwarf.cpp
MCELF.cpp
MCELFObjectTargetWriter.cpp
MCELFStreamer.cpp
MCExpr.cpp
MCInst.cpp
MCInstPrinter.cpp
+ MCInstrAnalysis.cpp
MCLabel.cpp
- MCDwarf.cpp
MCLoggingStreamer.cpp
MCMachOStreamer.cpp
MCMachObjectTargetWriter.cpp
+ MCModule.cpp
MCNullStreamer.cpp
+ MCObjectFileInfo.cpp
MCObjectStreamer.cpp
MCObjectWriter.cpp
MCPureStreamer.cpp
@@ -30,13 +36,18 @@ add_llvm_library(LLVMMC
MCStreamer.cpp
MCSubtargetInfo.cpp
MCSymbol.cpp
+ MCTargetAsmLexer.cpp
MCValue.cpp
MCWin64EH.cpp
MachObjectWriter.cpp
- WinCOFFStreamer.cpp
- WinCOFFObjectWriter.cpp
SubtargetFeature.cpp
- TargetAsmBackend.cpp
+ WinCOFFObjectWriter.cpp
+ WinCOFFStreamer.cpp
+ )
+
+add_llvm_library_dependencies(LLVMMC
+ LLVMObject
+ LLVMSupport
)
add_subdirectory(MCParser)
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 59e1b8e..3d16de5 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -23,13 +24,13 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ELF.h"
-#include "llvm/Target/TargetAsmBackend.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
-#include "../Target/X86/X86FixupKinds.h"
-#include "../Target/ARM/ARMFixupKinds.h"
+#include "../Target/X86/MCTargetDesc/X86FixupKinds.h"
+#include "../Target/ARM/MCTargetDesc/ARMFixupKinds.h"
+#include "../Target/PowerPC/MCTargetDesc/PPCFixupKinds.h"
#include <vector>
using namespace llvm;
@@ -124,12 +125,12 @@ void ELFObjectWriter::WriteHeader(uint64_t SectionDataSize,
// e_shnum = # of section header ents
if (NumberOfSections >= ELF::SHN_LORESERVE)
- Write16(0);
+ Write16(ELF::SHN_UNDEF);
else
Write16(NumberOfSections);
// e_shstrndx = Section # of '.shstrtab'
- if (NumberOfSections >= ELF::SHN_LORESERVE)
+ if (ShstrtabIndex >= ELF::SHN_LORESERVE)
Write16(ELF::SHN_XINDEX);
else
Write16(ShstrtabIndex);
@@ -301,7 +302,8 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
if (Section.getType() == ELF::SHT_RELA ||
Section.getType() == ELF::SHT_REL ||
Section.getType() == ELF::SHT_STRTAB ||
- Section.getType() == ELF::SHT_SYMTAB)
+ Section.getType() == ELF::SHT_SYMTAB ||
+ Section.getType() == ELF::SHT_SYMTAB_SHNDX)
continue;
WriteSymbolEntry(SymtabF, ShndxF, 0, ELF::STT_SECTION, 0, 0,
ELF::STV_DEFAULT, SectionIndexMap.lookup(&Section), false);
@@ -447,8 +449,16 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
uint64_t RelocOffset = Layout.getFragmentOffset(Fragment) +
Fixup.getOffset();
+ adjustFixupOffset(Fixup, RelocOffset);
+
if (!hasRelocationAddend())
Addend = 0;
+
+ if (is64Bit())
+ assert(isInt<64>(Addend));
+ else
+ assert(isInt<32>(Addend));
+
ELFRelocationEntry ERE(RelocOffset, Index, Type, RelocSymbol, Addend);
Relocations[Fragment->getParent()].push_back(ERE);
}
@@ -656,6 +666,9 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
ExternalSymbolData[i].SymbolData->setIndex(Index++);
for (unsigned i = 0, e = UndefinedSymbolData.size(); i != e; ++i)
UndefinedSymbolData[i].SymbolData->setIndex(Index++);
+
+ if (NumRegularSections > ELF::SHN_LORESERVE)
+ NeedsSymtabShndx = true;
}
void ELFObjectWriter::CreateRelocationSections(MCAssembler &Asm,
@@ -992,11 +1005,10 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
// Nothing to do.
break;
- case ELF::SHT_GROUP: {
+ case ELF::SHT_GROUP:
sh_link = SymbolTableIndex;
sh_info = GroupSymbolIndex;
break;
- }
default:
assert(0 && "FIXME: sh_type value not supported!");
@@ -1224,7 +1236,7 @@ void ELFObjectWriter::WriteObject(MCAssembler &Asm,
FileOff = OS.tell();
- // ... and then the remainting sections ...
+ // ... and then the remaining sections ...
for (unsigned i = NumRegularSections + 1; i < NumSections; ++i)
WriteDataSectionData(Asm, Layout, *Sections[i]);
}
@@ -1252,6 +1264,11 @@ MCObjectWriter *llvm::createELFObjectWriter(MCELFObjectTargetWriter *MOTW,
return new ARMELFObjectWriter(MOTW, OS, IsLittleEndian); break;
case ELF::EM_MBLAZE:
return new MBlazeELFObjectWriter(MOTW, OS, IsLittleEndian); break;
+ case ELF::EM_PPC:
+ case ELF::EM_PPC64:
+ return new PPCELFObjectWriter(MOTW, OS, IsLittleEndian); break;
+ case ELF::EM_MIPS:
+ return new MipsELFObjectWriter(MOTW, OS, IsLittleEndian); break;
default: llvm_unreachable("Unsupported architecture"); break;
}
}
@@ -1503,6 +1520,76 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
return Type;
}
+//===- PPCELFObjectWriter -------------------------------------------===//
+
+PPCELFObjectWriter::PPCELFObjectWriter(MCELFObjectTargetWriter *MOTW,
+ raw_ostream &_OS,
+ bool IsLittleEndian)
+ : ELFObjectWriter(MOTW, _OS, IsLittleEndian) {
+}
+
+PPCELFObjectWriter::~PPCELFObjectWriter() {
+}
+
+unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel,
+ bool IsRelocWithSymbol,
+ int64_t Addend) {
+ // determine the type of the relocation
+ unsigned Type;
+ if (IsPCRel) {
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ llvm_unreachable("Unimplemented");
+ case PPC::fixup_ppc_br24:
+ Type = ELF::R_PPC_REL24;
+ break;
+ case FK_PCRel_4:
+ Type = ELF::R_PPC_REL32;
+ break;
+ }
+ } else {
+ switch ((unsigned)Fixup.getKind()) {
+ default: llvm_unreachable("invalid fixup kind!");
+ case PPC::fixup_ppc_br24:
+ Type = ELF::R_PPC_ADDR24;
+ break;
+ case PPC::fixup_ppc_brcond14:
+ Type = ELF::R_PPC_ADDR14_BRTAKEN; // XXX: or BRNTAKEN?_
+ break;
+ case PPC::fixup_ppc_ha16:
+ Type = ELF::R_PPC_ADDR16_HA;
+ break;
+ case PPC::fixup_ppc_lo16:
+ Type = ELF::R_PPC_ADDR16_LO;
+ break;
+ case PPC::fixup_ppc_lo14:
+ Type = ELF::R_PPC_ADDR14;
+ break;
+ case FK_Data_4:
+ Type = ELF::R_PPC_ADDR32;
+ break;
+ case FK_Data_2:
+ Type = ELF::R_PPC_ADDR16;
+ break;
+ }
+ }
+ return Type;
+}
+
+void
+PPCELFObjectWriter::adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) {
+ switch ((unsigned)Fixup.getKind()) {
+ case PPC::fixup_ppc_ha16:
+ case PPC::fixup_ppc_lo16:
+ RelocOffset += 2;
+ break;
+ default:
+ break;
+ }
+}
+
//===- MBlazeELFObjectWriter -------------------------------------------===//
MBlazeELFObjectWriter::MBlazeELFObjectWriter(MCELFObjectTargetWriter *MOTW,
@@ -1624,7 +1711,6 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
default: llvm_unreachable("invalid fixup kind!");
case FK_Data_8: Type = ELF::R_X86_64_64; break;
case X86::reloc_signed_4byte:
- assert(isInt<32>(Target.getConstant()));
switch (Modifier) {
default:
llvm_unreachable("Unimplemented");
@@ -1728,3 +1814,19 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
return Type;
}
+
+MipsELFObjectWriter::MipsELFObjectWriter(MCELFObjectTargetWriter *MOTW,
+ raw_ostream &_OS,
+ bool IsLittleEndian)
+ : ELFObjectWriter(MOTW, _OS, IsLittleEndian) {}
+
+MipsELFObjectWriter::~MipsELFObjectWriter() {}
+
+unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel,
+ bool IsRelocWithSymbol,
+ int64_t Addend) {
+ // tbd
+ return 1;
+}
diff --git a/lib/MC/ELFObjectWriter.h b/lib/MC/ELFObjectWriter.h
index 7593099..862b085 100644
--- a/lib/MC/ELFObjectWriter.h
+++ b/lib/MC/ELFObjectWriter.h
@@ -347,6 +347,7 @@ class ELFObjectWriter : public MCObjectWriter {
virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
bool IsPCRel, bool IsRelocWithSymbol,
int64_t Addend) = 0;
+ virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { }
};
//===- X86ELFObjectWriter -------------------------------------------===//
@@ -395,6 +396,22 @@ class ELFObjectWriter : public MCObjectWriter {
};
+ //===- PPCELFObjectWriter -------------------------------------------===//
+
+ class PPCELFObjectWriter : public ELFObjectWriter {
+ public:
+ PPCELFObjectWriter(MCELFObjectTargetWriter *MOTW,
+ raw_ostream &_OS,
+ bool IsLittleEndian);
+
+ virtual ~PPCELFObjectWriter();
+ protected:
+ virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, bool IsRelocWithSymbol,
+ int64_t Addend);
+ virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset);
+ };
+
//===- MBlazeELFObjectWriter -------------------------------------------===//
class MBlazeELFObjectWriter : public ELFObjectWriter {
@@ -409,6 +426,21 @@ class ELFObjectWriter : public MCObjectWriter {
bool IsPCRel, bool IsRelocWithSymbol,
int64_t Addend);
};
+
+ //===- MipsELFObjectWriter -------------------------------------------===//
+
+ class MipsELFObjectWriter : public ELFObjectWriter {
+ public:
+ MipsELFObjectWriter(MCELFObjectTargetWriter *MOTW,
+ raw_ostream &_OS,
+ bool IsLittleEndian);
+
+ virtual ~MipsELFObjectWriter();
+ protected:
+ virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel, bool IsRelocWithSymbol,
+ int64_t Addend);
+ };
}
#endif
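The header change follows the default-no-op virtual hook pattern: ELFObjectWriter gives every subclass a do-nothing adjustFixupOffset, and only targets whose immediates sit off the instruction start (PPC, here) override it. A generic sketch of the pattern with illustrative class names:

#include <cassert>
#include <cstdint>

struct WriterBase {
  virtual ~WriterBase() {}
  virtual void adjustFixupOffset(uint64_t &Off) {} // default: leave as-is
};

struct PPCLikeWriter : WriterBase {
  virtual void adjustFixupOffset(uint64_t &Off) { Off += 2; }
};

int main() {
  uint64_t Off = 0x10;
  PPCLikeWriter W;
  WriterBase &B = W;
  B.adjustFixupOffset(Off); // dispatches to the overriding target
  assert(Off == 0x12);
  return 0;
}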
diff --git a/lib/MC/TargetAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp
index 1927557..2c150f4 100644
--- a/lib/MC/TargetAsmBackend.cpp
+++ b/lib/MC/MCAsmBackend.cpp
@@ -1,4 +1,4 @@
-//===-- TargetAsmBackend.cpp - Target Assembly Backend ---------------------==//
+//===-- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------==//
//
// The LLVM Compiler Infrastructure
//
@@ -7,19 +7,19 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmBackend.h"
+#include "llvm/MC/MCAsmBackend.h"
using namespace llvm;
-TargetAsmBackend::TargetAsmBackend()
+MCAsmBackend::MCAsmBackend()
: HasReliableSymbolDifference(false)
{
}
-TargetAsmBackend::~TargetAsmBackend() {
+MCAsmBackend::~MCAsmBackend() {
}
const MCFixupKindInfo &
-TargetAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
static const MCFixupKindInfo Builtins[] = {
{ "FK_Data_1", 0, 8, 0 },
{ "FK_Data_2", 0, 16, 0 },
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 502b60b..95861bc 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -29,6 +29,7 @@ MCAsmInfo::MCAsmInfo() {
HasSubsectionsViaSymbols = false;
HasMachoZeroFillDirective = false;
HasMachoTBSSDirective = false;
+ StructorOutputOrder = Structors::ReversePriorityOrder;
HasStaticCtorDtorReferenceInStaticMode = false;
LinkerRequiresNonEmptyDwarfLines = false;
MaxInstLength = 4;
@@ -42,6 +43,9 @@ MCAsmInfo::MCAsmInfo() {
LinkerPrivateGlobalPrefix = "";
InlineAsmStart = "APP";
InlineAsmEnd = "NO_APP";
+ Code16Directive = ".code16";
+ Code32Directive = ".code32";
+ Code64Directive = ".code64";
AssemblerDialect = 0;
AllowQuotesInName = false;
AllowNameToStartWithDigit = false;
@@ -53,6 +57,12 @@ MCAsmInfo::MCAsmInfo() {
Data16bitsDirective = "\t.short\t";
Data32bitsDirective = "\t.long\t";
Data64bitsDirective = "\t.quad\t";
+ DataBegin = "$d.";
+ CodeBegin = "$a.";
+ JT8Begin = "$d.";
+ JT16Begin = "$d.";
+ JT32Begin = "$d.";
+ SupportsDataRegions = false;
SunStyleELFSectionSwitchSyntax = false;
UsesELFSectionDirectiveForBSS = false;
AlignDirective = "\t.align\t";
@@ -62,7 +72,7 @@ MCAsmInfo::MCAsmInfo() {
GlobalDirective = "\t.globl\t";
HasSetDirective = true;
HasAggressiveSymbolFolding = true;
- HasLCOMMDirective = false;
+ LCOMMDirectiveType = LCOMM::None;
COMMDirectiveAlignmentIsInBytes = true;
HasDotTypeDotSizeDirective = true;
HasSingleParameterDotFile = true;
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 7fc7d7a..434d910 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
MCAsmInfoCOFF::MCAsmInfoCOFF() {
GlobalPrefix = "_";
COMMDirectiveAlignmentIsInBytes = false;
- HasLCOMMDirective = true;
+ LCOMMDirectiveType = LCOMM::ByteAlignment;
HasDotTypeDotSizeDirective = false;
HasSingleParameterDotFile = false;
PrivateGlobalPrefix = "L"; // Prefix for private global symbols
@@ -27,11 +27,14 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
LinkOnceDirective = "\t.linkonce discard\n";
// Doesn't support visibility:
- HiddenVisibilityAttr = ProtectedVisibilityAttr = MCSA_Invalid;
+ HiddenVisibilityAttr = HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+ ProtectedVisibilityAttr = MCSA_Invalid;
// Set up DWARF directives
HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
SupportsDebugInformation = true;
DwarfSectionOffsetDirective = "\t.secrel32\t";
HasMicrosoftFastStdCallMangling = true;
+
+ SupportsDataRegions = false;
}
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index 5851cb0..b20e338 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -39,8 +39,16 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
ZeroDirective = "\t.space\t"; // ".space N" emits N zeros.
HasMachoZeroFillDirective = true; // Uses .zerofill
HasMachoTBSSDirective = true; // Uses .tbss
+ StructorOutputOrder = Structors::PriorityOrder;
HasStaticCtorDtorReferenceInStaticMode = true;
+ CodeBegin = "L$start$code$";
+ DataBegin = "L$start$data$";
+ JT8Begin = "L$start$jt8$";
+ JT16Begin = "L$start$jt16$";
+ JT32Begin = "L$start$jt32$";
+ SupportsDataRegions = true;
+
// FIXME: Darwin 10 and newer don't need this.
LinkerRequiresNonEmptyDwarfLines = true;
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index d5d08e8..3fcbb05 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -15,8 +15,12 @@
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -25,9 +29,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetAsmBackend.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include <cctype>
using namespace llvm;
@@ -40,7 +41,7 @@ protected:
private:
OwningPtr<MCInstPrinter> InstPrinter;
OwningPtr<MCCodeEmitter> Emitter;
- OwningPtr<TargetAsmBackend> AsmBackend;
+ OwningPtr<MCAsmBackend> AsmBackend;
SmallString<128> CommentToEmit;
raw_svector_ostream CommentStream;
@@ -63,7 +64,7 @@ public:
MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
bool isVerboseAsm, bool useLoc, bool useCFI,
MCInstPrinter *printer, MCCodeEmitter *emitter,
- TargetAsmBackend *asmbackend,
+ MCAsmBackend *asmbackend,
bool showInst)
: MCStreamer(Context), OS(os), MAI(Context.getAsmInfo()),
InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
@@ -157,7 +158,9 @@ public:
///
/// @param Symbol - The common symbol to emit.
/// @param Size - The size of the common symbol.
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size);
+ /// @param ByteAlignment - The alignment of the common symbol in bytes.
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
unsigned Size = 0, unsigned ByteAlignment = 0);
@@ -334,8 +337,9 @@ void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
default: assert(0 && "Invalid flag!");
case MCAF_SyntaxUnified: OS << "\t.syntax unified"; break;
case MCAF_SubsectionsViaSymbols: OS << ".subsections_via_symbols"; break;
- case MCAF_Code16: OS << "\t.code\t16"; break;
- case MCAF_Code32: OS << "\t.code\t32"; break;
+ case MCAF_Code16: OS << '\t'<< MAI.getCode16Directive(); break;
+ case MCAF_Code32: OS << '\t'<< MAI.getCode32Directive(); break;
+ case MCAF_Code64: OS << '\t'<< MAI.getCode64Directive(); break;
}
EmitEOL();
}
@@ -482,9 +486,16 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
///
/// @param Symbol - The common symbol to emit.
/// @param Size - The size of the common symbol.
-void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {
- assert(MAI.hasLCOMMDirective() && "Doesn't have .lcomm, can't emit it!");
+void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlign) {
+ assert(MAI.getLCOMMDirectiveType() != LCOMM::None &&
+ "Doesn't have .lcomm, can't emit it!");
OS << "\t.lcomm\t" << *Symbol << ',' << Size;
+ if (ByteAlign > 1) {
+ assert(MAI.getLCOMMDirectiveType() == LCOMM::ByteAlignment &&
+ "Alignment not supported on .lcomm!");
+ OS << ',' << ByteAlign;
+ }
EmitEOL();
}
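A minimal self-checking sketch of the .lcomm emission logic just shown, modeling only the two LCOMM states that appear in this patch (the real enum may carry more; the helper name is illustrative):

#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>

namespace LCOMM { enum LCOMMType { None, ByteAlignment }; }

static std::string emitLComm(LCOMM::LCOMMType T, const std::string &Sym,
                             uint64_t Size, unsigned ByteAlign) {
  assert(T != LCOMM::None && "Doesn't have .lcomm, can't emit it!");
  std::ostringstream OS;
  OS << "\t.lcomm\t" << Sym << ',' << Size;
  if (ByteAlign > 1) {
    assert(T == LCOMM::ByteAlignment && "Alignment not supported on .lcomm!");
    OS << ',' << ByteAlign;
  }
  return OS.str();
}

int main() {
  // COFF sets LCOMM::ByteAlignment, so an alignment may follow the size.
  assert(emitLComm(LCOMM::ByteAlignment, "sym", 8, 4) == "\t.lcomm\tsym,8,4");
  return 0;
}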
@@ -827,8 +838,8 @@ void MCAsmStreamer::EmitCFIEndProc() {
void MCAsmStreamer::EmitRegisterName(int64_t Register) {
if (InstPrinter && !MAI.useDwarfRegNumForCFI()) {
- const TargetAsmInfo &TAI = getContext().getTargetAsmInfo();
- unsigned LLVMRegister = TAI.getLLVMRegNum(Register, true);
+ const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+ unsigned LLVMRegister = MRI.getLLVMRegNum(Register, true);
InstPrinter->printRegName(OS, LLVMRegister);
} else {
OS << Register;
@@ -994,6 +1005,19 @@ void MCAsmStreamer::EmitWin64EHHandler(const MCSymbol *Sym, bool Unwind,
EmitEOL();
}
+static const MCSection *getWin64EHTableSection(StringRef suffix,
+ MCContext &context) {
+ // FIXME: This doesn't belong in MCObjectFileInfo. However, it
+ // duplicates code in MCWin64EH.cpp.
+ if (suffix.empty())
+ return context.getObjectFileInfo()->getXDataSection();
+ return context.getCOFFSection((".xdata"+suffix).str(),
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+}
+
void MCAsmStreamer::EmitWin64EHHandlerData() {
MCStreamer::EmitWin64EHHandlerData();
@@ -1003,8 +1027,7 @@ void MCAsmStreamer::EmitWin64EHHandlerData() {
// data block is visible.
MCWin64EHUnwindInfo *CurFrame = getCurrentW64UnwindInfo();
StringRef suffix=MCWin64EHUnwindEmitter::GetSectionSuffix(CurFrame->Function);
- const MCSection *xdataSect =
- getContext().getTargetAsmInfo().getWin64EHTableSection(suffix);
+ const MCSection *xdataSect = getWin64EHTableSection(suffix, getContext());
if (xdataSect)
SwitchSectionNoChange(xdataSect);
@@ -1221,7 +1244,7 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
// If we have an AsmPrinter, use that to print, otherwise print the MCInst.
if (InstPrinter)
- InstPrinter->printInst(&Inst, OS);
+ InstPrinter->printInst(&Inst, OS, "");
else
Inst.print(OS, &MAI);
EmitEOL();
@@ -1249,8 +1272,8 @@ MCStreamer *llvm::createAsmStreamer(MCContext &Context,
formatted_raw_ostream &OS,
bool isVerboseAsm, bool useLoc,
bool useCFI, MCInstPrinter *IP,
- MCCodeEmitter *CE, TargetAsmBackend *TAB,
+ MCCodeEmitter *CE, MCAsmBackend *MAB,
bool ShowInst) {
return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, useCFI,
- IP, CE, TAB, ShowInst);
+ IP, CE, MAB, ShowInst);
}
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 527a63c..06c8aec 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -25,8 +26,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmBackend.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -194,7 +194,7 @@ MCSymbolData::MCSymbolData(const MCSymbol &_Symbol, MCFragment *_Fragment,
/* *** */
-MCAssembler::MCAssembler(MCContext &Context_, TargetAsmBackend &Backend_,
+MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_,
MCCodeEmitter &Emitter_, MCObjectWriter &Writer_,
raw_ostream &OS_)
: Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_),
diff --git a/lib/MC/MCAtom.cpp b/lib/MC/MCAtom.cpp
new file mode 100644
index 0000000..d714443
--- /dev/null
+++ b/lib/MC/MCAtom.cpp
@@ -0,0 +1,97 @@
+//===- lib/MC/MCAtom.cpp - MCAtom implementation --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCModule.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+void MCAtom::addInst(const MCInst &I, uint64_t Address, unsigned Size) {
+ assert(Type == TextAtom && "Trying to add MCInst to a non-text atom!");
+
+ assert(Address < End+Size &&
+ "Instruction not contiguous with end of atom!");
+ if (Address > End)
+ Parent->remap(this, Begin, End+Size);
+
+ Text.push_back(std::make_pair(Address, I));
+}
+
+void MCAtom::addData(const MCData &D) {
+ assert(Type == DataAtom && "Trying to add MCData to a non-data atom!");
+ Parent->remap(this, Begin, End+1);
+
+ Data.push_back(D);
+}
+
+MCAtom *MCAtom::split(uint64_t SplitPt) {
+ assert((SplitPt > Begin && SplitPt <= End) &&
+ "Splitting at point not contained in atom!");
+
+ // Compute the new begin/end points.
+ uint64_t LeftBegin = Begin;
+ uint64_t LeftEnd = SplitPt - 1;
+ uint64_t RightBegin = SplitPt;
+ uint64_t RightEnd = End;
+
+ // Remap this atom to become the lower of the two new ones.
+ Parent->remap(this, LeftBegin, LeftEnd);
+
+ // Create a new atom for the higher atom.
+ MCAtom *RightAtom = Parent->createAtom(Type, RightBegin, RightEnd);
+
+ // Split the contents of the original atom between it and the new one. The
+ // precise method depends on whether this is a data or a text atom.
+ if (isDataAtom()) {
+ std::vector<MCData>::iterator I = Data.begin() + (RightBegin - LeftBegin);
+
+ assert(I != Data.end() && "Split point not found in range!");
+
+ RightAtom->Data.insert(RightAtom->Data.end(), I, Data.end());
+ Data.erase(I, Data.end());
+ } else if (isTextAtom()) {
+ std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
+
+ while (I != Text.end() && I->first < SplitPt) ++I;
+
+ assert(I != Text.end() && "Split point not found in disassembly!");
+ assert(I->first == SplitPt &&
+ "Split point does not fall on instruction boundary!");
+
+ RightAtom->Text.insert(RightAtom->Text.end(), I, Text.end());
+ Text.erase(I, Text.end());
+ } else
+ llvm_unreachable("Unknown atom type!");
+
+ return RightAtom;
+}
+
+void MCAtom::truncate(uint64_t TruncPt) {
+ assert((TruncPt >= Begin && TruncPt < End) &&
+ "Truncation point not contained in atom!");
+
+ Parent->remap(this, Begin, TruncPt);
+
+ if (isDataAtom()) {
+ Data.resize(TruncPt - Begin + 1);
+ } else if (isTextAtom()) {
+ std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
+
+ while (I != Text.end() && I->first <= TruncPt) ++I;
+
+ assert(I != Text.end() && "Truncation point not found in disassembly!");
+ assert(I->first == TruncPt+1 &&
+ "Truncation point does not fall on instruction boundary");
+
+ Text.erase(I, Text.end());
+ } else
+ llvm_unreachable("Unknown atom type!");
+}
+
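The endpoint math in MCAtom::split is worth pinning down: both ranges are inclusive, so splitting [Begin,End] at SplitPt yields [Begin,SplitPt-1] and [SplitPt,End]. A toy self-check of just that arithmetic (names are illustrative, not the LLVM classes):

#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<std::pair<uint64_t,uint64_t>, std::pair<uint64_t,uint64_t> >
splitRange(uint64_t Begin, uint64_t End, uint64_t SplitPt) {
  assert(SplitPt > Begin && SplitPt <= End);
  return std::make_pair(std::make_pair(Begin, SplitPt - 1),
                        std::make_pair(SplitPt, End));
}

int main() {
  // Splitting a 16-byte atom [0x100,0x10f] at 0x108:
  std::pair<std::pair<uint64_t,uint64_t>, std::pair<uint64_t,uint64_t> > R =
      splitRange(0x100, 0x10f, 0x108);
  assert(R.first.second == 0x107 && R.second.first == 0x108);
  return 0;
}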
diff --git a/lib/MC/MCCodeGenInfo.cpp b/lib/MC/MCCodeGenInfo.cpp
new file mode 100644
index 0000000..236e7de
--- /dev/null
+++ b/lib/MC/MCCodeGenInfo.cpp
@@ -0,0 +1,21 @@
+//===-- MCCodeGenInfo.cpp - Target CodeGen Info -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file tracks information about the target which can affect codegen,
+// asm parsing, and asm printing. For example, relocation model.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCCodeGenInfo.h"
+using namespace llvm;
+
+void MCCodeGenInfo::InitMCCodeGenInfo(Reloc::Model RM, CodeModel::Model CM) {
+ RelocationModel = RM;
+ CMModel = CM;
+}
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 8faa72e..82690ee 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -9,13 +9,14 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCLabel.h"
#include "llvm/MC/MCDwarf.h"
-#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ELF.h"
@@ -26,8 +27,9 @@ typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
-MCContext::MCContext(const MCAsmInfo &mai, const TargetAsmInfo *tai) :
- MAI(mai), TAI(tai),
+MCContext::MCContext(const MCAsmInfo &mai, const MCRegisterInfo &mri,
+ const MCObjectFileInfo *mofi) :
+ MAI(mai), MRI(mri), MOFI(mofi),
Allocator(), Symbols(Allocator), UsedNames(Allocator),
NextUniqueID(0),
CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0),
@@ -54,8 +56,6 @@ MCContext::~MCContext() {
// If the stream for the .secure_log_unique directive was created free it.
delete (raw_ostream*)SecureLog;
-
- delete TAI;
}
//===----------------------------------------------------------------------===//
@@ -279,7 +279,8 @@ unsigned MCContext::GetDwarfFile(StringRef FileName, unsigned FileNumber) {
} else {
StringRef Directory = Slash.first;
Name = Slash.second;
- for (DirIndex = 0; DirIndex < MCDwarfDirs.size(); DirIndex++) {
+ DirIndex = 0;
+ for (unsigned End = MCDwarfDirs.size(); DirIndex < End; DirIndex++) {
if (Directory == MCDwarfDirs[DirIndex])
break;
}
diff --git a/lib/MC/MCDisassembler/CMakeLists.txt b/lib/MC/MCDisassembler/CMakeLists.txt
index 0ce359d..4debb28 100644
--- a/lib/MC/MCDisassembler/CMakeLists.txt
+++ b/lib/MC/MCDisassembler/CMakeLists.txt
@@ -1,4 +1,3 @@
-
add_llvm_library(LLVMMCDisassembler
Disassembler.cpp
EDDisassembler.cpp
@@ -6,3 +5,26 @@ add_llvm_library(LLVMMCDisassembler
EDOperand.cpp
EDToken.cpp
)
+
+add_llvm_library_dependencies(LLVMMCDisassembler
+ LLVMMC
+ LLVMMCParser
+ LLVMSupport
+ LLVMTarget
+ )
+
+foreach(t ${LLVM_TARGETS_TO_BUILD})
+ set(td ${LLVM_MAIN_SRC_DIR}/lib/Target/${t})
+ if(EXISTS ${td}/TargetInfo/CMakeLists.txt)
+ add_llvm_library_dependencies(LLVMMCDisassembler "LLVM${t}Info")
+ endif()
+ if(EXISTS ${td}/MCTargetDesc/CMakeLists.txt)
+ add_llvm_library_dependencies(LLVMMCDisassembler "LLVM${t}Desc")
+ endif()
+ if(EXISTS ${td}/AsmParser/CMakeLists.txt)
+ add_llvm_library_dependencies(LLVMMCDisassembler "LLVM${t}AsmParser")
+ endif()
+ if(EXISTS ${td}/Disassembler/CMakeLists.txt)
+ add_llvm_library_dependencies(LLVMMCDisassembler "LLVM${t}Disassembler")
+ endif()
+endforeach(t)
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index 5480b4b..16e66dc 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -1,4 +1,4 @@
-//===-- lib/MC/Disassembler.cpp - Disassembler Public C Interface -*- C -*-===//
+//===-- lib/MC/Disassembler.cpp - Disassembler Public C Interface ---------===//
//
// The LLVM Compiler Infrastructure
//
@@ -11,15 +11,14 @@
#include "llvm-c/Disassembler.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmInfo.h" // FIXME.
-#include "llvm/Target/TargetMachine.h" // FIXME.
-#include "llvm/Target/TargetSelect.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
namespace llvm {
class Target;
@@ -38,10 +37,7 @@ LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
LLVMSymbolLookupCallback SymbolLookUp) {
// Initialize targets and assembly printers/parsers.
llvm::InitializeAllTargetInfos();
- // FIXME: We shouldn't need to initialize the Target(Machine)s.
- llvm::InitializeAllTargets();
- llvm::InitializeAllMCAsmInfos();
- llvm::InitializeAllAsmPrinters();
+ llvm::InitializeAllTargetMCs();
llvm::InitializeAllAsmParsers();
llvm::InitializeAllDisassemblers();
@@ -54,41 +50,38 @@ LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
const MCAsmInfo *MAI = TheTarget->createMCAsmInfo(TripleName);
assert(MAI && "Unable to create target asm info!");
+ const MCRegisterInfo *MRI = TheTarget->createMCRegInfo(TripleName);
+ assert(MRI && "Unable to create target register info!");
+
// Package up features to be passed to target/subtarget
std::string FeaturesStr;
std::string CPU;
- // FIXME: We shouldn't need to do this (and link in codegen).
- // When we split this out, we should do it in a way that makes
- // it straightforward to switch subtargets on the fly.
- TargetMachine *TM = TheTarget->createTargetMachine(TripleName, CPU,
- FeaturesStr);
- assert(TM && "Unable to create target machine!");
-
- // Get the target assembler info needed to setup the context.
- const TargetAsmInfo *tai = new TargetAsmInfo(*TM);
- assert(tai && "Unable to create target assembler!");
+ const MCSubtargetInfo *STI = TheTarget->createMCSubtargetInfo(TripleName, CPU,
+ FeaturesStr);
+ assert(STI && "Unable to create subtarget info!");
// Set up the MCContext for creating symbols and MCExpr's.
- MCContext *Ctx = new MCContext(*MAI, tai);
+ MCContext *Ctx = new MCContext(*MAI, *MRI, 0);
assert(Ctx && "Unable to create MCContext!");
// Set up disassembler.
- MCDisassembler *DisAsm = TheTarget->createMCDisassembler();
+ MCDisassembler *DisAsm = TheTarget->createMCDisassembler(*STI);
assert(DisAsm && "Unable to create disassembler!");
- DisAsm->setupForSymbolicDisassembly(GetOpInfo, DisInfo, Ctx);
+ DisAsm->setupForSymbolicDisassembly(GetOpInfo, SymbolLookUp, DisInfo, Ctx);
// Set up the instruction printer.
int AsmPrinterVariant = MAI->getAssemblerDialect();
MCInstPrinter *IP = TheTarget->createMCInstPrinter(AsmPrinterVariant,
- *MAI);
+ *MAI, *STI);
assert(IP && "Unable to create instruction printer!");
LLVMDisasmContext *DC = new LLVMDisasmContext(TripleName, DisInfo, TagType,
GetOpInfo, SymbolLookUp,
- TheTarget, MAI, TM, tai, Ctx,
- DisAsm, IP);
+ TheTarget, MAI, MRI,
+ Ctx, DisAsm, IP);
assert(DC && "Allocation failure!");
+
return DC;
}
@@ -147,18 +140,35 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
MCInst Inst;
const MCDisassembler *DisAsm = DC->getDisAsm();
MCInstPrinter *IP = DC->getIP();
- if (!DisAsm->getInstruction(Inst, Size, MemoryObject, PC, /*REMOVE*/ nulls()))
+ MCDisassembler::DecodeStatus S;
+ S = DisAsm->getInstruction(Inst, Size, MemoryObject, PC,
+ /*REMOVE*/ nulls(), DC->CommentStream);
+ switch (S) {
+ case MCDisassembler::Fail:
+ case MCDisassembler::SoftFail:
+ // FIXME: Do something different for soft failure modes?
return 0;
- SmallVector<char, 64> InsnStr;
- raw_svector_ostream OS(InsnStr);
- IP->printInst(&Inst, OS);
- OS.flush();
+ case MCDisassembler::Success: {
+ DC->CommentStream.flush();
+ StringRef Comments = DC->CommentsToEmit.str();
- assert(OutStringSize != 0 && "Output buffer cannot be zero size");
- size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
- std::memcpy(OutString, InsnStr.data(), OutputSize);
- OutString[OutputSize] = '\0'; // Terminate string.
+ SmallVector<char, 64> InsnStr;
+ raw_svector_ostream OS(InsnStr);
+ IP->printInst(&Inst, OS, Comments);
+ OS.flush();
- return Size;
+ // Tell the comment stream that the vector changed underneath it.
+ DC->CommentsToEmit.clear();
+ DC->CommentStream.resync();
+
+ assert(OutStringSize != 0 && "Output buffer cannot be zero size");
+ size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
+ std::memcpy(OutString, InsnStr.data(), OutputSize);
+ OutString[OutputSize] = '\0'; // Terminate string.
+
+ return Size;
+ }
+ }
+ return 0;
}
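A usage sketch of the C API this file implements, assuming an LLVM build with the X86 target registered (LLVMCreateDisasm initializes the targets itself, as shown above); the two bytes decode as nop and ret in x86-64:

#include "llvm-c/Disassembler.h"
#include <cstdint>
#include <cstdio>

int main() {
  LLVMDisasmContextRef DC =
      LLVMCreateDisasm("x86_64-unknown-linux-gnu", 0, 0, 0, 0);
  if (!DC) return 1;
  uint8_t Bytes[] = { 0x90, 0xc3 }; // nop; ret
  char Text[64];
  size_t Pos = 0;
  while (Pos < sizeof(Bytes)) {
    size_t N = LLVMDisasmInstruction(DC, Bytes + Pos, sizeof(Bytes) - Pos,
                                     Pos, Text, sizeof(Text));
    if (!N) break; // decode failure
    std::printf("0x%llx:%s\n", (unsigned long long)Pos, Text);
    Pos += N;
  }
  LLVMDisasmDispose(DC);
  return 0;
}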
diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index f0ec42a..238ff7d 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h
@@ -20,15 +20,16 @@
#include "llvm-c/Disassembler.h"
#include <string>
#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
-class TargetAsmInfo;
class MCContext;
class MCAsmInfo;
class MCDisassembler;
class MCInstPrinter;
+class MCRegisterInfo;
class Target;
-class TargetMachine;
//
// This is the disassembler context returned by LLVMCreateDisasm().
@@ -58,12 +59,8 @@ private:
const Target *TheTarget;
// The assembly information for the target architecture.
llvm::OwningPtr<const llvm::MCAsmInfo> MAI;
- // The target machine instance.
- llvm::OwningPtr<llvm::TargetMachine> TM;
- // The disassembler for the target architecture.
- // FIXME: using llvm::OwningPtr<const llvm::TargetAsmInfo> causes a malloc
- // error when this LLVMDisasmContext is deleted.
- const TargetAsmInfo *Tai;
+ // The register information for the target architecture.
+ llvm::OwningPtr<const llvm::MCRegisterInfo> MRI;
// The assembly context for creating symbols and MCExprs.
llvm::OwningPtr<const llvm::MCContext> Ctx;
// The disassembler for the target architecture.
@@ -72,22 +69,28 @@ private:
llvm::OwningPtr<llvm::MCInstPrinter> IP;
public:
+ // Comment stream and backing vector.
+ SmallString<128> CommentsToEmit;
+ raw_svector_ostream CommentStream;
+
LLVMDisasmContext(std::string tripleName, void *disInfo, int tagType,
LLVMOpInfoCallback getOpInfo,
LLVMSymbolLookupCallback symbolLookUp,
const Target *theTarget, const MCAsmInfo *mAI,
- llvm::TargetMachine *tM, const TargetAsmInfo *tai,
+ const MCRegisterInfo *mRI,
llvm::MCContext *ctx, const MCDisassembler *disAsm,
MCInstPrinter *iP) : TripleName(tripleName),
DisInfo(disInfo), TagType(tagType), GetOpInfo(getOpInfo),
- SymbolLookUp(symbolLookUp), TheTarget(theTarget), Tai(tai) {
- TM.reset(tM);
+ SymbolLookUp(symbolLookUp), TheTarget(theTarget),
+ CommentStream(CommentsToEmit) {
MAI.reset(mAI);
+ MRI.reset(mRI);
Ctx.reset(ctx);
DisAsm.reset(disAsm);
IP.reset(iP);
}
const MCDisassembler *getDisAsm() const { return DisAsm.get(); }
+ const MCAsmInfo *getAsmInfo() const { return MAI.get(); }
MCInstPrinter *getIP() { return IP.get(); }
};
diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp
index bdd99af..83362a2 100644
--- a/lib/MC/MCDisassembler/EDDisassembler.cpp
+++ b/lib/MC/MCDisassembler/EDDisassembler.cpp
@@ -22,20 +22,19 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Target/TargetAsmLexer.h"
-#include "llvm/Target/TargetAsmParser.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSelect.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
using namespace llvm;
bool EDDisassembler::sInitialized = false;
@@ -106,9 +105,7 @@ void EDDisassembler::initialize() {
sInitialized = true;
InitializeAllTargetInfos();
- InitializeAllTargets();
- InitializeAllMCAsmInfos();
- InitializeAllAsmPrinters();
+ InitializeAllTargetMCs();
InitializeAllAsmParsers();
InitializeAllDisassemblers();
}
@@ -169,24 +166,24 @@ EDDisassembler::EDDisassembler(CPUKey &key) :
if (!Tgt)
return;
- std::string CPU;
- std::string featureString;
- TargetMachine.reset(Tgt->createTargetMachine(tripleString, CPU,
- featureString));
+ MRI.reset(Tgt->createMCRegInfo(tripleString));
- const TargetRegisterInfo *registerInfo = TargetMachine->getRegisterInfo();
-
- if (!registerInfo)
+ if (!MRI)
return;
-
- initMaps(*registerInfo);
+
+ initMaps(*MRI);
AsmInfo.reset(Tgt->createMCAsmInfo(tripleString));
if (!AsmInfo)
return;
- Disassembler.reset(Tgt->createMCDisassembler());
+ STI.reset(Tgt->createMCSubtargetInfo(tripleString, "", ""));
+
+ if (!STI)
+ return;
+
+ Disassembler.reset(Tgt->createMCDisassembler(*STI));
if (!Disassembler)
return;
@@ -195,16 +192,16 @@ EDDisassembler::EDDisassembler(CPUKey &key) :
InstString.reset(new std::string);
InstStream.reset(new raw_string_ostream(*InstString));
- InstPrinter.reset(Tgt->createMCInstPrinter(LLVMSyntaxVariant, *AsmInfo));
+ InstPrinter.reset(Tgt->createMCInstPrinter(LLVMSyntaxVariant, *AsmInfo, *STI));
if (!InstPrinter)
return;
GenericAsmLexer.reset(new AsmLexer(*AsmInfo));
- SpecificAsmLexer.reset(Tgt->createAsmLexer(*AsmInfo));
+ SpecificAsmLexer.reset(Tgt->createMCAsmLexer(*MRI, *AsmInfo));
SpecificAsmLexer->InstallLexer(*GenericAsmLexer);
- initMaps(*TargetMachine->getRegisterInfo());
+ initMaps(*MRI);
Valid = true;
}
@@ -247,14 +244,17 @@ EDInst *EDDisassembler::createInst(EDByteReaderCallback byteReader,
MCInst* inst = new MCInst;
uint64_t byteSize;
- if (!Disassembler->getInstruction(*inst,
- byteSize,
- memoryObject,
- address,
- ErrorStream)) {
+ MCDisassembler::DecodeStatus S;
+ S = Disassembler->getInstruction(*inst, byteSize, memoryObject, address,
+ ErrorStream, nulls());
+ switch (S) {
+ case MCDisassembler::Fail:
+ case MCDisassembler::SoftFail:
+ // FIXME: Do something different on soft failure mode?
delete inst;
return NULL;
- } else {
+
+ case MCDisassembler::Success: {
const llvm::EDInstInfo *thisInstInfo = NULL;
if (InstInfos) {
@@ -264,9 +264,11 @@ EDInst *EDDisassembler::createInst(EDByteReaderCallback byteReader,
EDInst* sdInst = new EDInst(inst, byteSize, *this, thisInstInfo);
return sdInst;
}
+ }
+ return NULL;
}
-void EDDisassembler::initMaps(const TargetRegisterInfo &registerInfo) {
+void EDDisassembler::initMaps(const MCRegisterInfo &registerInfo) {
unsigned numRegisters = registerInfo.getNumRegs();
unsigned registerIndex;
@@ -325,7 +327,7 @@ bool EDDisassembler::registerIsProgramCounter(unsigned registerID) {
int EDDisassembler::printInst(std::string &str, MCInst &inst) {
PrinterMutex.acquire();
- InstPrinter->printInst(&inst, *InstStream);
+ InstPrinter->printInst(&inst, *InstStream, "");
InstStream->flush();
str = *InstString;
InstString->clear();
@@ -368,16 +370,16 @@ int EDDisassembler::parseInst(SmallVectorImpl<MCParsedAsmOperand*> &operands,
SourceMgr sourceMgr;
sourceMgr.setDiagHandler(diag_handler, static_cast<void*>(this));
sourceMgr.AddNewSourceBuffer(buf, SMLoc()); // ownership of buf handed over
- MCContext context(*AsmInfo, NULL);
+ MCContext context(*AsmInfo, *MRI, NULL);
OwningPtr<MCStreamer> streamer(createNullStreamer(context));
- OwningPtr<MCAsmParser> genericParser(createMCAsmParser(*Tgt, sourceMgr,
+ OwningPtr<MCAsmParser> genericParser(createMCAsmParser(sourceMgr,
context, *streamer,
*AsmInfo));
StringRef triple = tripleFromArch(Key.Arch);
OwningPtr<MCSubtargetInfo> STI(Tgt->createMCSubtargetInfo(triple, "", ""));
- OwningPtr<TargetAsmParser> TargetParser(Tgt->createAsmParser(*STI,
- *genericParser));
+ OwningPtr<MCTargetAsmParser>
+ TargetParser(Tgt->createMCAsmParser(*STI, *genericParser));
AsmToken OpcodeToken = genericParser->Lex();
AsmToken NextToken = genericParser->Lex(); // consume next token, because specificParser expects us to
diff --git a/lib/MC/MCDisassembler/EDDisassembler.h b/lib/MC/MCDisassembler/EDDisassembler.h
index 11d69c1..38c2203 100644
--- a/lib/MC/MCDisassembler/EDDisassembler.h
+++ b/lib/MC/MCDisassembler/EDDisassembler.h
@@ -29,24 +29,23 @@
namespace llvm {
class AsmLexer;
+class AsmParser;
class AsmToken;
class MCContext;
class MCAsmInfo;
class MCAsmLexer;
-class AsmParser;
-class TargetAsmLexer;
-class TargetAsmParser;
class MCDisassembler;
class MCInstPrinter;
class MCInst;
class MCParsedAsmOperand;
+class MCRegisterInfo;
class MCStreamer;
class MCSubtargetInfo;
+class MCTargetAsmLexer;
+class MCTargetAsmParser;
template <typename T> class SmallVectorImpl;
class SourceMgr;
class Target;
-class TargetMachine;
-class TargetRegisterInfo;
struct EDInstInfo;
struct EDInst;
@@ -136,10 +135,12 @@ struct EDDisassembler {
CPUKey Key;
/// The LLVM target corresponding to the disassembler
const llvm::Target *Tgt;
- /// The target machine instance.
- llvm::OwningPtr<llvm::TargetMachine> TargetMachine;
/// The assembly information for the target architecture
llvm::OwningPtr<const llvm::MCAsmInfo> AsmInfo;
+ /// The subtarget information for the target architecture
+ llvm::OwningPtr<const llvm::MCSubtargetInfo> STI;
+ // The register information for the target architecture.
+ llvm::OwningPtr<const llvm::MCRegisterInfo> MRI;
/// The disassembler for the target architecture
llvm::OwningPtr<const llvm::MCDisassembler> Disassembler;
/// The output string for the instruction printer; must be guarded with
@@ -160,7 +161,7 @@ struct EDDisassembler {
/// The target-specific lexer for use in tokenizing strings, in
/// target-independent and target-specific portions
llvm::OwningPtr<llvm::AsmLexer> GenericAsmLexer;
- llvm::OwningPtr<llvm::TargetAsmLexer> SpecificAsmLexer;
+ llvm::OwningPtr<llvm::MCTargetAsmLexer> SpecificAsmLexer;
/// The guard for the above
llvm::sys::Mutex ParserMutex;
/// The LLVM number used for the target disassembly syntax variant
@@ -216,7 +217,7 @@ struct EDDisassembler {
/// info
///
/// @arg registerInfo - the register information to use as a source
- void initMaps(const llvm::TargetRegisterInfo &registerInfo);
+ void initMaps(const llvm::MCRegisterInfo &registerInfo);
/// nameWithRegisterID - Returns the name (owned by the EDDisassembler) of a
/// register for a given register ID, or NULL on failure
///
diff --git a/lib/MC/MCDisassembler/EDInst.h b/lib/MC/MCDisassembler/EDInst.h
index ceb9505..6b78dc8 100644
--- a/lib/MC/MCDisassembler/EDInst.h
+++ b/lib/MC/MCDisassembler/EDInst.h
@@ -73,7 +73,7 @@ struct EDInst {
std::string String;
/// The order in which operands from the InstInfo's operand information appear
/// in String
- const char* OperandOrder;
+ const signed char* OperandOrder;
/// The result of the parseOperands() function
CachedResult ParseResult;
diff --git a/lib/MC/MCDisassembler/EDToken.cpp b/lib/MC/MCDisassembler/EDToken.cpp
index de770b4..5f6c9df 100644
--- a/lib/MC/MCDisassembler/EDToken.cpp
+++ b/lib/MC/MCDisassembler/EDToken.cpp
@@ -87,14 +87,18 @@ int EDToken::registerID(unsigned &registerID) const {
int EDToken::tokenize(std::vector<EDToken*> &tokens,
std::string &str,
- const char *operandOrder,
+ const signed char *operandOrder,
EDDisassembler &disassembler) {
SmallVector<MCParsedAsmOperand*, 5> parsedOperands;
SmallVector<AsmToken, 10> asmTokens;
if (disassembler.parseInst(parsedOperands, asmTokens, str))
+ {
+ for (unsigned i = 0, e = parsedOperands.size(); i != e; ++i)
+ delete parsedOperands[i];
return -1;
-
+ }
+
SmallVectorImpl<MCParsedAsmOperand*>::iterator operandIterator;
unsigned int operandIndex;
SmallVectorImpl<AsmToken>::iterator tokenIterator;
diff --git a/lib/MC/MCDisassembler/EDToken.h b/lib/MC/MCDisassembler/EDToken.h
index ba46707..384079b 100644
--- a/lib/MC/MCDisassembler/EDToken.h
+++ b/lib/MC/MCDisassembler/EDToken.h
@@ -125,7 +125,7 @@ struct EDToken {
// assembly syntax
static int tokenize(std::vector<EDToken*> &tokens,
std::string &str,
- const char *operandOrder,
+ const signed char *operandOrder,
EDDisassembler &disassembler);
/// getString - Directs a character pointer to the string, returning 0 on
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index ad86db1..4658a30 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -7,17 +7,18 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmInfo.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -196,7 +197,7 @@ static inline void EmitDwarfLineTable(MCStreamer *MCOS,
MCOS->EmitLabel(SectionEnd);
// Switch back to the dwarf line section.
- MCOS->SwitchSection(context.getTargetAsmInfo().getDwarfLineSection());
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo();
MCOS->EmitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, SectionEnd,
@@ -209,7 +210,7 @@ static inline void EmitDwarfLineTable(MCStreamer *MCOS,
void MCDwarfFileTable::Emit(MCStreamer *MCOS) {
MCContext &context = MCOS->getContext();
// Switch to the section where the table will be emitted into.
- MCOS->SwitchSection(context.getTargetAsmInfo().getDwarfLineSection());
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
// Create a symbol at the beginning of this section.
MCSymbol *LineStartSym = context.CreateTempSymbol();
@@ -485,11 +486,11 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
}
static const MachineLocation TranslateMachineLocation(
- const TargetAsmInfo &TAI,
+ const MCRegisterInfo &MRI,
const MachineLocation &Loc) {
unsigned Reg = Loc.getReg() == MachineLocation::VirtualFP ?
MachineLocation::VirtualFP :
- unsigned(TAI.getDwarfRegNum(Loc.getReg(), true));
+ unsigned(MRI.getDwarfRegNum(Loc.getReg(), true));
const MachineLocation &NewLoc = Loc.isReg() ?
MachineLocation(Reg) : MachineLocation(Reg, Loc.getOffset());
return NewLoc;
@@ -503,10 +504,11 @@ namespace {
bool IsEH;
const MCSymbol *SectionStart;
public:
- FrameEmitterImpl(bool usingCFI, bool isEH, const MCSymbol *sectionStart) :
- CFAOffset(0), CIENum(0), UsingCFI(usingCFI), IsEH(isEH),
- SectionStart(sectionStart) {
- }
+ FrameEmitterImpl(bool usingCFI, bool isEH)
+ : CFAOffset(0), CIENum(0), UsingCFI(usingCFI), IsEH(isEH),
+ SectionStart(0) {}
+
+ void setSectionStart(const MCSymbol *Label) { SectionStart = Label; }
/// EmitCompactUnwind - Emit the unwind information in a compact way. If
/// we're successful, return 'true'. Otherwise, return 'false' and it will
@@ -687,11 +689,8 @@ void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
/// normal CIE and FDE.
bool FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
const MCDwarfFrameInfo &Frame) {
-#if 1
- return false;
-#else
MCContext &Context = Streamer.getContext();
- const TargetAsmInfo &TAI = Context.getTargetAsmInfo();
+ const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
bool VerboseAsm = Streamer.isVerboseAsm();
// range-start range-length compact-unwind-enc personality-func lsda
@@ -716,19 +715,17 @@ bool FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
// .quad __gxx_personality
// .quad except_tab1
- uint32_t Encoding =
- TAI.getCompactUnwindEncoding(Frame.Instructions,
- getDataAlignmentFactor(Streamer), IsEH);
+ uint32_t Encoding = Frame.CompactUnwindEncoding;
if (!Encoding) return false;
// The encoding needs to know we have an LSDA.
if (Frame.Lsda)
Encoding |= 0x40000000;
- Streamer.SwitchSection(TAI.getCompactUnwindSection());
+ Streamer.SwitchSection(MOFI->getCompactUnwindSection());
// Range Start
- unsigned FDEEncoding = TAI.getFDEEncoding(UsingCFI);
+ unsigned FDEEncoding = MOFI->getFDEEncoding(UsingCFI);
unsigned Size = getSizeForEncoding(Streamer, FDEEncoding);
if (VerboseAsm) Streamer.AddComment("Range Start");
Streamer.EmitSymbolValue(Frame.Function, Size);
@@ -745,6 +742,7 @@ bool FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
Twine(llvm::utohexstr(Encoding)));
Streamer.EmitIntValue(Encoding, Size);
+
// Personality Function
Size = getSizeForEncoding(Streamer, dwarf::DW_EH_PE_absptr);
if (VerboseAsm) Streamer.AddComment("Personality Function");
@@ -762,7 +760,6 @@ bool FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
Streamer.EmitIntValue(0, Size); // No LSDA
return true;
-#endif
}
const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
@@ -771,11 +768,12 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
const MCSymbol *lsda,
unsigned lsdaEncoding) {
MCContext &context = streamer.getContext();
- const TargetAsmInfo &TAI = context.getTargetAsmInfo();
+ const MCRegisterInfo &MRI = context.getRegisterInfo();
+ const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
bool verboseAsm = streamer.isVerboseAsm();
MCSymbol *sectionStart;
- if (TAI.isFunctionEHFrameSymbolPrivate() || !IsEH)
+ if (MOFI->isFunctionEHFrameSymbolPrivate() || !IsEH)
sectionStart = context.CreateTempSymbol();
else
sectionStart = context.GetOrCreateSymbol(Twine("EH_frame") + Twine(CIENum));
@@ -824,7 +822,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
// Return Address Register
if (verboseAsm) streamer.AddComment("CIE Return Address Column");
- streamer.EmitULEB128IntValue(TAI.getDwarfRARegNum(true));
+ streamer.EmitULEB128IntValue(MRI.getDwarfRegNum(MRI.getRARegister(), true));
// Augmentation Data Length (optional)
@@ -858,21 +856,22 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
EmitEncodingByte(streamer, lsdaEncoding, "LSDA Encoding");
// Encoding of the FDE pointers
- EmitEncodingByte(streamer, TAI.getFDEEncoding(UsingCFI),
+ EmitEncodingByte(streamer, MOFI->getFDEEncoding(UsingCFI),
"FDE Encoding");
}
// Initial Instructions
- const std::vector<MachineMove> &Moves = TAI.getInitialFrameState();
+ const MCAsmInfo &MAI = context.getAsmInfo();
+ const std::vector<MachineMove> &Moves = MAI.getInitialFrameState();
std::vector<MCCFIInstruction> Instructions;
for (int i = 0, n = Moves.size(); i != n; ++i) {
MCSymbol *Label = Moves[i].getLabel();
const MachineLocation &Dst =
- TranslateMachineLocation(TAI, Moves[i].getDestination());
+ TranslateMachineLocation(MRI, Moves[i].getDestination());
const MachineLocation &Src =
- TranslateMachineLocation(TAI, Moves[i].getSource());
+ TranslateMachineLocation(MRI, Moves[i].getSource());
MCCFIInstruction Inst(Label, Dst, Src);
Instructions.push_back(Inst);
}
@@ -893,10 +892,10 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
MCContext &context = streamer.getContext();
MCSymbol *fdeStart = context.CreateTempSymbol();
MCSymbol *fdeEnd = context.CreateTempSymbol();
- const TargetAsmInfo &TAI = context.getTargetAsmInfo();
+ const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
bool verboseAsm = streamer.isVerboseAsm();
- if (!TAI.isFunctionEHFrameSymbolPrivate() && IsEH) {
+ if (IsEH && frame.Function && !MOFI->isFunctionEHFrameSymbolPrivate()) {
MCSymbol *EHSym =
context.GetOrCreateSymbol(frame.Function->getName() + Twine(".eh"));
streamer.EmitEHSymAttributes(frame.Function, EHSym);
@@ -925,7 +924,7 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
streamer.EmitSymbolValue(&cieStart, 4);
}
- unsigned fdeEncoding = TAI.getFDEEncoding(UsingCFI);
+ unsigned fdeEncoding = MOFI->getFDEEncoding(UsingCFI);
unsigned size = getSizeForEncoding(streamer, fdeEncoding);
// PC Begin
@@ -1011,26 +1010,34 @@ void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer,
bool UsingCFI,
bool IsEH) {
MCContext &Context = Streamer.getContext();
- const TargetAsmInfo &TAI = Context.getTargetAsmInfo();
- const MCSection &Section = IsEH ? *TAI.getEHFrameSection() :
- *TAI.getDwarfFrameSection();
+ MCObjectFileInfo *MOFI =
+ const_cast<MCObjectFileInfo*>(Context.getObjectFileInfo());
+ FrameEmitterImpl Emitter(UsingCFI, IsEH);
+ ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getFrameInfos();
+
+ // Emit the compact unwind info if available.
+ // FIXME: This emits both the compact unwind and the old CIE/FDE
+ // information. Only one of those is needed.
+ if (IsEH && MOFI->getCompactUnwindSection())
+ for (unsigned i = 0, n = Streamer.getNumFrameInfos(); i < n; ++i) {
+ const MCDwarfFrameInfo &Frame = Streamer.getFrameInfo(i);
+ if (!Frame.CompactUnwindEncoding)
+ Emitter.EmitCompactUnwind(Streamer, Frame);
+ }
+
+ const MCSection &Section = IsEH ? *MOFI->getEHFrameSection() :
+ *MOFI->getDwarfFrameSection();
Streamer.SwitchSection(&Section);
MCSymbol *SectionStart = Context.CreateTempSymbol();
Streamer.EmitLabel(SectionStart);
+ Emitter.setSectionStart(SectionStart);
MCSymbol *FDEEnd = NULL;
DenseMap<CIEKey, const MCSymbol*> CIEStarts;
- FrameEmitterImpl Emitter(UsingCFI, IsEH, SectionStart);
const MCSymbol *DummyDebugKey = NULL;
- for (unsigned i = 0, n = Streamer.getNumFrameInfos(); i < n; ++i) {
- const MCDwarfFrameInfo &Frame = Streamer.getFrameInfo(i);
- if (IsEH && TAI.getCompactUnwindSection() &&
- Emitter.EmitCompactUnwind(Streamer, Frame)) {
- FDEEnd = NULL;
- continue;
- }
-
+ for (unsigned i = 0, n = FrameArray.size(); i < n; ++i) {
+ const MCDwarfFrameInfo &Frame = FrameArray[i];
CIEKey Key(Frame.Personality, Frame.PersonalityEncoding,
Frame.LsdaEncoding);
const MCSymbol *&CIEStart = IsEH ? CIEStarts[Key] : DummyDebugKey;
diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index 2c3f8e8..dad2e7b 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp
@@ -16,7 +16,6 @@
#include "llvm/MC/MCELFSymbolFlags.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/Support/ELF.h"
-#include "llvm/Target/TargetAsmBackend.h"
namespace llvm {
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 49340ed..9ada08e 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -21,11 +21,11 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmBackend.h"
using namespace llvm;
@@ -53,8 +53,9 @@ void MCELFStreamer::EmitLabel(MCSymbol *Symbol) {
void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
switch (Flag) {
case MCAF_SyntaxUnified: return; // no-op here.
- case MCAF_Code16: return; // no-op here.
- case MCAF_Code32: return; // no-op here.
+ case MCAF_Code16: return; // Change parsing mode; no-op here.
+ case MCAF_Code32: return; // Change parsing mode; no-op here.
+ case MCAF_Code64: return; // Change parsing mode; no-op here.
case MCAF_SubsectionsViaSymbols:
getAssembler().setSubsectionsViaSymbols(true);
return;
@@ -219,14 +220,14 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
SD.setSize(MCConstantExpr::Create(Size, getContext()));
}
-void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {
+void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
// FIXME: Should this be caught and done earlier?
MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
MCELF::SetBinding(SD, ELF::STB_LOCAL);
SD.setExternal(false);
BindingExplicitlySet.insert(Symbol);
- // FIXME: ByteAlignment is not needed here, but is required.
- EmitCommonSymbol(Symbol, Size, 1);
+ EmitCommonSymbol(Symbol, Size, ByteAlignment);
}
void MCELFStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
@@ -374,10 +375,10 @@ void MCELFStreamer::Finish() {
this->MCObjectStreamer::Finish();
}
-MCStreamer *llvm::createELFStreamer(MCContext &Context, TargetAsmBackend &TAB,
+MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_ostream &OS, MCCodeEmitter *CE,
bool RelaxAll, bool NoExecStack) {
- MCELFStreamer *S = new MCELFStreamer(Context, TAB, OS, CE);
+ MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
if (NoExecStack)
diff --git a/lib/MC/MCELFStreamer.h b/lib/MC/MCELFStreamer.h
index 855e7e9..10bf775 100644
--- a/lib/MC/MCELFStreamer.h
+++ b/lib/MC/MCELFStreamer.h
@@ -25,11 +25,11 @@ namespace llvm {
class MCELFStreamer : public MCObjectStreamer {
public:
- MCELFStreamer(MCContext &Context, TargetAsmBackend &TAB,
+ MCELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter)
: MCObjectStreamer(Context, TAB, OS, Emitter) {}
- MCELFStreamer(MCContext &Context, TargetAsmBackend &TAB,
+ MCELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
MCAssembler *Assembler)
: MCObjectStreamer(Context, TAB, OS, Emitter, Assembler) {}
@@ -74,7 +74,8 @@ public:
SD.setSize(Value);
}
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size);
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
unsigned Size = 0, unsigned ByteAlignment = 0) {
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index fcf1aab..da297fb 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -18,7 +18,6 @@
#include "llvm/MC/MCValue.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmBackend.h"
using namespace llvm;
namespace {
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 81a939f..2317a28 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -8,7 +8,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
MCInstPrinter::~MCInstPrinter() {
@@ -23,3 +25,12 @@ StringRef MCInstPrinter::getOpcodeName(unsigned Opcode) const {
void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
assert(0 && "Target should implement this");
}
+
+void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) {
+ if (!Annot.empty()) {
+ if (CommentStream)
+ (*CommentStream) << Annot;
+ else
+ OS << " " << MAI.getCommentString() << " " << Annot;
+ }
+}
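printAnnotation routes comments two ways: into a side comment stream when one is attached (as the disassembler's comment plumbing above does), otherwise inline after the target's comment marker. A toy model, with '#' standing in for MAI.getCommentString():

#include <iostream>
#include <sstream>
#include <string>

static void printAnnotationModel(std::ostream &OS, std::ostream *CommentStream,
                                 const std::string &Annot) {
  if (Annot.empty()) return;
  if (CommentStream)
    *CommentStream << Annot; // collected, emitted later as asm comments
  else
    OS << " # " << Annot;    // inline, after the comment marker
}

int main() {
  std::ostringstream Asm, Comments;
  printAnnotationModel(Asm, 0, "sym+0x10");         // inline form
  printAnnotationModel(Asm, &Comments, "sym+0x10"); // diverted form
  std::cout << Asm.str() << "|" << Comments.str() << "\n";
  return 0;
}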
diff --git a/lib/MC/MCInstrAnalysis.cpp b/lib/MC/MCInstrAnalysis.cpp
new file mode 100644
index 0000000..7736702
--- /dev/null
+++ b/lib/MC/MCInstrAnalysis.cpp
@@ -0,0 +1,21 @@
+//===-- MCInstrAnalysis.cpp - InstrDesc target hooks ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInstrAnalysis.h"
+using namespace llvm;
+
+uint64_t MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size) const {
+ if (Inst.getNumOperands() == 0 ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
+ return -1ULL;
+
+ int64_t Imm = Inst.getOperand(0).getImm();
+ return Addr+Size+Imm;
+}
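evaluateBranch's arithmetic: a PC-relative immediate is relative to the end of the instruction, so the target is Addr + Size + Imm. A self-checking example with x86-style displacements (purely illustrative numbers):

#include <cassert>
#include <cstdint>

static uint64_t branchTarget(uint64_t Addr, uint64_t Size, int64_t Imm) {
  return Addr + Size + Imm; // immediate is relative to the next instruction
}

int main() {
  // A 5-byte 'jmp rel32' at 0x1000 with displacement 0x20 lands at 0x1025.
  assert(branchTarget(0x1000, 5, 0x20) == 0x1025);
  // A backward branch with displacement -0x10 lands at 0xff5.
  assert(branchTarget(0x1000, 5, -0x10) == 0xff5);
  return 0;
}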
diff --git a/lib/MC/MCLoggingStreamer.cpp b/lib/MC/MCLoggingStreamer.cpp
index 309752e..3fe8ac7 100644
--- a/lib/MC/MCLoggingStreamer.cpp
+++ b/lib/MC/MCLoggingStreamer.cpp
@@ -133,9 +133,10 @@ public:
return Child->EmitCommonSymbol(Symbol, Size, ByteAlignment);
}
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
LogCall("EmitLocalCommonSymbol");
- return Child->EmitLocalCommonSymbol(Symbol, Size);
+ return Child->EmitLocalCommonSymbol(Symbol, Size, ByteAlignment);
}
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 1b21249..aa35815 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -20,10 +20,10 @@
#include "llvm/MC/MCMachOSymbolFlags.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmBackend.h"
using namespace llvm;
@@ -34,9 +34,9 @@ private:
virtual void EmitInstToData(const MCInst &Inst);
public:
- MCMachOStreamer(MCContext &Context, TargetAsmBackend &TAB,
+ MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_ostream &OS, MCCodeEmitter *Emitter)
- : MCObjectStreamer(Context, TAB, OS, Emitter) {}
+ : MCObjectStreamer(Context, MAB, OS, Emitter) {}
/// @name MCStreamer Interface
/// @{
@@ -67,7 +67,8 @@ public:
virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
assert(0 && "macho doesn't support this directive");
}
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
assert(0 && "macho doesn't support this directive");
}
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
@@ -143,8 +144,9 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
// Do any generic stuff we need to do.
switch (Flag) {
case MCAF_SyntaxUnified: return; // no-op here.
- case MCAF_Code16: return; // no-op here.
- case MCAF_Code32: return; // no-op here.
+ case MCAF_Code16: return; // Change parsing mode; no-op here.
+ case MCAF_Code32: return; // Change parsing mode; no-op here.
+ case MCAF_Code64: return; // Change parsing mode; no-op here.
case MCAF_SubsectionsViaSymbols:
getAssembler().setSubsectionsViaSymbols(true);
return;
@@ -207,8 +209,8 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_ELF_TypeCommon:
case MCSA_ELF_TypeNoType:
case MCSA_ELF_TypeGnuUniqueObject:
- case MCSA_IndirectSymbol:
case MCSA_Hidden:
+ case MCSA_IndirectSymbol:
case MCSA_Internal:
case MCSA_Protected:
case MCSA_Weak:
@@ -410,10 +412,10 @@ void MCMachOStreamer::Finish() {
this->MCObjectStreamer::Finish();
}
-MCStreamer *llvm::createMachOStreamer(MCContext &Context, TargetAsmBackend &TAB,
+MCStreamer *llvm::createMachOStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_ostream &OS, MCCodeEmitter *CE,
bool RelaxAll) {
- MCMachOStreamer *S = new MCMachOStreamer(Context, TAB, OS, CE);
+ MCMachOStreamer *S = new MCMachOStreamer(Context, MAB, OS, CE);
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCModule.cpp
new file mode 100644
index 0000000..b1d09d9
--- /dev/null
+++ b/lib/MC/MCModule.cpp
@@ -0,0 +1,45 @@
+//===- lib/MC/MCModule.cpp - MCModule implementation ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCModule.h"
+
+using namespace llvm;
+
+MCAtom *MCModule::createAtom(MCAtom::AtomType Type,
+ uint64_t Begin, uint64_t End) {
+ assert(Begin < End && "Creating MCAtom with endpoints reversed?");
+
+ // Check for atoms already covering this range.
+ IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Begin);
+ assert((!I.valid() || I.start() < End) && "Offset range already occupied!");
+
+ // Create the new atom and add it to our maps.
+ MCAtom *NewAtom = new MCAtom(Type, this, Begin, End);
+ AtomAllocationTracker.insert(NewAtom);
+ OffsetMap.insert(Begin, End, NewAtom);
+ return NewAtom;
+}
+
+// remap - Update the interval mapping for an atom.
+void MCModule::remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd) {
+ // Find and erase the old mapping.
+ IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Atom->Begin);
+ assert(I.valid() && "Atom offset not found in module!");
+ assert(*I == Atom && "Previous atom mapping was invalid!");
+ I.erase();
+
+ // Insert the new mapping.
+ OffsetMap.insert(NewBegin, NewEnd, Atom);
+
+ // Update the atom internal bounds.
+ Atom->Begin = NewBegin;
+ Atom->End = NewEnd;
+}
+
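MCModule::remap keeps the interval map and the atom's own bounds in sync: erase the old interval, insert the new one, then update Begin/End. A toy model of that bookkeeping, using std::map keyed by Begin as a stand-in for llvm::IntervalMap:

#include <cassert>
#include <cstdint>
#include <map>

struct AtomModel { uint64_t Begin, End; };

static void remapModel(std::map<uint64_t, AtomModel*> &OffsetMap, AtomModel *A,
                       uint64_t NewBegin, uint64_t NewEnd) {
  OffsetMap.erase(A->Begin); // drop the stale interval first
  OffsetMap[NewBegin] = A;   // record the new one
  A->Begin = NewBegin;       // keep the atom's own bounds in sync
  A->End = NewEnd;
}

int main() {
  std::map<uint64_t, AtomModel*> OffsetMap;
  AtomModel A = { 0x100, 0x10f };
  OffsetMap[A.Begin] = &A;
  remapModel(OffsetMap, &A, 0x100, 0x11f); // grow the atom by 16 bytes
  assert(OffsetMap.count(0x100) && A.End == 0x11f);
  return 0;
}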
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index 9577af0..a6c0adb 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -59,8 +59,8 @@ namespace {
virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {}
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {}
-
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {}
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
unsigned Size = 0, unsigned ByteAlignment = 0) {}
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
new file mode 100644
index 0000000..df8b99d
--- /dev/null
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -0,0 +1,554 @@
+//===-- MCObjectFileInfo.cpp - Object File Information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/ADT/Triple.h"
+using namespace llvm;
+
+void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
+ // MachO
+ IsFunctionEHFrameSymbolPrivate = false;
+ SupportsWeakOmittedEHFrame = false;
+
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel
+ | dwarf::DW_EH_PE_sdata4;
+ LSDAEncoding = FDEEncoding = FDECFIEncoding = dwarf::DW_EH_PE_pcrel;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4;
+
+ // .comm doesn't support alignment before Leopard.
+ if (T.isMacOSX() && T.isMacOSXVersionLT(10, 5))
+ CommDirectiveSupportsAlignment = false;
+
+ TextSection // .text
+ = Ctx->getMachOSection("__TEXT", "__text",
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ SectionKind::getText());
+ DataSection // .data
+ = Ctx->getMachOSection("__DATA", "__data", 0,
+ SectionKind::getDataRel());
+
+ TLSDataSection // .tdata
+ = Ctx->getMachOSection("__DATA", "__thread_data",
+ MCSectionMachO::S_THREAD_LOCAL_REGULAR,
+ SectionKind::getDataRel());
+ TLSBSSSection // .tbss
+ = Ctx->getMachOSection("__DATA", "__thread_bss",
+ MCSectionMachO::S_THREAD_LOCAL_ZEROFILL,
+ SectionKind::getThreadBSS());
+
+ // TODO: Verify datarel below.
+ TLSTLVSection // .tlv
+ = Ctx->getMachOSection("__DATA", "__thread_vars",
+ MCSectionMachO::S_THREAD_LOCAL_VARIABLES,
+ SectionKind::getDataRel());
+
+ TLSThreadInitSection
+ = Ctx->getMachOSection("__DATA", "__thread_init",
+ MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS,
+ SectionKind::getDataRel());
+
+ CStringSection // .cstring
+ = Ctx->getMachOSection("__TEXT", "__cstring",
+ MCSectionMachO::S_CSTRING_LITERALS,
+ SectionKind::getMergeable1ByteCString());
+ UStringSection
+ = Ctx->getMachOSection("__TEXT","__ustring", 0,
+ SectionKind::getMergeable2ByteCString());
+ FourByteConstantSection // .literal4
+ = Ctx->getMachOSection("__TEXT", "__literal4",
+ MCSectionMachO::S_4BYTE_LITERALS,
+ SectionKind::getMergeableConst4());
+ EightByteConstantSection // .literal8
+ = Ctx->getMachOSection("__TEXT", "__literal8",
+ MCSectionMachO::S_8BYTE_LITERALS,
+ SectionKind::getMergeableConst8());
+
+ // ld_classic doesn't support .literal16 in 32-bit mode, and ld64 falls back
+ // to using it in -static mode.
+ SixteenByteConstantSection = 0;
+ if (RelocM != Reloc::Static &&
+ T.getArch() != Triple::x86_64 && T.getArch() != Triple::ppc64)
+ SixteenByteConstantSection = // .literal16
+ Ctx->getMachOSection("__TEXT", "__literal16",
+ MCSectionMachO::S_16BYTE_LITERALS,
+ SectionKind::getMergeableConst16());
+
+ ReadOnlySection // .const
+ = Ctx->getMachOSection("__TEXT", "__const", 0,
+ SectionKind::getReadOnly());
+
+ TextCoalSection
+ = Ctx->getMachOSection("__TEXT", "__textcoal_nt",
+ MCSectionMachO::S_COALESCED |
+ MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+ SectionKind::getText());
+ ConstTextCoalSection
+ = Ctx->getMachOSection("__TEXT", "__const_coal",
+ MCSectionMachO::S_COALESCED,
+ SectionKind::getReadOnly());
+ ConstDataSection // .const_data
+ = Ctx->getMachOSection("__DATA", "__const", 0,
+ SectionKind::getReadOnlyWithRel());
+ DataCoalSection
+ = Ctx->getMachOSection("__DATA","__datacoal_nt",
+ MCSectionMachO::S_COALESCED,
+ SectionKind::getDataRel());
+ DataCommonSection
+ = Ctx->getMachOSection("__DATA","__common",
+ MCSectionMachO::S_ZEROFILL,
+ SectionKind::getBSS());
+ DataBSSSection
+ = Ctx->getMachOSection("__DATA","__bss", MCSectionMachO::S_ZEROFILL,
+ SectionKind::getBSS());
+
+
+ LazySymbolPointerSection
+ = Ctx->getMachOSection("__DATA", "__la_symbol_ptr",
+ MCSectionMachO::S_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+ NonLazySymbolPointerSection
+ = Ctx->getMachOSection("__DATA", "__nl_symbol_ptr",
+ MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+ SectionKind::getMetadata());
+
+ if (RelocM == Reloc::Static) {
+ StaticCtorSection
+ = Ctx->getMachOSection("__TEXT", "__constructor", 0,
+ SectionKind::getDataRel());
+ StaticDtorSection
+ = Ctx->getMachOSection("__TEXT", "__destructor", 0,
+ SectionKind::getDataRel());
+ } else {
+ StaticCtorSection
+ = Ctx->getMachOSection("__DATA", "__mod_init_func",
+ MCSectionMachO::S_MOD_INIT_FUNC_POINTERS,
+ SectionKind::getDataRel());
+ StaticDtorSection
+ = Ctx->getMachOSection("__DATA", "__mod_term_func",
+ MCSectionMachO::S_MOD_TERM_FUNC_POINTERS,
+ SectionKind::getDataRel());
+ }
+
+ // Exception Handling.
+ LSDASection = Ctx->getMachOSection("__TEXT", "__gcc_except_tab", 0,
+ SectionKind::getReadOnlyWithRel());
+
+ if (T.isMacOSX() && !T.isMacOSXVersionLT(10, 6))
+ CompactUnwindSection =
+ Ctx->getMachOSection("__LD", "__compact_unwind",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getReadOnly());
+
+ // Debug Information.
+ DwarfAbbrevSection =
+ Ctx->getMachOSection("__DWARF", "__debug_abbrev",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfInfoSection =
+ Ctx->getMachOSection("__DWARF", "__debug_info",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfLineSection =
+ Ctx->getMachOSection("__DWARF", "__debug_line",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfFrameSection =
+ Ctx->getMachOSection("__DWARF", "__debug_frame",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfPubNamesSection =
+ Ctx->getMachOSection("__DWARF", "__debug_pubnames",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ Ctx->getMachOSection("__DWARF", "__debug_pubtypes",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfStrSection =
+ Ctx->getMachOSection("__DWARF", "__debug_str",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfLocSection =
+ Ctx->getMachOSection("__DWARF", "__debug_loc",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfARangesSection =
+ Ctx->getMachOSection("__DWARF", "__debug_aranges",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfRangesSection =
+ Ctx->getMachOSection("__DWARF", "__debug_ranges",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfMacroInfoSection =
+ Ctx->getMachOSection("__DWARF", "__debug_macinfo",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+ DwarfDebugInlineSection =
+ Ctx->getMachOSection("__DWARF", "__debug_inlined",
+ MCSectionMachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata());
+
+ TLSExtraDataSection = TLSTLVSection;
+}
+
+void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
+ if (T.getArch() == Triple::x86) {
+ PersonalityEncoding = (RelocM == Reloc::PIC_)
+ ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = (RelocM == Reloc::PIC_)
+ ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ FDEEncoding = FDECFIEncoding = (RelocM == Reloc::PIC_)
+ ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ TTypeEncoding = (RelocM == Reloc::PIC_)
+ ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4
+ : dwarf::DW_EH_PE_absptr;
+ } else if (T.getArch() == Triple::x86_64) {
+ FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+
+ if (RelocM == Reloc::PIC_) {
+ PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ ((CMModel == CodeModel::Small || CMModel == CodeModel::Medium)
+ ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+ LSDAEncoding = dwarf::DW_EH_PE_pcrel |
+ (CMModel == CodeModel::Small
+ ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+ FDEEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4;
+ TTypeEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ ((CMModel == CodeModel::Small || CMModel == CodeModel::Medium)
+ ? dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_sdata8);
+ } else {
+ PersonalityEncoding =
+ (CMModel == CodeModel::Small || CMModel == CodeModel::Medium)
+ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+ LSDAEncoding = (CMModel == CodeModel::Small)
+ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+ FDEEncoding = dwarf::DW_EH_PE_udata4;
+ TTypeEncoding = (CMModel == CodeModel::Small)
+ ? dwarf::DW_EH_PE_udata4 : dwarf::DW_EH_PE_absptr;
+ }
+ }
+
+ // ELF
+ BSSSection =
+ Ctx->getELFSection(".bss", ELF::SHT_NOBITS,
+ ELF::SHF_WRITE |ELF::SHF_ALLOC,
+ SectionKind::getBSS());
+
+ TextSection =
+ Ctx->getELFSection(".text", ELF::SHT_PROGBITS,
+ ELF::SHF_EXECINSTR |
+ ELF::SHF_ALLOC,
+ SectionKind::getText());
+
+ DataSection =
+ Ctx->getELFSection(".data", ELF::SHT_PROGBITS,
+ ELF::SHF_WRITE |ELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+
+ ReadOnlySection =
+ Ctx->getELFSection(".rodata", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC,
+ SectionKind::getReadOnly());
+
+ TLSDataSection =
+ Ctx->getELFSection(".tdata", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC | ELF::SHF_TLS |
+ ELF::SHF_WRITE,
+ SectionKind::getThreadData());
+
+ TLSBSSSection =
+ Ctx->getELFSection(".tbss", ELF::SHT_NOBITS,
+ ELF::SHF_ALLOC | ELF::SHF_TLS |
+ ELF::SHF_WRITE,
+ SectionKind::getThreadBSS());
+
+ DataRelSection =
+ Ctx->getELFSection(".data.rel", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_WRITE,
+ SectionKind::getDataRel());
+
+ DataRelLocalSection =
+ Ctx->getELFSection(".data.rel.local", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_WRITE,
+ SectionKind::getDataRelLocal());
+
+ DataRelROSection =
+ Ctx->getELFSection(".data.rel.ro", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_WRITE,
+ SectionKind::getReadOnlyWithRel());
+
+ DataRelROLocalSection =
+ Ctx->getELFSection(".data.rel.ro.local", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_WRITE,
+ SectionKind::getReadOnlyWithRelLocal());
+
+ MergeableConst4Section =
+ Ctx->getELFSection(".rodata.cst4", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_MERGE,
+ SectionKind::getMergeableConst4());
+
+ MergeableConst8Section =
+ Ctx->getELFSection(".rodata.cst8", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_MERGE,
+ SectionKind::getMergeableConst8());
+
+ MergeableConst16Section =
+ Ctx->getELFSection(".rodata.cst16", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_MERGE,
+ SectionKind::getMergeableConst16());
+
+ StaticCtorSection =
+ Ctx->getELFSection(".ctors", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_WRITE,
+ SectionKind::getDataRel());
+
+ StaticDtorSection =
+ Ctx->getELFSection(".dtors", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC |ELF::SHF_WRITE,
+ SectionKind::getDataRel());
+
+ // Exception Handling Sections.
+
+ // FIXME: We're emitting LSDA info into a readonly section on ELF, even though
+ // it contains relocatable pointers. In PIC mode, this is probably a big
+ // runtime hit for C++ apps. Either the contents of the LSDA need to be
+ // adjusted or this should be a data section.
+ LSDASection =
+ Ctx->getELFSection(".gcc_except_table", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC,
+ SectionKind::getReadOnly());
+
+ // Debug Info Sections.
+ DwarfAbbrevSection =
+ Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfInfoSection =
+ Ctx->getELFSection(".debug_info", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfLineSection =
+ Ctx->getELFSection(".debug_line", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfFrameSection =
+ Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfPubNamesSection =
+ Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfStrSection =
+ Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfLocSection =
+ Ctx->getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfARangesSection =
+ Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfRangesSection =
+ Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfMacroInfoSection =
+ Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+}
+
+
+void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
+ // COFF
+ TextSection =
+ Ctx->getCOFFSection(".text",
+ COFF::IMAGE_SCN_CNT_CODE |
+ COFF::IMAGE_SCN_MEM_EXECUTE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getText());
+ DataSection =
+ Ctx->getCOFFSection(".data",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+ ReadOnlySection =
+ Ctx->getCOFFSection(".rdata",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ StaticCtorSection =
+ Ctx->getCOFFSection(".ctors",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+ StaticDtorSection =
+ Ctx->getCOFFSection(".dtors",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+
+ // FIXME: We're emitting LSDA info into a readonly section on COFF, even
+ // though it contains relocatable pointers. In PIC mode, this is probably a
+ // big runtime hit for C++ apps. Either the contents of the LSDA need to be
+ // adjusted or this should be a data section.
+ LSDASection =
+ Ctx->getCOFFSection(".gcc_except_table",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+
+ // Debug info.
+ DwarfAbbrevSection =
+ Ctx->getCOFFSection(".debug_abbrev",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfInfoSection =
+ Ctx->getCOFFSection(".debug_info",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfLineSection =
+ Ctx->getCOFFSection(".debug_line",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfFrameSection =
+ Ctx->getCOFFSection(".debug_frame",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfPubNamesSection =
+ Ctx->getCOFFSection(".debug_pubnames",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfPubTypesSection =
+ Ctx->getCOFFSection(".debug_pubtypes",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfStrSection =
+ Ctx->getCOFFSection(".debug_str",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfLocSection =
+ Ctx->getCOFFSection(".debug_loc",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfARangesSection =
+ Ctx->getCOFFSection(".debug_aranges",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfRangesSection =
+ Ctx->getCOFFSection(".debug_ranges",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+ DwarfMacroInfoSection =
+ Ctx->getCOFFSection(".debug_macinfo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata());
+
+ DrectveSection =
+ Ctx->getCOFFSection(".drectve",
+ COFF::IMAGE_SCN_LNK_INFO,
+ SectionKind::getMetadata());
+
+ PDataSection =
+ Ctx->getCOFFSection(".pdata",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+
+ XDataSection =
+ Ctx->getCOFFSection(".xdata",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+}
+
+void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
+ CodeModel::Model cm,
+ MCContext &ctx) {
+ RelocM = relocm;
+ CMModel = cm;
+ Ctx = &ctx;
+
+ // Common.
+ CommDirectiveSupportsAlignment = true;
+ SupportsWeakOmittedEHFrame = true;
+ IsFunctionEHFrameSymbolPrivate = true;
+
+ PersonalityEncoding = LSDAEncoding = FDEEncoding = FDECFIEncoding =
+ TTypeEncoding = dwarf::DW_EH_PE_absptr;
+
+ EHFrameSection = 0; // Created on demand.
+ CompactUnwindSection = 0; // Used only by selected targets.
+
+ Triple T(TT);
+ Triple::ArchType Arch = T.getArch();
+ // FIXME: Checking for Arch here to filter out bogus triples such as
+  // cellspu-apple-darwin. Perhaps we should fix this in Triple?
+ if ((Arch == Triple::x86 || Arch == Triple::x86_64 ||
+ Arch == Triple::arm || Arch == Triple::thumb ||
+ Arch == Triple::ppc || Arch == Triple::ppc64 ||
+ Arch == Triple::UnknownArch) &&
+ (T.isOSDarwin() || T.getEnvironment() == Triple::MachO)) {
+ Env = IsMachO;
+ InitMachOMCObjectFileInfo(T);
+ } else if ((Arch == Triple::x86 || Arch == Triple::x86_64) &&
+ (T.getOS() == Triple::MinGW32 || T.getOS() == Triple::Cygwin ||
+ T.getOS() == Triple::Win32)) {
+ Env = IsCOFF;
+ InitCOFFMCObjectFileInfo(T);
+ } else {
+ Env = IsELF;
+ InitELFMCObjectFileInfo(T);
+ }
+}
+
+void MCObjectFileInfo::InitEHFrameSection() {
+ if (Env == IsMachO)
+ EHFrameSection =
+ Ctx->getMachOSection("__TEXT", "__eh_frame",
+ MCSectionMachO::S_COALESCED |
+ MCSectionMachO::S_ATTR_NO_TOC |
+ MCSectionMachO::S_ATTR_STRIP_STATIC_SYMS |
+ MCSectionMachO::S_ATTR_LIVE_SUPPORT,
+ SectionKind::getReadOnly());
+ else if (Env == IsELF)
+ EHFrameSection =
+ Ctx->getELFSection(".eh_frame", ELF::SHT_PROGBITS,
+ ELF::SHF_ALLOC,
+ SectionKind::getDataRel());
+ else
+ EHFrameSection =
+ Ctx->getCOFFSection(".eh_frame",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+}
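
A minimal sketch of how a client seeds the new class (caller-owned context assumed; header locations as in the release_30 tree):

#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/Support/CodeGen.h" // Reloc::Model, CodeModel::Model

void initFileInfo(llvm::MCContext &Ctx, llvm::MCObjectFileInfo &MOFI) {
  // The triple string drives the Mach-O / COFF / ELF dispatch shown above;
  // a Darwin triple routes to InitMachOMCObjectFileInfo().
  MOFI.InitMCObjectFileInfo("x86_64-apple-darwin11",
                            llvm::Reloc::PIC_, llvm::CodeModel::Default, Ctx);
  const llvm::MCSection *Text = MOFI.getTextSection();
  (void)Text; // e.g. hand to a streamer via SwitchSection()
}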
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index 8635aac..a04ae08 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -17,10 +17,10 @@
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetAsmBackend.h"
+#include "llvm/MC/MCAsmBackend.h"
using namespace llvm;
-MCObjectStreamer::MCObjectStreamer(MCContext &Context, TargetAsmBackend &TAB,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter_)
: MCStreamer(Context),
Assembler(new MCAssembler(Context, TAB,
@@ -30,7 +30,7 @@ MCObjectStreamer::MCObjectStreamer(MCContext &Context, TargetAsmBackend &TAB,
{
}
-MCObjectStreamer::MCObjectStreamer(MCContext &Context, TargetAsmBackend &TAB,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter_,
MCAssembler *_Assembler)
: MCStreamer(Context), Assembler(_Assembler), CurSectionData(0)
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 0c1f8f0..c76052d 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -24,6 +24,7 @@ using namespace llvm;
AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) {
CurBuf = NULL;
CurPtr = NULL;
+ isAtStartOfLine = true;
}
AsmLexer::~AsmLexer() {
@@ -146,7 +147,7 @@ AsmToken AsmLexer::LexLineComment() {
// FIXME: This is broken if we happen to hit a comment at the end of a file, which
// was .included, and which doesn't end with a newline.
int CurChar = getNextChar();
- while (CurChar != '\n' && CurChar != '\n' && CurChar != EOF)
+ while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
CurChar = getNextChar();
if (CurChar == EOF)
@@ -334,6 +335,17 @@ StringRef AsmLexer::LexUntilEndOfStatement() {
return StringRef(TokStart, CurPtr-TokStart);
}
+StringRef AsmLexer::LexUntilEndOfLine() {
+ TokStart = CurPtr;
+
+ while (*CurPtr != '\n' &&
+ *CurPtr != '\r' &&
+ (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
+ ++CurPtr;
+ }
+ return StringRef(TokStart, CurPtr-TokStart);
+}
+
bool AsmLexer::isAtStartOfComment(char Char) {
// FIXME: This won't work for multi-character comment indicators like "//".
return Char == *MAI.getCommentString();
@@ -349,14 +361,29 @@ AsmToken AsmLexer::LexToken() {
// This always consumes at least one character.
int CurChar = getNextChar();
- if (isAtStartOfComment(CurChar))
+ if (isAtStartOfComment(CurChar)) {
+ // If this comment starts with a '#', then return the Hash token and let
+ // the assembler parser see if it can be parsed as a cpp line filename
+ // comment. We do this only if we are at the start of a line.
+ if (CurChar == '#' && isAtStartOfLine)
+ return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
+ isAtStartOfLine = true;
return LexLineComment();
+ }
if (isAtStatementSeparator(TokStart)) {
CurPtr += strlen(MAI.getSeparatorString()) - 1;
return AsmToken(AsmToken::EndOfStatement,
StringRef(TokStart, strlen(MAI.getSeparatorString())));
}
+ // If we're missing a newline at EOF, make sure we still get an
+ // EndOfStatement token before the Eof token.
+ if (CurChar == EOF && !isAtStartOfLine) {
+ isAtStartOfLine = true;
+ return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
+ }
+
+ isAtStartOfLine = false;
switch (CurChar) {
default:
// Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
@@ -373,6 +400,7 @@ AsmToken AsmLexer::LexToken() {
return LexToken();
case '\n': // FALL THROUGH.
case '\r':
+ isAtStartOfLine = true;
return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
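
The new isAtStartOfLine flag is what lets '#' be two different things: a Hash token at the start of a line (a candidate cpp line marker) and an ordinary comment anywhere else. A standalone model of that handshake (not the LLVM lexer; token names are illustrative):

enum Tok { HashTok, CommentTok, EndOfStatementTok, OtherTok, EofTok };

struct MiniLexer {
  const char *Cur;            // NUL-terminated input
  bool AtStartOfLine = true;

  Tok lex() {
    char C = *Cur;
    if (C == '\0')
      return EofTok;
    ++Cur;
    if (C == '\n' || C == '\r') {
      AtStartOfLine = true;   // the next token begins a fresh line
      return EndOfStatementTok;
    }
    if (C == '#' && AtStartOfLine)
      return HashTok;         // parser may read: # <number> "<filename>"
    if (C == '#') {           // mid-line '#': plain comment, eat the body
      while (*Cur && *Cur != '\n' && *Cur != '\r')
        ++Cur;
      return CommentTok;
    }
    AtStartOfLine = false;
    return OtherTok;
  }
};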
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 0c181f3..1648757 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -18,22 +18,22 @@
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCParser/AsmCond.h"
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetAsmParser.h"
#include <cctype>
#include <vector>
using namespace llvm;
@@ -87,6 +87,8 @@ private:
MCStreamer &Out;
const MCAsmInfo &MAI;
SourceMgr &SrcMgr;
+ SourceMgr::DiagHandlerTy SavedDiagHandler;
+ void *SavedDiagContext;
MCAsmParserExtension *GenericParser;
MCAsmParserExtension *PlatformParser;
@@ -115,8 +117,13 @@ private:
/// Flag tracking whether any errors have been encountered.
unsigned HadError : 1;
+ /// The values from the last parsed cpp hash file line comment if any.
+ StringRef CppHashFilename;
+ int64_t CppHashLineNumber;
+ SMLoc CppHashLoc;
+
public:
- AsmParser(const Target &T, SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
+ AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI);
~AsmParser();
@@ -153,6 +160,8 @@ private:
void CheckForValidSection();
bool ParseStatement();
+ void EatToEndOfLine();
+ bool ParseCppHashLineFilenameComment(const SMLoc &L);
bool HandleMacroEntry(StringRef Name, SMLoc NameLoc, const Macro *M);
bool expandMacro(SmallString<256> &Buf, StringRef Body,
@@ -166,6 +175,7 @@ private:
bool ShowLine = true) const {
SrcMgr.PrintMessage(Loc, Msg, Type, ShowLine);
}
+ static void DiagHandler(const SMDiagnostic &Diag, void *Context);
/// EnterIncludeFile - Enter the specified file. This returns true on failure.
bool EnterIncludeFile(const std::string &Filename);
@@ -338,11 +348,16 @@ extern MCAsmParserExtension *createCOFFAsmParser();
enum { DEFAULT_ADDRSPACE = 0 };
-AsmParser::AsmParser(const Target &T, SourceMgr &_SM, MCContext &_Ctx,
+AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
MCStreamer &_Out, const MCAsmInfo &_MAI)
: Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
GenericParser(new GenericAsmParser), PlatformParser(0),
- CurBuffer(0), MacrosEnabled(true) {
+ CurBuffer(0), MacrosEnabled(true), CppHashLineNumber(0) {
+ // Save the old handler.
+ SavedDiagHandler = SrcMgr.getDiagHandler();
+ SavedDiagContext = SrcMgr.getDiagContext();
+ // Set our own handler which calls the saved handler.
+ SrcMgr.setDiagHandler(DiagHandler, this);
Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer));
// Initialize the generic parser.
@@ -740,9 +755,12 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
/// ParseExpression - Parse an expression and return it.
///
-/// expr ::= expr +,- expr -> lowest.
-/// expr ::= expr |,^,&,! expr -> middle.
-/// expr ::= expr *,/,%,<<,>> expr -> highest.
+/// expr ::= expr &&,|| expr -> lowest.
+/// expr ::= expr |,^,&,! expr
+/// expr ::= expr ==,!=,<>,<,<=,>,>= expr
+/// expr ::= expr <<,>> expr
+/// expr ::= expr +,- expr
+/// expr ::= expr *,/,% expr -> highest.
/// expr ::= primaryexpr
///
bool AsmParser::ParseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
@@ -809,7 +827,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
default:
return 0; // not a binop.
- // Lowest Precedence: &&, ||, @
+ // Lowest Precedence: &&, ||
case AsmToken::AmpAmp:
Kind = MCBinaryExpr::LAnd;
return 1;
@@ -852,30 +870,32 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
Kind = MCBinaryExpr::GTE;
return 3;
+ // Intermediate Precedence: <<, >>
+ case AsmToken::LessLess:
+ Kind = MCBinaryExpr::Shl;
+ return 4;
+ case AsmToken::GreaterGreater:
+ Kind = MCBinaryExpr::Shr;
+ return 4;
+
// High Intermediate Precedence: +, -
case AsmToken::Plus:
Kind = MCBinaryExpr::Add;
- return 4;
+ return 5;
case AsmToken::Minus:
Kind = MCBinaryExpr::Sub;
- return 4;
+ return 5;
- // Highest Precedence: *, /, %, <<, >>
+ // Highest Precedence: *, /, %
case AsmToken::Star:
Kind = MCBinaryExpr::Mul;
- return 5;
+ return 6;
case AsmToken::Slash:
Kind = MCBinaryExpr::Div;
- return 5;
+ return 6;
case AsmToken::Percent:
Kind = MCBinaryExpr::Mod;
- return 5;
- case AsmToken::LessLess:
- Kind = MCBinaryExpr::Shl;
- return 5;
- case AsmToken::GreaterGreater:
- Kind = MCBinaryExpr::Shr;
- return 5;
+ return 6;
}
}
@@ -932,10 +952,8 @@ bool AsmParser::ParseStatement() {
StringRef IDVal;
int64_t LocalLabelVal = -1;
// A full line comment is a '#' as the first token.
- if (Lexer.is(AsmToken::Hash)) {
- EatToEndOfStatement();
- return false;
- }
+ if (Lexer.is(AsmToken::Hash))
+ return ParseCppHashLineFilenameComment(IDLoc);
// Allow an integer followed by a ':' as a directional local label.
if (Lexer.is(AsmToken::Integer)) {
@@ -1117,15 +1135,8 @@ bool AsmParser::ParseStatement() {
if (IDVal == ".globl" || IDVal == ".global")
return ParseDirectiveSymbolAttribute(MCSA_Global);
- // ELF only? Should it be here?
- if (IDVal == ".local")
- return ParseDirectiveSymbolAttribute(MCSA_Local);
- if (IDVal == ".hidden")
- return ParseDirectiveSymbolAttribute(MCSA_Hidden);
if (IDVal == ".indirect_symbol")
return ParseDirectiveSymbolAttribute(MCSA_IndirectSymbol);
- if (IDVal == ".internal")
- return ParseDirectiveSymbolAttribute(MCSA_Internal);
if (IDVal == ".lazy_reference")
return ParseDirectiveSymbolAttribute(MCSA_LazyReference);
if (IDVal == ".no_dead_strip")
@@ -1134,12 +1145,8 @@ bool AsmParser::ParseStatement() {
return ParseDirectiveSymbolAttribute(MCSA_SymbolResolver);
if (IDVal == ".private_extern")
return ParseDirectiveSymbolAttribute(MCSA_PrivateExtern);
- if (IDVal == ".protected")
- return ParseDirectiveSymbolAttribute(MCSA_Protected);
if (IDVal == ".reference")
return ParseDirectiveSymbolAttribute(MCSA_Reference);
- if (IDVal == ".weak")
- return ParseDirectiveSymbolAttribute(MCSA_Weak);
if (IDVal == ".weak_definition")
return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition);
if (IDVal == ".weak_reference")
@@ -1157,7 +1164,7 @@ bool AsmParser::ParseStatement() {
if (IDVal == ".include")
return ParseDirectiveInclude();
- if (IDVal == ".code16" || IDVal == ".code32" || IDVal == ".code64")
+ if (IDVal == ".code16")
return TokError(Twine(IDVal) + " not supported yet");
// Look up the handler in the handler table.
@@ -1215,6 +1222,108 @@ bool AsmParser::ParseStatement() {
return false;
}
+/// EatToEndOfLine uses the Lexer to consume the characters up to the end of
+/// the line, since they may not be tokenizable into an end-of-line token.
+void AsmParser::EatToEndOfLine() {
+ if (!Lexer.is(AsmToken::EndOfStatement))
+ Lexer.LexUntilEndOfLine();
+ // Eat EOL.
+ Lex();
+}
+
+/// ParseCppHashLineFilenameComment parses a comment of the form:
+/// ::= # number "filename"
+/// or treats it as a plain full line comment if it lacks a number and string.
+bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
+ Lex(); // Eat the hash token.
+
+ if (getLexer().isNot(AsmToken::Integer)) {
+ // Consume the line since in cases it is not a well-formed line directive,
+ // as if were simply a full line comment.
+ EatToEndOfLine();
+ return false;
+ }
+
+ int64_t LineNumber = getTok().getIntVal();
+ Lex();
+
+ if (getLexer().isNot(AsmToken::String)) {
+ EatToEndOfLine();
+ return false;
+ }
+
+ StringRef Filename = getTok().getString();
+ // Get rid of the enclosing quotes.
+ Filename = Filename.substr(1, Filename.size()-2);
+
+ // Save the SMLoc, Filename and LineNumber for later use by diagnostics.
+ CppHashLoc = L;
+ CppHashFilename = Filename;
+ CppHashLineNumber = LineNumber;
+
+  // Ignore any trailing characters; they're just comment text.
+ EatToEndOfLine();
+ return false;
+}
+
+/// DiagHandler - uses the last parsed cpp hash line filename comment
+/// for the Filename and LineNo, if any, in the diagnostic.
+void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
+ const AsmParser *Parser = static_cast<const AsmParser*>(Context);
+ raw_ostream &OS = errs();
+
+ const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
+ const SMLoc &DiagLoc = Diag.getLoc();
+ int DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+ int CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
+
+  // Like SourceMgr::PrintMessage(), we need to print the include stack, if
+  // any, before printing the message.
+ int DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+ if (!Parser->SavedDiagHandler && DiagCurBuffer > 0) {
+ SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
+ DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
+ }
+
+  // If we have not parsed a cpp hash line filename comment, or the source
+  // manager or buffer changed (as in a nested include), just print the
+  // normal diagnostic using its Filename and LineNo.
+ if (!Parser->CppHashLineNumber ||
+ &DiagSrcMgr != &Parser->SrcMgr ||
+ DiagBuf != CppHashBuf) {
+ if (Parser->SavedDiagHandler)
+ Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
+ else
+ Diag.Print(0, OS);
+ return;
+ }
+
+ // Use the CppHashFilename and calculate a line number based on the
+ // CppHashLoc and CppHashLineNumber relative to this Diag's SMLoc for
+ // the diagnostic.
+ const std::string Filename = Parser->CppHashFilename;
+
+ int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
+ int CppHashLocLineNo =
+ Parser->SrcMgr.FindLineNumber(Parser->CppHashLoc, CppHashBuf);
+ int LineNo = Parser->CppHashLineNumber - 1 +
+ (DiagLocLineNo - CppHashLocLineNo);
+
+ SMDiagnostic NewDiag(*Diag.getSourceMgr(),
+ Diag.getLoc(),
+ Filename,
+ LineNo,
+ Diag.getColumnNo(),
+ Diag.getMessage(),
+ Diag.getLineContents(),
+ Diag.getShowLine());
+
+ if (Parser->SavedDiagHandler)
+ Parser->SavedDiagHandler(NewDiag, Parser->SavedDiagContext);
+ else
+ NewDiag.Print(0, OS);
+}
+
bool AsmParser::expandMacro(SmallString<256> &Buf, StringRef Body,
const std::vector<StringRef> &Parameters,
const std::vector<std::vector<AsmToken> > &A,
@@ -1923,12 +2032,17 @@ bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
StringRef Name;
+ SMLoc Loc = getTok().getLoc();
if (ParseIdentifier(Name))
- return TokError("expected identifier in directive");
+ return Error(Loc, "expected identifier in directive");
MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+ // Assembler local symbols don't make any sense here. Complain loudly.
+ if (Sym->isTemporary())
+ return Error(Loc, "non-local symbol required in directive");
+
getStreamer().EmitSymbolAttribute(Sym, Attr);
if (getLexer().is(AsmToken::EndOfStatement))
@@ -2416,7 +2530,7 @@ bool GenericAsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
if (getParser().getTargetParser().ParseRegister(RegNo, DirectiveLoc,
DirectiveLoc))
return true;
- Register = getContext().getTargetAsmInfo().getDwarfRegNum(RegNo, true);
+ Register = getContext().getRegisterInfo().getDwarfRegNum(RegNo, true);
} else
return getParser().ParseAbsoluteExpression(Register);
@@ -2724,8 +2838,8 @@ bool GenericAsmParser::ParseDirectiveLEB128(StringRef DirName, SMLoc) {
/// \brief Create an MCAsmParser instance.
-MCAsmParser *llvm::createMCAsmParser(const Target &T, SourceMgr &SM,
+MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM,
MCContext &C, MCStreamer &Out,
const MCAsmInfo &MAI) {
- return new AsmParser(T, SM, C, Out, MAI);
+ return new AsmParser(SM, C, Out, MAI);
}
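
The renumbered table gives six binding levels: 1 for &&/||, 2 for |/^/&/!, 3 for comparisons, 4 for <</>>, 5 for +/-, 6 for */ /%. A compact precedence-climbing sketch over a slice of that table (integer-only toy; the real parser builds MCBinaryExpr nodes and covers every operator):

#include <cstdio>

static const char *P; // cursor into the expression string

static long parsePrimary() {
  long V = 0;
  while (*P >= '0' && *P <= '9')
    V = V * 10 + (*P++ - '0');
  return V;
}

static int precOf(char C) {
  switch (C) {
  case '+': case '-':           return 5;
  case '*': case '/': case '%': return 6;
  default:                      return 0; // not a binop in this toy
  }
}

static long parseBinOpRHS(int MinPrec, long LHS) {
  for (;;) {
    int Prec = precOf(*P);
    if (Prec < MinPrec || Prec == 0)
      return LHS;
    char Op = *P++;
    long RHS = parsePrimary();
    while (precOf(*P) > Prec)          // a tighter operator takes RHS first
      RHS = parseBinOpRHS(Prec + 1, RHS);
    switch (Op) {
    case '+': LHS += RHS; break;
    case '-': LHS -= RHS; break;
    case '*': LHS *= RHS; break;
    case '/': LHS /= RHS; break;
    case '%': LHS %= RHS; break;
    }
  }
}

int main() {
  P = "2+3*4-10%4"; // 2 + (3*4) - (10%4) = 12
  std::printf("%ld\n", parseBinOpRHS(1, parsePrimary()));
}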
diff --git a/lib/MC/MCParser/CMakeLists.txt b/lib/MC/MCParser/CMakeLists.txt
index eaea9f6..299d281 100644
--- a/lib/MC/MCParser/CMakeLists.txt
+++ b/lib/MC/MCParser/CMakeLists.txt
@@ -7,5 +7,10 @@ add_llvm_library(LLVMMCParser
MCAsmLexer.cpp
MCAsmParser.cpp
MCAsmParserExtension.cpp
- TargetAsmParser.cpp
+ MCTargetAsmParser.cpp
+ )
+
+add_llvm_library_dependencies(LLVMMCParser
+ LLVMMC
+ LLVMSupport
)
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index 66ad384..185b516 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -8,15 +8,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/COFF.h"
using namespace llvm;
@@ -72,6 +73,7 @@ class COFFAsmParser : public MCAsmParserExtension {
".seh_pushframe");
AddDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveEndProlog>(
".seh_endprologue");
+ AddDirectiveHandler<&COFFAsmParser::ParseDirectiveSymbolAttribute>(".weak");
}
bool ParseSectionDirectiveText(StringRef, SMLoc) {
@@ -118,12 +120,44 @@ class COFFAsmParser : public MCAsmParserExtension {
bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except);
bool ParseSEHRegisterNumber(unsigned &RegNo);
+ bool ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc);
public:
COFFAsmParser() {}
};
} // end anonymous namespace.
+/// ParseDirectiveSymbolAttribute
+/// ::= { ".weak", ... } [ identifier ( , identifier )* ]
+bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
+ MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Directive)
+ .Case(".weak", MCSA_Weak)
+ .Default(MCSA_Invalid);
+ assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ StringRef Name;
+
+ if (getParser().ParseIdentifier(Name))
+ return TokError("expected identifier in directive");
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+ getStreamer().EmitSymbolAttribute(Sym, Attr);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+ }
+ }
+
+ Lex();
+ return false;
+}
+
bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
unsigned Characteristics,
SectionKind Kind) {
@@ -401,12 +435,16 @@ bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) {
bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) {
SMLoc startLoc = getLexer().getLoc();
if (getLexer().is(AsmToken::Percent)) {
- const TargetAsmInfo &TAI = getContext().getTargetAsmInfo();
+ const MCRegisterInfo &MRI = getContext().getRegisterInfo();
SMLoc endLoc;
unsigned LLVMRegNo;
if (getParser().getTargetParser().ParseRegister(LLVMRegNo,startLoc,endLoc))
return true;
+#if 0
+ // FIXME: TargetAsmInfo::getCalleeSavedRegs() commits a serious layering
+ // violation so this validation code is disabled.
+
// Check that this is a non-volatile register.
const unsigned *NVRegs = TAI.getCalleeSavedRegs();
unsigned i;
@@ -415,8 +453,9 @@ bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) {
break;
if (NVRegs[i] == 0)
return Error(startLoc, "expected non-volatile register");
+#endif
- int SEHRegNo = TAI.getSEHRegNum(LLVMRegNo);
+ int SEHRegNo = MRI.getSEHRegNum(LLVMRegNo);
if (SEHRegNo < 0)
return Error(startLoc,"register can't be represented in SEH unwind info");
RegNo = SEHRegNo;
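
The new .weak handler keys the directive text through llvm::StringSwitch. A minimal standalone use of the same idiom (int values stand in for MCSymbolAttr):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

int classifyDirective(llvm::StringRef Directive) {
  return llvm::StringSwitch<int>(Directive)
      .Case(".weak", 1)
      .Case(".local", 2)
      .Default(0); // unknown directive
}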
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index dcf689a..d891126 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -47,12 +47,17 @@ public:
AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveRoData>(".rodata");
AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTData>(".tdata");
AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveTBSS>(".tbss");
- AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveDataRel>(".data.rel");
- AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro");
- AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveDataRelRoLocal>(".data.rel.ro.local");
- AddDirectiveHandler<&ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseSectionDirectiveDataRel>(".data.rel");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseSectionDirectiveDataRelRo>(".data.rel.ro");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseSectionDirectiveDataRelRoLocal>(".data.rel.ro.local");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseSectionDirectiveEhFrame>(".eh_frame");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSection>(".section");
- AddDirectiveHandler<&ELFAsmParser::ParseDirectivePushSection>(".pushsection");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseDirectivePushSection>(".pushsection");
AddDirectiveHandler<&ELFAsmParser::ParseDirectivePopSection>(".popsection");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSize>(".size");
AddDirectiveHandler<&ELFAsmParser::ParseDirectivePrevious>(".previous");
@@ -60,6 +65,14 @@ public:
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveIdent>(".ident");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymver>(".symver");
AddDirectiveHandler<&ELFAsmParser::ParseDirectiveWeakref>(".weakref");
+ AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".weak");
+ AddDirectiveHandler<&ELFAsmParser::ParseDirectiveSymbolAttribute>(".local");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseDirectiveSymbolAttribute>(".protected");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseDirectiveSymbolAttribute>(".internal");
+ AddDirectiveHandler<
+ &ELFAsmParser::ParseDirectiveSymbolAttribute>(".hidden");
}
// FIXME: Part of this logic is duplicated in the MCELFStreamer. What is
@@ -129,6 +142,7 @@ public:
bool ParseDirectiveIdent(StringRef, SMLoc);
bool ParseDirectiveSymver(StringRef, SMLoc);
bool ParseDirectiveWeakref(StringRef, SMLoc);
+ bool ParseDirectiveSymbolAttribute(StringRef, SMLoc);
private:
bool ParseSectionName(StringRef &SectionName);
@@ -136,6 +150,41 @@ private:
}
+/// ParseDirectiveSymbolAttribute
+/// ::= { ".local", ".weak", ... } [ identifier ( , identifier )* ]
+bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
+ MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Directive)
+ .Case(".weak", MCSA_Weak)
+ .Case(".local", MCSA_Local)
+ .Case(".hidden", MCSA_Hidden)
+ .Case(".internal", MCSA_Internal)
+ .Case(".protected", MCSA_Protected)
+ .Default(MCSA_Invalid);
+ assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ for (;;) {
+ StringRef Name;
+
+ if (getParser().ParseIdentifier(Name))
+ return TokError("expected identifier in directive");
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+ getStreamer().EmitSymbolAttribute(Sym, Attr);
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+ }
+ }
+
+ Lex();
+ return false;
+}
+
bool ELFAsmParser::ParseSectionSwitch(StringRef Section, unsigned Type,
unsigned Flags, SectionKind Kind) {
if (getLexer().isNot(AsmToken::EndOfStatement))
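
ELFAsmParser now registers one member function for five directives and re-reads the directive string on dispatch. A toy model of that table (hypothetical Parser class; the real table lives behind AddDirectiveHandler in MCAsmParserExtension):

#include <initializer_list>
#include <map>
#include <string>

class Parser {
  typedef bool (Parser::*Handler)(const std::string &);
  std::map<std::string, Handler> Table;

  bool parseSymbolAttribute(const std::string &Directive) {
    // One body serves ".weak", ".local", ".hidden", ... -- mirroring the
    // StringSwitch in ParseDirectiveSymbolAttribute above.
    return !Directive.empty();
  }

public:
  Parser() {
    for (const char *D : {".weak", ".local", ".hidden", ".internal",
                          ".protected"})
      Table[D] = &Parser::parseSymbolAttribute;
  }
  bool dispatch(const std::string &Directive) {
    std::map<std::string, Handler>::iterator It = Table.find(Directive);
    return It != Table.end() && (this->*(It->second))(Directive);
  }
};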
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index 4030e41..5239ec7 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -8,13 +8,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/ADT/Twine.h"
using namespace llvm;
MCAsmParser::MCAsmParser() : TargetParser(0), ShowParsedOperands(0) {
@@ -23,7 +23,7 @@ MCAsmParser::MCAsmParser() : TargetParser(0), ShowParsedOperands(0) {
MCAsmParser::~MCAsmParser() {
}
-void MCAsmParser::setTargetParser(TargetAsmParser &P) {
+void MCAsmParser::setTargetParser(MCTargetAsmParser &P) {
assert(!TargetParser && "Target parser is already initialized!");
TargetParser = &P;
TargetParser->Initialize(*this);
diff --git a/lib/MC/MCParser/TargetAsmParser.cpp b/lib/MC/MCParser/MCTargetAsmParser.cpp
index 512f6b0..6fb1ba4 100644
--- a/lib/MC/MCParser/TargetAsmParser.cpp
+++ b/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- TargetAsmParser.cpp - Target Assembly Parser -----------------------==//
+//===-- MCTargetAsmParser.cpp - Target Assembly Parser ---------------------==//
//
// The LLVM Compiler Infrastructure
//
@@ -7,13 +7,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/MC/MCTargetAsmParser.h"
using namespace llvm;
-TargetAsmParser::TargetAsmParser()
+MCTargetAsmParser::MCTargetAsmParser()
: AvailableFeatures(0)
{
}
-TargetAsmParser::~TargetAsmParser() {
+MCTargetAsmParser::~MCTargetAsmParser() {
}
diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp
index 6098e6b..086c922 100644
--- a/lib/MC/MCPureStreamer.cpp
+++ b/lib/MC/MCPureStreamer.cpp
@@ -28,7 +28,7 @@ private:
virtual void EmitInstToData(const MCInst &Inst);
public:
- MCPureStreamer(MCContext &Context, TargetAsmBackend &TAB,
+ MCPureStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter)
: MCObjectStreamer(Context, TAB, OS, Emitter) {}
@@ -86,7 +86,8 @@ public:
virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
report_fatal_error("unsupported directive in pure streamer");
}
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
report_fatal_error("unsupported directive in pure streamer");
}
virtual void EmitFileDirective(StringRef Filename) {
@@ -228,7 +229,7 @@ void MCPureStreamer::Finish() {
this->MCObjectStreamer::Finish();
}
-MCStreamer *llvm::createPureStreamer(MCContext &Context, TargetAsmBackend &TAB,
+MCStreamer *llvm::createPureStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_ostream &OS, MCCodeEmitter *CE) {
- return new MCPureStreamer(Context, TAB, OS, CE);
+ return new MCPureStreamer(Context, MAB, OS, CE);
}
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 6e96b78..3afa22b 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -16,13 +16,17 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include <cstdlib>
using namespace llvm;
MCStreamer::MCStreamer(MCContext &Ctx) : Context(Ctx), EmitEHFrame(true),
EmitDebugFrame(false),
- CurrentW64UnwindInfo(0) {
+ CurrentW64UnwindInfo(0),
+ LastSymbol(0),
+ UniqueCodeBeginSuffix(0),
+ UniqueDataBeginSuffix(0) {
const MCSection *section = NULL;
SectionStack.push_back(std::make_pair(section, section));
}
@@ -171,10 +175,94 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) {
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
assert(getCurrentSection() && "Cannot emit before setting section!");
Symbol->setSection(*getCurrentSection());
+ LastSymbol = Symbol;
+}
- StringRef Prefix = getContext().getAsmInfo().getPrivateGlobalPrefix();
- if (!Symbol->getName().startswith(Prefix))
- LastNonPrivate = Symbol;
+void MCStreamer::EmitDataRegion() {
+ if (RegionIndicator == Data) return;
+
+ MCContext &Context = getContext();
+ const MCAsmInfo &MAI = Context.getAsmInfo();
+ if (!MAI.getSupportsDataRegions()) return;
+
+ // Generate a unique symbol name.
+ MCSymbol *NewSym = Context.GetOrCreateSymbol(
+ Twine(MAI.getDataBeginLabelName()) +
+ utostr(UniqueDataBeginSuffix++));
+ EmitLabel(NewSym);
+
+ RegionIndicator = Data;
+}
+
+void MCStreamer::EmitCodeRegion() {
+ if (RegionIndicator == Code) return;
+
+ MCContext &Context = getContext();
+ const MCAsmInfo &MAI = Context.getAsmInfo();
+ if (!MAI.getSupportsDataRegions()) return;
+
+ // Generate a unique symbol name.
+ MCSymbol *NewSym = Context.GetOrCreateSymbol(
+ Twine(MAI.getCodeBeginLabelName()) +
+ utostr(UniqueCodeBeginSuffix++));
+ EmitLabel(NewSym);
+
+ RegionIndicator = Code;
+}
+
+void MCStreamer::EmitJumpTable8Region() {
+ if (RegionIndicator == JumpTable8) return;
+
+ MCContext &Context = getContext();
+ const MCAsmInfo &MAI = Context.getAsmInfo();
+ if (!MAI.getSupportsDataRegions()) return;
+
+ // Generate a unique symbol name.
+ MCSymbol *NewSym = Context.GetOrCreateSymbol(
+ Twine(MAI.getJumpTable8BeginLabelName()) +
+ utostr(UniqueDataBeginSuffix++));
+ EmitLabel(NewSym);
+
+ RegionIndicator = JumpTable8;
+}
+
+void MCStreamer::EmitJumpTable16Region() {
+ if (RegionIndicator == JumpTable16) return;
+
+ MCContext &Context = getContext();
+ const MCAsmInfo &MAI = Context.getAsmInfo();
+ if (!MAI.getSupportsDataRegions()) return;
+
+ // Generate a unique symbol name.
+ MCSymbol *NewSym = Context.GetOrCreateSymbol(
+ Twine(MAI.getJumpTable16BeginLabelName()) +
+ utostr(UniqueDataBeginSuffix++));
+ EmitLabel(NewSym);
+
+ RegionIndicator = JumpTable16;
+}
+
+
+void MCStreamer::EmitJumpTable32Region() {
+ if (RegionIndicator == JumpTable32) return;
+
+ MCContext &Context = getContext();
+ const MCAsmInfo &MAI = Context.getAsmInfo();
+ if (!MAI.getSupportsDataRegions()) return;
+
+ // Generate a unique symbol name.
+ MCSymbol *NewSym = Context.GetOrCreateSymbol(
+ Twine(MAI.getJumpTable32BeginLabelName()) +
+ utostr(UniqueDataBeginSuffix++));
+ EmitLabel(NewSym);
+
+ RegionIndicator = JumpTable32;
+}
+
+void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) {
+ EnsureValidFrame();
+ MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+ CurFrame->CompactUnwindEncoding = CompactUnwindEncoding;
}
void MCStreamer::EmitCFISections(bool EH, bool Debug) {
@@ -187,11 +275,22 @@ void MCStreamer::EmitCFIStartProc() {
MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
if (CurFrame && !CurFrame->End)
report_fatal_error("Starting a frame before finishing the previous one!");
+
MCDwarfFrameInfo Frame;
- Frame.Begin = getContext().CreateTempSymbol();
- Frame.Function = LastNonPrivate;
- EmitLabel(Frame.Begin);
+ Frame.Function = LastSymbol;
+
+ // If the function is externally visible, we need to create a local
+ // symbol to avoid relocations.
+ StringRef Prefix = getContext().getAsmInfo().getPrivateGlobalPrefix();
+ if (LastSymbol && LastSymbol->getName().startswith(Prefix)) {
+ Frame.Begin = LastSymbol;
+ } else {
+ Frame.Begin = getContext().CreateTempSymbol();
+ EmitLabel(Frame.Begin);
+ }
+
FrameInfos.push_back(Frame);
+ RegionIndicator = Code;
}
void MCStreamer::EmitCFIEndProc() {
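
The four jump-table/data/code emitters above share one shape: bail if already in the region, mint a unique begin label, record the new region. A hedged refactor sketch (hypothetical helper, not the LLVM 3.0 API):

#include <string>

enum Region { Code, Data, JumpTable8, JumpTable16, JumpTable32 };

struct RegionEmitter {
  Region Current = Code;
  unsigned UniqueSuffix = 0;

  // LabelStem stands in for the MAI.get*BeginLabelName() calls above.
  void emitRegion(Region R, const std::string &LabelStem) {
    if (Current == R)
      return;                             // already in this kind of region
    emitLabel(LabelStem + std::to_string(UniqueSuffix++));
    Current = R;
  }
  void emitLabel(const std::string & /*Name*/) { /* stream the begin label */ }
};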
diff --git a/lib/Target/TargetAsmLexer.cpp b/lib/MC/MCTargetAsmLexer.cpp
index d4893ff..c01c914 100644
--- a/lib/Target/TargetAsmLexer.cpp
+++ b/lib/MC/MCTargetAsmLexer.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/Target/TargetAsmLexer.cpp - Target Assembly Lexer ------------===//
+//===-- llvm/MC/MCTargetAsmLexer.cpp - Target Assembly Lexer --------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,8 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmLexer.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
using namespace llvm;
-TargetAsmLexer::TargetAsmLexer(const Target &T) : TheTarget(T), Lexer(NULL) {}
-TargetAsmLexer::~TargetAsmLexer() {}
+MCTargetAsmLexer::MCTargetAsmLexer(const Target &T)
+ : TheTarget(T), Lexer(NULL) {
+}
+MCTargetAsmLexer::~MCTargetAsmLexer() {}
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index e698384..79e66fc 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -10,10 +10,11 @@
#include "llvm/MC/MCWin64EH.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/ADT/Twine.h"
namespace llvm {
@@ -220,14 +221,36 @@ StringRef MCWin64EHUnwindEmitter::GetSectionSuffix(const MCSymbol *func) {
return "";
}
+static const MCSection *getWin64EHTableSection(StringRef suffix,
+ MCContext &context) {
+ if (suffix == "")
+ return context.getObjectFileInfo()->getXDataSection();
+
+ return context.getCOFFSection((".xdata"+suffix).str(),
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+}
+
+static const MCSection *getWin64EHFuncTableSection(StringRef suffix,
+ MCContext &context) {
+ if (suffix == "")
+ return context.getObjectFileInfo()->getPDataSection();
+ return context.getCOFFSection((".pdata"+suffix).str(),
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+}
+
void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
MCWin64EHUnwindInfo *info) {
// Switch sections (the static function above is meant to be called from
// here and from Emit()).
MCContext &context = streamer.getContext();
- const TargetAsmInfo &TAI = context.getTargetAsmInfo();
const MCSection *xdataSect =
- TAI.getWin64EHTableSection(GetSectionSuffix(info->Function));
+ getWin64EHTableSection(GetSectionSuffix(info->Function), context);
streamer.SwitchSection(xdataSect);
llvm::EmitUnwindInfo(streamer, info);
@@ -236,11 +259,10 @@ void MCWin64EHUnwindEmitter::EmitUnwindInfo(MCStreamer &streamer,
void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) {
MCContext &context = streamer.getContext();
// Emit the unwind info structs first.
- const TargetAsmInfo &TAI = context.getTargetAsmInfo();
for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
const MCSection *xdataSect =
- TAI.getWin64EHTableSection(GetSectionSuffix(info.Function));
+ getWin64EHTableSection(GetSectionSuffix(info.Function), context);
streamer.SwitchSection(xdataSect);
llvm::EmitUnwindInfo(streamer, &info);
}
@@ -248,7 +270,7 @@ void MCWin64EHUnwindEmitter::Emit(MCStreamer &streamer) {
for (unsigned i = 0; i < streamer.getNumW64UnwindInfos(); ++i) {
MCWin64EHUnwindInfo &info = streamer.getW64UnwindInfo(i);
const MCSection *pdataSect =
- TAI.getWin64EHFuncTableSection(GetSectionSuffix(info.Function));
+ getWin64EHFuncTableSection(GetSectionSuffix(info.Function), context);
streamer.SwitchSection(pdataSect);
EmitRuntimeFunction(streamer, &info);
}
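
Both helpers follow the same fallback: an empty suffix selects the default section from MCObjectFileInfo, anything else creates a suffixed COFF section. The naming rule in isolation (illustrative only; the "$foo" suffix is a hypothetical example):

#include <string>

std::string unwindTableSectionName(const std::string &Suffix) {
  return Suffix.empty() ? std::string(".xdata") : ".xdata" + Suffix;
}
// unwindTableSectionName("")     -> ".xdata"     (shared default section)
// unwindTableSectionName("$foo") -> ".xdata$foo" (per-function section)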
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 69efe23..a9219ad 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCObjectWriter.h"
@@ -21,7 +22,6 @@
#include "llvm/MC/MCValue.h"
#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetAsmBackend.h"
#include <vector>
using namespace llvm;
@@ -291,7 +291,7 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
const MCSymbol &Symbol = Data.getSymbol();
uint8_t Type = 0;
uint16_t Flags = Data.getFlags();
- uint32_t Address = 0;
+ uint64_t Address = 0;
// Set the N_TYPE bits. See <mach-o/nlist.h>.
//
@@ -590,14 +590,28 @@ IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
return false;
return true;
}
+  // For Darwin x86_64, there is one special case when the reference IsPCRel:
+  // if the fragment with the reference does not have a base symbol, but the
+  // referenced symbol is a temporary in the same atom, the reference is
+  // assumed to be fully resolved. This is needed so a relocation entry is
+  // not created and the static linker does not mess up the reference later.
+  else if (!FB.getAtom() &&
+           SA.isTemporary() && SA.isInSection() && &SecA == &SecB) {
+    return true;
+  }
} else {
if (!TargetObjectWriter->useAggressiveSymbolFolding())
return false;
}
- const MCFragment &FA = *Asm.getSymbolData(SA).getFragment();
+ const MCFragment *FA = Asm.getSymbolData(SA).getFragment();
+
+ // Bail if the symbol has no fragment.
+ if (!FA)
+ return false;
- A_Base = FA.getAtom();
+ A_Base = FA->getAtom();
if (!A_Base)
return false;
@@ -613,7 +627,8 @@ IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
return false;
}
-void MachObjectWriter::WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) {
+void MachObjectWriter::WriteObject(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
unsigned NumSections = Asm.size();
// The section data starts after the header, the segment load command (and
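
The uint32_t -> uint64_t widening of Address in WriteNlist matters on x86_64: Mach-O symbol addresses no longer fit in 32 bits. A two-line demonstration of the truncation the old type caused:

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t SymAddr = 0x100001000ULL;                // typical 64-bit text address
  uint32_t Narrow = static_cast<uint32_t>(SymAddr); // the old field type
  std::printf("%llx -> %x\n", (unsigned long long)SymAddr, Narrow);
  // prints: 100001000 -> 1000
}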
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 101237a..b15e225 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -33,7 +33,7 @@
#include "llvm/Support/TimeValue.h"
-#include "../Target/X86/X86FixupKinds.h"
+#include "../Target/X86/MCTargetDesc/X86FixupKinds.h"
#include <cstdio>
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 6c36c12..7409daf 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -24,13 +24,13 @@
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCWin64EH.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmBackend.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -40,7 +40,7 @@ public:
MCSymbol const *CurSymbol;
WinCOFFStreamer(MCContext &Context,
- TargetAsmBackend &TAB,
+ MCAsmBackend &MAB,
MCCodeEmitter &CE,
raw_ostream &OS);
@@ -63,7 +63,8 @@ public:
virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value);
virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment);
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size);
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
unsigned Size,unsigned ByteAlignment);
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
@@ -123,10 +124,10 @@ private:
} // end anonymous namespace.
WinCOFFStreamer::WinCOFFStreamer(MCContext &Context,
- TargetAsmBackend &TAB,
+ MCAsmBackend &MAB,
MCCodeEmitter &CE,
raw_ostream &OS)
- : MCObjectStreamer(Context, TAB, OS, &CE)
+ : MCObjectStreamer(Context, MAB, OS, &CE)
, CurSymbol(NULL) {
}
@@ -304,11 +305,12 @@ void WinCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
AddCommonSymbol(Symbol, Size, ByteAlignment, true);
}
-void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {
+void WinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
assert((Symbol->isInSection()
? Symbol->getSection().getVariant() == MCSection::SV_COFF
: true) && "Got non COFF section in the COFF backend!");
- AddCommonSymbol(Symbol, Size, 1, false);
+ AddCommonSymbol(Symbol, Size, ByteAlignment, false);
}
void WinCOFFStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
@@ -395,11 +397,11 @@ void WinCOFFStreamer::Finish() {
namespace llvm
{
MCStreamer *createWinCOFFStreamer(MCContext &Context,
- TargetAsmBackend &TAB,
+ MCAsmBackend &MAB,
MCCodeEmitter &CE,
raw_ostream &OS,
bool RelaxAll) {
- WinCOFFStreamer *S = new WinCOFFStreamer(Context, TAB, CE, OS);
+ WinCOFFStreamer *S = new WinCOFFStreamer(Context, MAB, CE, OS);
S->getAssembler().setRelaxAll(RelaxAll);
return S;
}
diff --git a/lib/Makefile b/lib/Makefile
index ed27854..fd575cd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,7 @@ LEVEL = ..
include $(LEVEL)/Makefile.config
PARALLEL_DIRS := VMCore AsmParser Bitcode Archive Analysis Transforms CodeGen \
- Target ExecutionEngine Linker MC CompilerDriver Object
+ Target ExecutionEngine Linker MC Object DebugInfo
include $(LEVEL)/Makefile.common
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
new file mode 100644
index 0000000..e2eaff5
--- /dev/null
+++ b/lib/Object/Archive.cpp
@@ -0,0 +1,172 @@
+//===- Archive.cpp - ar File Format implementation --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Archive class, which implements the ar archive
+// file format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/Archive.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+const StringRef Magic = "!<arch>\n";
+
+struct ArchiveMemberHeader {
+ char Name[16];
+ char LastModified[12];
+ char UID[6];
+ char GID[6];
+ char AccessMode[8];
+ char Size[10]; ///< Size of data, not including header or padding.
+ char Terminator[2];
+
+ /// Get the name without looking up long names.
+ StringRef getName() const {
+ char EndCond = Name[0] == '/' ? ' ' : '/';
+ StringRef::size_type end = StringRef(Name, sizeof(Name)).find(EndCond);
+ if (end == StringRef::npos)
+ end = sizeof(Name);
+ assert(end <= sizeof(Name) && end > 0);
+ // Don't include the EndCond if there is one.
+ return StringRef(Name, end);
+ }
+
+ uint64_t getSize() const {
+ APInt ret;
+ StringRef(Size, sizeof(Size)).getAsInteger(10, ret);
+ return ret.getZExtValue();
+ }
+};
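+
+ // A sketch of the 60-byte ar member header this struct overlays, with
+ // offsets derived from the field widths above (not quoted from a spec):
+ //   0..15 Name, 16..27 LastModified, 28..33 UID, 34..39 GID,
+ //   40..47 AccessMode, 48..57 Size, 58..59 Terminator (conventionally "`\n").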
+
+const ArchiveMemberHeader *ToHeader(const char *base) {
+ return reinterpret_cast<const ArchiveMemberHeader *>(base);
+}
+}
+
+Archive::Child Archive::Child::getNext() const {
+ size_t SpaceToSkip = sizeof(ArchiveMemberHeader) +
+ ToHeader(Data.data())->getSize();
+ // If it's odd, add 1 to make it even.
+ if (SpaceToSkip & 1)
+ ++SpaceToSkip;
+
+ const char *NextLoc = Data.data() + SpaceToSkip;
+
+ // Check to see if this is past the end of the archive.
+ if (NextLoc >= Parent->Data->getBufferEnd())
+ return Child(Parent, StringRef(0, 0));
+
+ size_t NextSize = sizeof(ArchiveMemberHeader) +
+ ToHeader(NextLoc)->getSize();
+
+ return Child(Parent, StringRef(NextLoc, NextSize));
+}
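+
+ // Worked example of the padding rule above (hypothetical member): a member
+ // whose Size field reads "7" spans 60 (header) + 7 = 67 bytes, so
+ // SpaceToSkip rounds up to 68 and the next header begins there.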
+
+error_code Archive::Child::getName(StringRef &Result) const {
+ StringRef name = ToHeader(Data.data())->getName();
+ // Check if it's a special name.
+ if (name[0] == '/') {
+ if (name.size() == 1) { // Linker member.
+ Result = name;
+ return object_error::success;
+ }
+ if (name.size() == 2 && name[1] == '/') { // String table.
+ Result = name;
+ return object_error::success;
+ }
+ // It's a long name.
+ // Get the offset.
+ APInt offset;
+ name.substr(1).getAsInteger(10, offset);
+ // Verify the string table exists before computing an address inside it.
+ if (Parent->StringTable == Parent->end_children())
+ return object_error::parse_failed;
+ const char *addr = Parent->StringTable->Data.begin()
+ + sizeof(ArchiveMemberHeader)
+ + offset.getZExtValue();
+ // Verify the offset stays within the string table.
+ if (addr < (Parent->StringTable->Data.begin()
+ + sizeof(ArchiveMemberHeader))
+ || addr > (Parent->StringTable->Data.begin()
+ + sizeof(ArchiveMemberHeader)
+ + Parent->StringTable->getSize()))
+ return object_error::parse_failed;
+ Result = addr;
+ return object_error::success;
+ }
+ // It's a simple name.
+ if (name[name.size() - 1] == '/')
+ Result = name.substr(0, name.size() - 1);
+ else
+ Result = name;
+ return object_error::success;
+}
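+
+ // The name forms handled above, illustrated with hypothetical values:
+ //   "/   ..."    -> "/"     linker (symbol table) member
+ //   "//  ..."    -> "//"    long-name string table
+ //   "/18 ..."    -> name stored at offset 18 of the string table
+ //   "foo.o/ ..." -> "foo.o" (terminating '/' already stripped by getName())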
+
+uint64_t Archive::Child::getSize() const {
+ return ToHeader(Data.data())->getSize();
+}
+
+MemoryBuffer *Archive::Child::getBuffer() const {
+ StringRef name;
+ if (getName(name)) return NULL;
+ return MemoryBuffer::getMemBuffer(Data.substr(sizeof(ArchiveMemberHeader),
+ getSize()),
+ name,
+ false);
+}
+
+error_code Archive::Child::getAsBinary(OwningPtr<Binary> &Result) const {
+ OwningPtr<Binary> ret;
+ if (error_code ec =
+ createBinary(getBuffer(), ret))
+ return ec;
+ Result.swap(ret);
+ return object_error::success;
+}
+
+Archive::Archive(MemoryBuffer *source, error_code &ec)
+ : Binary(Binary::isArchive, source)
+ , StringTable(Child(this, StringRef(0, 0))) {
+ // Check for sufficient magic.
+ if (!source || source->getBufferSize()
+ < (8 + sizeof(ArchiveMemberHeader) + 2) // Smallest archive.
+ || StringRef(source->getBufferStart(), 8) != Magic) {
+ ec = object_error::invalid_file_type;
+ return;
+ }
+
+ // Get the string table. It's the 3rd member.
+ child_iterator StrTable = begin_children();
+ child_iterator e = end_children();
+ for (int i = 0; StrTable != e && i < 2; ++StrTable, ++i) {}
+
+ // If the third member exists and is named "//", it is the string table.
+ StringRef name;
+ if (StrTable != e && !StrTable->getName(name) && name == "//")
+ StringTable = StrTable;
+
+ ec = object_error::success;
+}
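+
+ // The size check above spells out the smallest archive accepted here:
+ // 8 bytes of "!<arch>\n" magic plus one 60-byte member header plus 2
+ // further bytes, 70 bytes in all.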
+
+Archive::child_iterator Archive::begin_children() const {
+ const char *Loc = Data->getBufferStart() + Magic.size();
+ size_t Size = sizeof(ArchiveMemberHeader) +
+ ToHeader(Loc)->getSize();
+ return Child(this, StringRef(Loc, Size));
+}
+
+Archive::child_iterator Archive::end_children() const {
+ return Child(this, StringRef(0, 0));
+}
+
+namespace llvm {
+
+} // end namespace llvm
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index 4b31c75..4e528d8 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -17,8 +17,9 @@
#include "llvm/Support/Path.h"
// Include headers for createBinary.
-#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/Archive.h"
#include "llvm/Object/COFF.h"
+#include "llvm/Object/ObjectFile.h"
using namespace llvm;
using namespace object;
@@ -50,6 +51,12 @@ error_code object::createBinary(MemoryBuffer *Source,
static_cast<unsigned>(Source->getBufferSize()));
error_code ec;
switch (type) {
+ case sys::Archive_FileType: {
+ OwningPtr<Binary> ret(new Archive(scopedSource.take(), ec));
+ if (ec) return ec;
+ Result.swap(ret);
+ return object_error::success;
+ }
case sys::ELF_Relocatable_FileType:
case sys::ELF_Executable_FileType:
case sys::ELF_SharedObject_FileType:
@@ -90,7 +97,7 @@ error_code object::createBinary(MemoryBuffer *Source,
error_code object::createBinary(StringRef Path, OwningPtr<Binary> &Result) {
OwningPtr<MemoryBuffer> File;
- if (error_code ec = MemoryBuffer::getFile(Path, File))
+ if (error_code ec = MemoryBuffer::getFileOrSTDIN(Path, File))
return ec;
return createBinary(File.take(), Result);
}
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index 68e5e94..86eb51a 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -1,4 +1,5 @@
add_llvm_library(LLVMObject
+ Archive.cpp
Binary.cpp
COFFObjectFile.cpp
ELFObjectFile.cpp
@@ -8,3 +9,8 @@ add_llvm_library(LLVMObject
Object.cpp
ObjectFile.cpp
)
+
+add_llvm_library_dependencies(LLVMObject
+ LLVMCore
+ LLVMSupport
+ )
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 07de6bc..750c34d 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/COFF.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
@@ -114,7 +115,7 @@ error_code COFFObjectFile::getSymbolNext(DataRefImpl Symb,
return object_error::success;
}
-error_code COFFObjectFile::getSymbolAddress(DataRefImpl Symb,
+error_code COFFObjectFile::getSymbolOffset(DataRefImpl Symb,
uint64_t &Result) const {
const coff_symbol *symb = toSymb(Symb);
const coff_section *Section = NULL;
@@ -132,6 +133,55 @@ error_code COFFObjectFile::getSymbolAddress(DataRefImpl Symb,
return object_error::success;
}
+error_code COFFObjectFile::getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Result) const {
+ const coff_symbol *symb = toSymb(Symb);
+ const coff_section *Section = NULL;
+ if (error_code ec = getSection(symb->SectionNumber, Section))
+ return ec;
+ char Type;
+ if (error_code ec = getSymbolNMTypeChar(Symb, Type))
+ return ec;
+ if (Type == 'U' || Type == 'w')
+ Result = UnknownAddressOrSize;
+ else if (Section)
+ Result = reinterpret_cast<uintptr_t>(base() +
+ Section->PointerToRawData +
+ symb->Value);
+ else
+ Result = reinterpret_cast<uintptr_t>(base() + symb->Value);
+ return object_error::success;
+}
+
+error_code COFFObjectFile::getSymbolType(DataRefImpl Symb,
+ SymbolRef::SymbolType &Result) const {
+ const coff_symbol *symb = toSymb(Symb);
+ Result = SymbolRef::ST_Other;
+ if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL &&
+ symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED) {
+ Result = SymbolRef::ST_External;
+ } else {
+ if (symb->Type.ComplexType == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
+ Result = SymbolRef::ST_Function;
+ } else {
+ char Type;
+ if (error_code ec = getSymbolNMTypeChar(Symb, Type))
+ return ec;
+ if (Type == 'r' || Type == 'R') {
+ Result = SymbolRef::ST_Data;
+ }
+ }
+ }
+ return object_error::success;
+}
+
+error_code COFFObjectFile::isSymbolGlobal(DataRefImpl Symb,
+ bool &Result) const {
+ const coff_symbol *symb = toSymb(Symb);
+ Result = (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL);
+ return object_error::success;
+}
+
error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
uint64_t &Result) const {
// FIXME: Return the correct size. This requires looking at all the symbols
@@ -286,6 +336,15 @@ error_code COFFObjectFile::getSectionContents(DataRefImpl Sec,
return object_error::success;
}
+error_code COFFObjectFile::getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Res) const {
+ const coff_section *sec = toSec(Sec);
+ if (!sec)
+ return object_error::parse_failed;
+ Res = uint64_t(1) << (((sec->Characteristics & 0x00F00000) >> 20) - 1);
+ return object_error::success;
+}
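+
+ // Example of the decoding above: IMAGE_SCN_ALIGN_4BYTES is 0x00300000, so
+ // the nibble is 3 and the alignment is 1 << (3 - 1) = 4 bytes. This assumes
+ // a nonzero nibble; a zero nibble (no alignment specified) would produce an
+ // out-of-range shift.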
+
error_code COFFObjectFile::isSectionText(DataRefImpl Sec,
bool &Result) const {
const coff_section *sec = toSec(Sec);
@@ -293,14 +352,61 @@ error_code COFFObjectFile::isSectionText(DataRefImpl Sec,
return object_error::success;
}
+error_code COFFObjectFile::isSectionData(DataRefImpl Sec,
+ bool &Result) const {
+ const coff_section *sec = toSec(Sec);
+ Result = sec->Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
+ return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionBSS(DataRefImpl Sec,
+ bool &Result) const {
+ const coff_section *sec = toSec(Sec);
+ Result = sec->Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
+ return object_error::success;
+}
+
error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl Sec,
DataRefImpl Symb,
bool &Result) const {
- // FIXME: Unimplemented.
- Result = false;
+ const coff_section *sec = toSec(Sec);
+ const coff_symbol *symb = toSymb(Symb);
+ const coff_section *symb_sec;
+ if (error_code ec = getSection(symb->SectionNumber, symb_sec)) return ec;
+ if (symb_sec == sec)
+ Result = true;
+ else
+ Result = false;
return object_error::success;
}
+relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+ const coff_section *sec = toSec(Sec);
+ DataRefImpl ret;
+ std::memset(&ret, 0, sizeof(ret));
+ if (sec->NumberOfRelocations == 0)
+ ret.p = 0;
+ else
+ ret.p = reinterpret_cast<uintptr_t>(base() + sec->PointerToRelocations);
+
+ return relocation_iterator(RelocationRef(ret, this));
+}
+
+relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+ const coff_section *sec = toSec(Sec);
+ DataRefImpl ret;
+ std::memset(&ret, 0, sizeof(ret));
+ if (sec->NumberOfRelocations == 0)
+ ret.p = 0;
+ else
+ ret.p = reinterpret_cast<uintptr_t>(
+ reinterpret_cast<const coff_relocation*>(
+ base() + sec->PointerToRelocations)
+ + sec->NumberOfRelocations);
+
+ return relocation_iterator(RelocationRef(ret, this));
+}
+
COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
: ObjectFile(Binary::isCOFF, Object, ec) {
// Check that we at least have enough room for a header.
@@ -327,7 +433,7 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
Header = reinterpret_cast<const coff_file_header *>(base() + HeaderStart);
if (!checkAddr(Data, ec, uintptr_t(Header), sizeof(coff_file_header)))
return;
-
+
SectionTable =
reinterpret_cast<const coff_section *>( base()
+ HeaderStart
@@ -360,18 +466,18 @@ COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
ec = object_error::parse_failed;
return;
}
-
+
ec = object_error::success;
}
-ObjectFile::symbol_iterator COFFObjectFile::begin_symbols() const {
+symbol_iterator COFFObjectFile::begin_symbols() const {
DataRefImpl ret;
std::memset(&ret, 0, sizeof(DataRefImpl));
ret.p = reinterpret_cast<intptr_t>(SymbolTable);
return symbol_iterator(SymbolRef(ret, this));
}
-ObjectFile::symbol_iterator COFFObjectFile::end_symbols() const {
+symbol_iterator COFFObjectFile::end_symbols() const {
// The symbol table ends where the string table begins.
DataRefImpl ret;
std::memset(&ret, 0, sizeof(DataRefImpl));
@@ -379,14 +485,14 @@ ObjectFile::symbol_iterator COFFObjectFile::end_symbols() const {
return symbol_iterator(SymbolRef(ret, this));
}
-ObjectFile::section_iterator COFFObjectFile::begin_sections() const {
+section_iterator COFFObjectFile::begin_sections() const {
DataRefImpl ret;
std::memset(&ret, 0, sizeof(DataRefImpl));
ret.p = reinterpret_cast<intptr_t>(SectionTable);
return section_iterator(SectionRef(ret, this));
}
-ObjectFile::section_iterator COFFObjectFile::end_sections() const {
+section_iterator COFFObjectFile::end_sections() const {
DataRefImpl ret;
std::memset(&ret, 0, sizeof(DataRefImpl));
ret.p = reinterpret_cast<intptr_t>(SectionTable + Header->NumberOfSections);
@@ -445,6 +551,121 @@ error_code COFFObjectFile::getString(uint32_t offset,
return object_error::success;
}
+error_code COFFObjectFile::getSymbol(uint32_t index,
+ const coff_symbol *&Result) const {
+ if (index < Header->NumberOfSymbols)
+ Result = SymbolTable + index;
+ else
+ return object_error::parse_failed;
+ return object_error::success;
+}
+
+const coff_relocation *COFFObjectFile::toRel(DataRefImpl Rel) const {
+ return reinterpret_cast<const coff_relocation*>(Rel.p);
+}
+error_code COFFObjectFile::getRelocationNext(DataRefImpl Rel,
+ RelocationRef &Res) const {
+ Rel.p = reinterpret_cast<uintptr_t>(
+ reinterpret_cast<const coff_relocation*>(Rel.p) + 1);
+ Res = RelocationRef(Rel, this);
+ return object_error::success;
+}
+error_code COFFObjectFile::getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const {
+ Res = toRel(Rel)->VirtualAddress;
+ return object_error::success;
+}
+error_code COFFObjectFile::getRelocationSymbol(DataRefImpl Rel,
+ SymbolRef &Res) const {
+ const coff_relocation* R = toRel(Rel);
+ DataRefImpl Symb;
+ Symb.p = reinterpret_cast<uintptr_t>(SymbolTable + R->SymbolTableIndex);
+ Res = SymbolRef(Symb, this);
+ return object_error::success;
+}
+error_code COFFObjectFile::getRelocationType(DataRefImpl Rel,
+ uint32_t &Res) const {
+ const coff_relocation* R = toRel(Rel);
+ Res = R->Type;
+ return object_error::success;
+}
+
+#define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \
+ case COFF::enum: res = #enum; break;
+
+error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
+ const coff_relocation *reloc = toRel(Rel);
+ StringRef res;
+ switch (Header->Machine) {
+ case COFF::IMAGE_FILE_MACHINE_AMD64:
+ switch (reloc->Type) {
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ABSOLUTE);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ADDR64);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ADDR32);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ADDR32NB);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_1);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_2);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_3);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_4);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_REL32_5);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SECTION);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SECREL);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SECREL7);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_TOKEN);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SREL32);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_PAIR);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_SSPAN32);
+ default:
+ res = "Unknown";
+ }
+ break;
+ case COFF::IMAGE_FILE_MACHINE_I386:
+ switch (reloc->Type) {
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_ABSOLUTE);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_DIR16);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_REL16);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_DIR32);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_DIR32NB);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SEG12);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SECTION);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SECREL);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_TOKEN);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_SECREL7);
+ LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_I386_REL32);
+ default:
+ res = "Unknown";
+ }
+ break;
+ default:
+ res = "Unknown";
+ }
+ Result.append(res.begin(), res.end());
+ return object_error::success;
+}
+
+#undef LLVM_COFF_SWITCH_RELOC_TYPE_NAME
+
+error_code COFFObjectFile::getRelocationAdditionalInfo(DataRefImpl Rel,
+ int64_t &Res) const {
+ Res = 0;
+ return object_error::success;
+}
+error_code COFFObjectFile::getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
+ const coff_relocation *reloc = toRel(Rel);
+ const coff_symbol *symb = 0;
+ if (error_code ec = getSymbol(reloc->SymbolTableIndex, symb)) return ec;
+ DataRefImpl sym;
+ ::memset(&sym, 0, sizeof(sym));
+ sym.p = reinterpret_cast<uintptr_t>(symb);
+ StringRef symname;
+ if (error_code ec = getSymbolName(sym, symname)) return ec;
+ Result.append(symname.begin(), symname.end());
+ return object_error::success;
+}
+
namespace llvm {
ObjectFile *ObjectFile::createCOFFObjectFile(MemoryBuffer *Object) {
diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index e2ff4df..257d08c 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp
@@ -14,11 +14,14 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
#include <limits>
#include <utility>
@@ -176,12 +179,89 @@ struct Elf_Sym_Impl : Elf_Sym_Base<target_endianness, is64Bits> {
}
namespace {
+template<support::endianness target_endianness, bool is64Bits, bool isRela>
+struct Elf_Rel_Base;
+
+template<support::endianness target_endianness>
+struct Elf_Rel_Base<target_endianness, false, false> {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, false)
+ Elf_Addr r_offset; // Location (file byte offset, or program virtual addr)
+ Elf_Word r_info; // Symbol table index and type of relocation to apply
+};
+
+template<support::endianness target_endianness>
+struct Elf_Rel_Base<target_endianness, true, false> {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, true)
+ Elf_Addr r_offset; // Location (file byte offset, or program virtual addr)
+ Elf_Xword r_info; // Symbol table index and type of relocation to apply
+};
+
+template<support::endianness target_endianness>
+struct Elf_Rel_Base<target_endianness, false, true> {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, false)
+ Elf_Addr r_offset; // Location (file byte offset, or program virtual addr)
+ Elf_Word r_info; // Symbol table index and type of relocation to apply
+ Elf_Sword r_addend; // Compute value for relocatable field by adding this
+};
+
+template<support::endianness target_endianness>
+struct Elf_Rel_Base<target_endianness, true, true> {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, true)
+ Elf_Addr r_offset; // Location (file byte offset, or program virtual addr)
+ Elf_Xword r_info; // Symbol table index and type of relocation to apply
+ Elf_Sxword r_addend; // Compute value for relocatable field by adding this.
+};
+
+template<support::endianness target_endianness, bool is64Bits, bool isRela>
+struct Elf_Rel_Impl;
+
+template<support::endianness target_endianness, bool isRela>
+struct Elf_Rel_Impl<target_endianness, true, isRela>
+ : Elf_Rel_Base<target_endianness, true, isRela> {
+ using Elf_Rel_Base<target_endianness, true, isRela>::r_info;
+ LLVM_ELF_IMPORT_TYPES(target_endianness, true)
+
+ // These accessors and mutators correspond to the ELF64_R_SYM, ELF64_R_TYPE,
+ // and ELF64_R_INFO macros defined in the ELF specification:
+ uint64_t getSymbol() const { return (r_info >> 32); }
+ unsigned char getType() const {
+ return (unsigned char) (r_info & 0xffffffffL);
+ }
+ void setSymbol(uint64_t s) { setSymbolAndType(s, getType()); }
+ void setType(unsigned char t) { setSymbolAndType(getSymbol(), t); }
+ void setSymbolAndType(uint64_t s, unsigned char t) {
+ r_info = (s << 32) + (t&0xffffffffL);
+ }
+};
+
+template<support::endianness target_endianness, bool isRela>
+struct Elf_Rel_Impl<target_endianness, false, isRela>
+ : Elf_Rel_Base<target_endianness, false, isRela> {
+ using Elf_Rel_Base<target_endianness, false, isRela>::r_info;
+ LLVM_ELF_IMPORT_TYPES(target_endianness, false)
+
+ // These accessors and mutators correspond to the ELF32_R_SYM, ELF32_R_TYPE,
+ // and ELF32_R_INFO macros defined in the ELF specification:
+ uint32_t getSymbol() const { return (r_info >> 8); }
+ unsigned char getType() const { return (unsigned char) (r_info & 0x0ff); }
+ void setSymbol(uint32_t s) { setSymbolAndType(s, getType()); }
+ void setType(unsigned char t) { setSymbolAndType(getSymbol(), t); }
+ void setSymbolAndType(uint32_t s, unsigned char t) {
+ r_info = (s << 8) + t;
+ }
+};
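+
+ // Packing example for the 32-bit form, with hypothetical values: a
+ // relocation against symbol index 5 with type 2 (R_386_PC32) stores
+ // r_info = (5 << 8) + 2 = 0x502, so getSymbol() yields 5 and getType()
+ // yields 2. The 64-bit form above splits r_info at bit 32 instead.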
+
+}
+
+namespace {
template<support::endianness target_endianness, bool is64Bits>
class ELFObjectFile : public ObjectFile {
LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits)
typedef Elf_Shdr_Impl<target_endianness, is64Bits> Elf_Shdr;
typedef Elf_Sym_Impl<target_endianness, is64Bits> Elf_Sym;
+ typedef Elf_Rel_Impl<target_endianness, is64Bits, false> Elf_Rel;
+ typedef Elf_Rel_Impl<target_endianness, is64Bits, true> Elf_Rela;
struct Elf_Ehdr {
unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes
@@ -206,37 +286,81 @@ class ELFObjectFile : public ObjectFile {
unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; }
};
- typedef SmallVector<const Elf_Shdr*, 1> SymbolTableSections_t;
+ typedef SmallVector<const Elf_Shdr*, 1> Sections_t;
+ typedef DenseMap<unsigned, unsigned> IndexMap_t;
+ typedef DenseMap<const Elf_Shdr*, SmallVector<uint32_t, 1> > RelocMap_t;
const Elf_Ehdr *Header;
const Elf_Shdr *SectionHeaderTable;
const Elf_Shdr *dot_shstrtab_sec; // Section header string table.
const Elf_Shdr *dot_strtab_sec; // Symbol header string table.
- SymbolTableSections_t SymbolTableSections;
+ Sections_t SymbolTableSections;
+ IndexMap_t SymbolTableSectionsIndexMap;
+ DenseMap<const Elf_Sym*, ELF::Elf64_Word> ExtendedSymbolTable;
+
+ /// @brief Map sections to an array of relocation sections that reference
+ /// them sorted by section index.
+ RelocMap_t SectionRelocMap;
+
+ /// @brief Get the relocation section that contains \a Rel.
+ const Elf_Shdr *getRelSection(DataRefImpl Rel) const {
+ return getSection(Rel.w.b);
+ }
void validateSymbol(DataRefImpl Symb) const;
+ bool isRelocationHasAddend(DataRefImpl Rel) const;
+ template<typename T>
+ const T *getEntry(uint16_t Section, uint32_t Entry) const;
+ template<typename T>
+ const T *getEntry(const Elf_Shdr *Section, uint32_t Entry) const;
const Elf_Sym *getSymbol(DataRefImpl Symb) const;
const Elf_Shdr *getSection(DataRefImpl index) const;
- const Elf_Shdr *getSection(uint16_t index) const;
- const char *getString(uint16_t section, uint32_t offset) const;
+ const Elf_Shdr *getSection(uint32_t index) const;
+ const Elf_Rel *getRel(DataRefImpl Rel) const;
+ const Elf_Rela *getRela(DataRefImpl Rela) const;
+ const char *getString(uint32_t section, uint32_t offset) const;
const char *getString(const Elf_Shdr *section, uint32_t offset) const;
+ error_code getSymbolName(const Elf_Sym *Symb, StringRef &Res) const;
protected:
virtual error_code getSymbolNext(DataRefImpl Symb, SymbolRef &Res) const;
virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const;
+ virtual error_code getSymbolOffset(DataRefImpl Symb, uint64_t &Res) const;
virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const;
virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const;
virtual error_code getSymbolNMTypeChar(DataRefImpl Symb, char &Res) const;
virtual error_code isSymbolInternal(DataRefImpl Symb, bool &Res) const;
+ virtual error_code isSymbolGlobal(DataRefImpl Symb, bool &Res) const;
+ virtual error_code getSymbolType(DataRefImpl Symb, SymbolRef::SymbolType &Res) const;
virtual error_code getSectionNext(DataRefImpl Sec, SectionRef &Res) const;
virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const;
virtual error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const;
virtual error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const;
virtual error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const;
+ virtual error_code getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const;
virtual error_code isSectionText(DataRefImpl Sec, bool &Res) const;
+ virtual error_code isSectionData(DataRefImpl Sec, bool &Res) const;
+ virtual error_code isSectionBSS(DataRefImpl Sec, bool &Res) const;
virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
bool &Result) const;
+ virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const;
+ virtual relocation_iterator getSectionRelEnd(DataRefImpl Sec) const;
+
+ virtual error_code getRelocationNext(DataRefImpl Rel,
+ RelocationRef &Res) const;
+ virtual error_code getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const;
+ virtual error_code getRelocationSymbol(DataRefImpl Rel,
+ SymbolRef &Res) const;
+ virtual error_code getRelocationType(DataRefImpl Rel,
+ uint32_t &Res) const;
+ virtual error_code getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const;
+ virtual error_code getRelocationAdditionalInfo(DataRefImpl Rel,
+ int64_t &Res) const;
+ virtual error_code getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const;
public:
ELFObjectFile(MemoryBuffer *Object, error_code &ec);
@@ -248,6 +372,11 @@ public:
virtual uint8_t getBytesInAddress() const;
virtual StringRef getFileFormatName() const;
virtual unsigned getArch() const;
+
+ uint64_t getNumSections() const;
+ uint64_t getStringTableIndex() const;
+ ELF::Elf64_Word getSymbolTableIndex(const Elf_Sym *symb) const;
+ const Elf_Shdr *getSection(const Elf_Sym *symb) const;
};
} // end namespace
@@ -299,29 +428,37 @@ error_code ELFObjectFile<target_endianness, is64Bits>
::getSymbolName(DataRefImpl Symb,
StringRef &Result) const {
validateSymbol(Symb);
- const Elf_Sym *symb = getSymbol(Symb);
- if (symb->st_name == 0) {
- const Elf_Shdr *section = getSection(symb->st_shndx);
- if (!section)
- Result = "";
- else
- Result = getString(dot_shstrtab_sec, section->sh_name);
- return object_error::success;
- }
+ const Elf_Sym *symb = getSymbol(Symb);
+ return getSymbolName(symb, Result);
+}
- // Use the default symbol table name section.
- Result = getString(dot_strtab_sec, symb->st_name);
- return object_error::success;
+template<support::endianness target_endianness, bool is64Bits>
+ELF::Elf64_Word ELFObjectFile<target_endianness, is64Bits>
+ ::getSymbolTableIndex(const Elf_Sym *symb) const {
+ if (symb->st_shndx == ELF::SHN_XINDEX)
+ return ExtendedSymbolTable.lookup(symb);
+ return symb->st_shndx;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Shdr *
+ELFObjectFile<target_endianness, is64Bits>
+ ::getSection(const Elf_Sym *symb) const {
+ if (symb->st_shndx == ELF::SHN_XINDEX)
+ return getSection(ExtendedSymbolTable.lookup(symb));
+ if (symb->st_shndx >= ELF::SHN_LORESERVE)
+ return 0;
+ return getSection(symb->st_shndx);
}
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
- ::getSymbolAddress(DataRefImpl Symb,
- uint64_t &Result) const {
+ ::getSymbolOffset(DataRefImpl Symb,
+ uint64_t &Result) const {
validateSymbol(Symb);
const Elf_Sym *symb = getSymbol(Symb);
const Elf_Shdr *Section;
- switch (symb->st_shndx) {
+ switch (getSymbolTableIndex(symb)) {
case ELF::SHN_COMMON:
// Undefined symbols have no address yet.
case ELF::SHN_UNDEF:
@@ -330,7 +467,7 @@ error_code ELFObjectFile<target_endianness, is64Bits>
case ELF::SHN_ABS:
Result = symb->st_value;
return object_error::success;
- default: Section = getSection(symb->st_shndx);
+ default: Section = getSection(symb);
}
switch (symb->getType()) {
@@ -350,6 +487,43 @@ error_code ELFObjectFile<target_endianness, is64Bits>
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getSymbolAddress(DataRefImpl Symb,
+ uint64_t &Result) const {
+ validateSymbol(Symb);
+ const Elf_Sym *symb = getSymbol(Symb);
+ const Elf_Shdr *Section;
+ switch (getSymbolTableIndex(symb)) {
+ case ELF::SHN_COMMON: // Fall through.
+ // Undefined symbols have no address yet.
+ case ELF::SHN_UNDEF:
+ Result = UnknownAddressOrSize;
+ return object_error::success;
+ case ELF::SHN_ABS:
+ Result = reinterpret_cast<uintptr_t>(base()+symb->st_value);
+ return object_error::success;
+ default: Section = getSection(symb);
+ }
+ const uint8_t* addr = base();
+ if (Section)
+ addr += Section->sh_offset;
+ switch (symb->getType()) {
+ case ELF::STT_SECTION:
+ Result = reinterpret_cast<uintptr_t>(addr);
+ return object_error::success;
+ case ELF::STT_FUNC: // Fall through.
+ case ELF::STT_OBJECT: // Fall through.
+ case ELF::STT_NOTYPE:
+ addr += symb->st_value;
+ Result = reinterpret_cast<uintptr_t>(addr);
+ return object_error::success;
+ default:
+ Result = UnknownAddressOrSize;
+ return object_error::success;
+ }
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::getSymbolSize(DataRefImpl Symb,
uint64_t &Result) const {
validateSymbol(Symb);
@@ -366,7 +540,7 @@ error_code ELFObjectFile<target_endianness, is64Bits>
char &Result) const {
validateSymbol(Symb);
const Elf_Sym *symb = getSymbol(Symb);
- const Elf_Shdr *Section = getSection(symb->st_shndx);
+ const Elf_Shdr *Section = getSection(symb);
char ret = '?';
@@ -389,7 +563,7 @@ error_code ELFObjectFile<target_endianness, is64Bits>
}
}
- switch (symb->st_shndx) {
+ switch (getSymbolTableIndex(symb)) {
case ELF::SHN_UNDEF:
if (ret == '?')
ret = 'U';
@@ -401,7 +575,7 @@ error_code ELFObjectFile<target_endianness, is64Bits>
switch (symb->getBinding()) {
case ELF::STB_GLOBAL: ret = ::toupper(ret); break;
case ELF::STB_WEAK:
- if (symb->st_shndx == ELF::SHN_UNDEF)
+ if (getSymbolTableIndex(symb) == ELF::SHN_UNDEF)
ret = 'w';
else
if (symb->getType() == ELF::STT_OBJECT)
@@ -416,7 +590,8 @@ error_code ELFObjectFile<target_endianness, is64Bits>
return ec;
Result = StringSwitch<char>(name)
.StartsWith(".debug", 'N')
- .StartsWith(".note", 'n');
+ .StartsWith(".note", 'n')
+ .Default('?');
return object_error::success;
}
@@ -426,6 +601,43 @@ error_code ELFObjectFile<target_endianness, is64Bits>
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getSymbolType(DataRefImpl Symb,
+ SymbolRef::SymbolType &Result) const {
+ validateSymbol(Symb);
+ const Elf_Sym *symb = getSymbol(Symb);
+
+ if (getSymbolTableIndex(symb) == ELF::SHN_UNDEF) {
+ Result = SymbolRef::ST_External;
+ return object_error::success;
+ }
+
+ switch (symb->getType()) {
+ case ELF::STT_FUNC:
+ Result = SymbolRef::ST_Function;
+ break;
+ case ELF::STT_OBJECT:
+ Result = SymbolRef::ST_Data;
+ break;
+ default:
+ Result = SymbolRef::ST_Other;
+ break;
+ }
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::isSymbolGlobal(DataRefImpl Symb,
+ bool &Result) const {
+ validateSymbol(Symb);
+ const Elf_Sym *symb = getSymbol(Symb);
+
+ Result = symb->getBinding() == ELF::STB_GLOBAL;
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::isSymbolInternal(DataRefImpl Symb,
bool &Result) const {
validateSymbol(Symb);
@@ -487,6 +699,15 @@ error_code ELFObjectFile<target_endianness, is64Bits>
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getSectionAlignment(DataRefImpl Sec,
+ uint64_t &Result) const {
+ const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
+ Result = sec->sh_addralign;
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::isSectionText(DataRefImpl Sec,
bool &Result) const {
const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
@@ -499,6 +720,32 @@ error_code ELFObjectFile<target_endianness, is64Bits>
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::isSectionData(DataRefImpl Sec,
+ bool &Result) const {
+ const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
+ if (sec->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)
+ && sec->sh_type == ELF::SHT_PROGBITS)
+ Result = true;
+ else
+ Result = false;
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::isSectionBSS(DataRefImpl Sec,
+ bool &Result) const {
+ const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
+ if (sec->sh_flags & (ELF::SHF_ALLOC | ELF::SHF_WRITE)
+ && sec->sh_type == ELF::SHT_NOBITS)
+ Result = true;
+ else
+ Result = false;
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::sectionContainsSymbol(DataRefImpl Sec,
DataRefImpl Symb,
bool &Result) const {
@@ -508,6 +755,330 @@ error_code ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
+relocation_iterator ELFObjectFile<target_endianness, is64Bits>
+ ::getSectionRelBegin(DataRefImpl Sec) const {
+ DataRefImpl RelData;
+ memset(&RelData, 0, sizeof(RelData));
+ const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
+ typename RelocMap_t::const_iterator ittr = SectionRelocMap.find(sec);
+ if (sec != 0 && ittr != SectionRelocMap.end()) {
+ RelData.w.a = getSection(ittr->second[0])->sh_info;
+ RelData.w.b = ittr->second[0];
+ RelData.w.c = 0;
+ }
+ return relocation_iterator(RelocationRef(RelData, this));
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+relocation_iterator ELFObjectFile<target_endianness, is64Bits>
+ ::getSectionRelEnd(DataRefImpl Sec) const {
+ DataRefImpl RelData;
+ memset(&RelData, 0, sizeof(RelData));
+ const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
+ typename RelocMap_t::const_iterator ittr = SectionRelocMap.find(sec);
+ if (sec != 0 && ittr != SectionRelocMap.end()) {
+ // Get the index of the last relocation section for this section.
+ std::size_t relocsecindex = ittr->second[ittr->second.size() - 1];
+ const Elf_Shdr *relocsec = getSection(relocsecindex);
+ RelData.w.a = relocsec->sh_info;
+ RelData.w.b = relocsecindex;
+ RelData.w.c = relocsec->sh_size / relocsec->sh_entsize;
+ }
+ return relocation_iterator(RelocationRef(RelData, this));
+}
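+
+ // A reading of the iterator encoding used above (not a documented
+ // contract): w.a holds the index of the section the relocations apply to,
+ // w.b the index of the relocation section itself, and w.c the entry index
+ // within that relocation section.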
+
+// Relocations
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getRelocationNext(DataRefImpl Rel,
+ RelocationRef &Result) const {
+ ++Rel.w.c;
+ const Elf_Shdr *relocsec = getSection(Rel.w.b);
+ if (Rel.w.c >= (relocsec->sh_size / relocsec->sh_entsize)) {
+ // We have reached the end of the relocations for this section. See if there
+ // is another relocation section.
+ typename RelocMap_t::mapped_type relocseclist =
+ SectionRelocMap.lookup(getSection(Rel.w.a));
+
+ // Do a binary search for the current reloc section index (which must be
+ // present). Then get the next one.
+ typename RelocMap_t::mapped_type::const_iterator loc =
+ std::lower_bound(relocseclist.begin(), relocseclist.end(), Rel.w.b);
+ ++loc;
+
+ // If there is no next one, don't do anything. The ++Rel.w.c above sets Rel
+ // to the end iterator.
+ if (loc != relocseclist.end()) {
+ Rel.w.b = *loc;
+ Rel.w.a = 0;
+ }
+ }
+ Result = RelocationRef(Rel, this);
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getRelocationSymbol(DataRefImpl Rel,
+ SymbolRef &Result) const {
+ uint32_t symbolIdx;
+ const Elf_Shdr *sec = getSection(Rel.w.b);
+ switch (sec->sh_type) {
+ default :
+ report_fatal_error("Invalid section type in Rel!");
+ case ELF::SHT_REL : {
+ symbolIdx = getRel(Rel)->getSymbol();
+ break;
+ }
+ case ELF::SHT_RELA : {
+ symbolIdx = getRela(Rel)->getSymbol();
+ break;
+ }
+ }
+ DataRefImpl SymbolData;
+ IndexMap_t::const_iterator it = SymbolTableSectionsIndexMap.find(sec->sh_link);
+ if (it == SymbolTableSectionsIndexMap.end())
+ report_fatal_error("Relocation symbol table not found!");
+ SymbolData.d.a = symbolIdx;
+ SymbolData.d.b = it->second;
+ Result = SymbolRef(SymbolData, this);
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Result) const {
+ uint64_t offset;
+ const Elf_Shdr *sec = getSection(Rel.w.b);
+ switch (sec->sh_type) {
+ default :
+ report_fatal_error("Invalid section type in Rel!");
+ case ELF::SHT_REL : {
+ offset = getRel(Rel)->r_offset;
+ break;
+ }
+ case ELF::SHT_RELA : {
+ offset = getRela(Rel)->r_offset;
+ break;
+ }
+ }
+
+ Result = offset;
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getRelocationType(DataRefImpl Rel,
+ uint32_t &Result) const {
+ const Elf_Shdr *sec = getSection(Rel.w.b);
+ switch (sec->sh_type) {
+ default :
+ report_fatal_error("Invalid section type in Rel!");
+ case ELF::SHT_REL : {
+ Result = getRel(Rel)->getType();
+ break;
+ }
+ case ELF::SHT_RELA : {
+ Result = getRela(Rel)->getType();
+ break;
+ }
+ }
+ return object_error::success;
+}
+
+#define LLVM_ELF_SWITCH_RELOC_TYPE_NAME(enum) \
+ case ELF::enum: res = #enum; break;
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
+ const Elf_Shdr *sec = getSection(Rel.w.b);
+ uint8_t type;
+ StringRef res;
+ switch (sec->sh_type) {
+ default :
+ return object_error::parse_failed;
+ case ELF::SHT_REL : {
+ type = getRel(Rel)->getType();
+ break;
+ }
+ case ELF::SHT_RELA : {
+ type = getRela(Rel)->getType();
+ break;
+ }
+ }
+ switch (Header->e_machine) {
+ case ELF::EM_X86_64:
+ switch (type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32S);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPMOD64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSGD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSLD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTTPOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTOFF64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE64);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32_TLSDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC);
+ default:
+ res = "Unknown";
+ }
+ break;
+ case ELF::EM_386:
+ switch (type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTPC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32PLT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTIE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_PUSH);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_POP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_PUSH);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_POP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDO_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE_32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPMOD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_IRELATIVE);
+ default:
+ res = "Unknown";
+ }
+ break;
+ default:
+ res = "Unknown";
+ }
+ Result.append(res.begin(), res.end());
+ return object_error::success;
+}
+
+#undef LLVM_ELF_SWITCH_RELOC_TYPE_NAME
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getRelocationAdditionalInfo(DataRefImpl Rel,
+ int64_t &Result) const {
+ const Elf_Shdr *sec = getSection(Rel.w.b);
+ switch (sec->sh_type) {
+ default :
+ report_fatal_error("Invalid section type in Rel!");
+ case ELF::SHT_REL : {
+ Result = 0;
+ return object_error::success;
+ }
+ case ELF::SHT_RELA : {
+ Result = getRela(Rel)->r_addend;
+ return object_error::success;
+ }
+ }
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
+ const Elf_Shdr *sec = getSection(Rel.w.b);
+ uint8_t type;
+ StringRef res;
+ int64_t addend = 0;
+ uint32_t symbol_index = 0;
+ switch (sec->sh_type) {
+ default :
+ return object_error::parse_failed;
+ case ELF::SHT_REL : {
+ type = getRel(Rel)->getType();
+ symbol_index = getRel(Rel)->getSymbol();
+ // TODO: Read implicit addend from section data.
+ break;
+ }
+ case ELF::SHT_RELA : {
+ type = getRela(Rel)->getType();
+ symbol_index = getRela(Rel)->getSymbol();
+ addend = getRela(Rel)->r_addend;
+ break;
+ }
+ }
+ const Elf_Sym *symb = getEntry<Elf_Sym>(sec->sh_link, symbol_index);
+ StringRef symname;
+ if (error_code ec = getSymbolName(symb, symname))
+ return ec;
+ switch (Header->e_machine) {
+ case ELF::EM_X86_64:
+ switch (type) {
+ case ELF::R_X86_64_32S:
+ res = symname;
+ break;
+ case ELF::R_X86_64_PC32: {
+ std::string fmtbuf;
+ raw_string_ostream fmt(fmtbuf);
+ fmt << symname << (addend < 0 ? "" : "+") << addend << "-P";
+ fmt.flush();
+ Result.append(fmtbuf.begin(), fmtbuf.end());
+ }
+ break;
+ default:
+ res = "Unknown";
+ }
+ break;
+ default:
+ res = "Unknown";
+ }
+ if (Result.empty())
+ Result.append(res.begin(), res.end());
+ return object_error::success;
+}
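+
+ // Example output, assuming a hypothetical symbol "printf": an
+ // R_X86_64_PC32 relocation with addend -4 formats as "printf-4-P", i.e.
+ // symbol plus signed addend minus the place P of the relocation.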
+
+template<support::endianness target_endianness, bool is64Bits>
ELFObjectFile<target_endianness, is64Bits>::ELFObjectFile(MemoryBuffer *Object
, error_code &ec)
: ObjectFile(Binary::isELF, Object, ec)
@@ -521,25 +1092,41 @@ ELFObjectFile<target_endianness, is64Bits>::ELFObjectFile(MemoryBuffer *Object
SectionHeaderTable =
reinterpret_cast<const Elf_Shdr *>(base() + Header->e_shoff);
- uint32_t SectionTableSize = Header->e_shnum * Header->e_shentsize;
+ uint64_t SectionTableSize = getNumSections() * Header->e_shentsize;
if (!( (const uint8_t *)SectionHeaderTable + SectionTableSize
<= base() + Data->getBufferSize()))
// FIXME: Proper error handling.
report_fatal_error("Section table goes past end of file!");
- // To find the symbol tables we walk the section table to find SHT_STMTAB.
- for (const char *i = reinterpret_cast<const char *>(SectionHeaderTable),
- *e = i + Header->e_shnum * Header->e_shentsize;
- i != e; i += Header->e_shentsize) {
- const Elf_Shdr *sh = reinterpret_cast<const Elf_Shdr*>(i);
+ // To find the symbol tables we walk the section table to find SHT_SYMTAB.
+ const Elf_Shdr* SymbolTableSectionHeaderIndex = 0;
+ const Elf_Shdr* sh = reinterpret_cast<const Elf_Shdr*>(SectionHeaderTable);
+ for (uint64_t i = 0, e = getNumSections(); i != e; ++i) {
+ if (sh->sh_type == ELF::SHT_SYMTAB_SHNDX) {
+ if (SymbolTableSectionHeaderIndex)
+ // FIXME: Proper error handling.
+ report_fatal_error("More than one .symtab_shndx!");
+ SymbolTableSectionHeaderIndex = sh;
+ }
if (sh->sh_type == ELF::SHT_SYMTAB) {
+ SymbolTableSectionsIndexMap[i] = SymbolTableSections.size();
SymbolTableSections.push_back(sh);
}
+ if (sh->sh_type == ELF::SHT_REL || sh->sh_type == ELF::SHT_RELA) {
+ SectionRelocMap[getSection(sh->sh_info)].push_back(i);
+ }
+ ++sh;
+ }
+
+ // Sort section relocation lists by index.
+ for (typename RelocMap_t::iterator i = SectionRelocMap.begin(),
+ e = SectionRelocMap.end(); i != e; ++i) {
+ std::sort(i->second.begin(), i->second.end());
}
// Get string table sections.
- dot_shstrtab_sec = getSection(Header->e_shstrndx);
+ dot_shstrtab_sec = getSection(getStringTableIndex());
if (dot_shstrtab_sec) {
// Verify that the last byte in the string table is a null.
if (((const char*)base() + dot_shstrtab_sec->sh_offset)
@@ -550,7 +1137,7 @@ ELFObjectFile<target_endianness, is64Bits>::ELFObjectFile(MemoryBuffer *Object
// Merge this into the above loop.
for (const char *i = reinterpret_cast<const char *>(SectionHeaderTable),
- *e = i + Header->e_shnum * Header->e_shentsize;
+ *e = i + getNumSections() * Header->e_shentsize;
i != e; i += Header->e_shentsize) {
const Elf_Shdr *sh = reinterpret_cast<const Elf_Shdr*>(i);
if (sh->sh_type == ELF::SHT_STRTAB) {
@@ -567,11 +1154,26 @@ ELFObjectFile<target_endianness, is64Bits>::ELFObjectFile(MemoryBuffer *Object
}
}
}
+
+ // Build the symbol-to-extended-section-index side table, if there is one.
+ if (SymbolTableSectionHeaderIndex) {
+ const Elf_Word *ShndxTable = reinterpret_cast<const Elf_Word*>(base() +
+ SymbolTableSectionHeaderIndex->sh_offset);
+ error_code ec;
+ for (symbol_iterator si = begin_symbols(),
+ se = end_symbols(); si != se; si.increment(ec)) {
+ if (ec)
+ report_fatal_error("Fewer extended symbol table entries than symbols!");
+ if (*ShndxTable != ELF::SHN_UNDEF)
+ ExtendedSymbolTable[getSymbol(si->getRawDataRefImpl())] = *ShndxTable;
+ ++ShndxTable;
+ }
+ }
}
template<support::endianness target_endianness, bool is64Bits>
-ObjectFile::symbol_iterator ELFObjectFile<target_endianness, is64Bits>
- ::begin_symbols() const {
+symbol_iterator ELFObjectFile<target_endianness, is64Bits>
+ ::begin_symbols() const {
DataRefImpl SymbolData;
memset(&SymbolData, 0, sizeof(SymbolData));
if (SymbolTableSections.size() == 0) {
@@ -585,8 +1187,8 @@ ObjectFile::symbol_iterator ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
-ObjectFile::symbol_iterator ELFObjectFile<target_endianness, is64Bits>
- ::end_symbols() const {
+symbol_iterator ELFObjectFile<target_endianness, is64Bits>
+ ::end_symbols() const {
DataRefImpl SymbolData;
memset(&SymbolData, 0, sizeof(SymbolData));
SymbolData.d.a = std::numeric_limits<uint32_t>::max();
@@ -595,8 +1197,8 @@ ObjectFile::symbol_iterator ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
-ObjectFile::section_iterator ELFObjectFile<target_endianness, is64Bits>
- ::begin_sections() const {
+section_iterator ELFObjectFile<target_endianness, is64Bits>
+ ::begin_sections() const {
DataRefImpl ret;
memset(&ret, 0, sizeof(DataRefImpl));
ret.p = reinterpret_cast<intptr_t>(base() + Header->e_shoff);
@@ -604,13 +1206,13 @@ ObjectFile::section_iterator ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
-ObjectFile::section_iterator ELFObjectFile<target_endianness, is64Bits>
- ::end_sections() const {
+section_iterator ELFObjectFile<target_endianness, is64Bits>
+ ::end_sections() const {
DataRefImpl ret;
memset(&ret, 0, sizeof(DataRefImpl));
ret.p = reinterpret_cast<intptr_t>(base()
+ Header->e_shoff
- + (Header->e_shentsize * Header->e_shnum));
+ + (Header->e_shentsize*getNumSections()));
return section_iterator(SectionRef(ret, this));
}
@@ -629,6 +1231,8 @@ StringRef ELFObjectFile<target_endianness, is64Bits>
return "ELF32-i386";
case ELF::EM_X86_64:
return "ELF32-x86-64";
+ case ELF::EM_ARM:
+ return "ELF32-arm";
default:
return "ELF32-unknown";
}
@@ -654,26 +1258,75 @@ unsigned ELFObjectFile<target_endianness, is64Bits>::getArch() const {
return Triple::x86;
case ELF::EM_X86_64:
return Triple::x86_64;
+ case ELF::EM_ARM:
+ return Triple::arm;
default:
return Triple::UnknownArch;
}
}
template<support::endianness target_endianness, bool is64Bits>
+uint64_t ELFObjectFile<target_endianness, is64Bits>::getNumSections() const {
+ if (Header->e_shnum == ELF::SHN_UNDEF)
+ return SectionHeaderTable->sh_size;
+ return Header->e_shnum;
+}
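+
+ // This is the ELF escape for files with SHN_LORESERVE or more sections:
+ // when e_shnum is 0, the real section count is held in sh_size of section
+ // header 0.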
+
+template<support::endianness target_endianness, bool is64Bits>
+uint64_t
+ELFObjectFile<target_endianness, is64Bits>::getStringTableIndex() const {
+ if (Header->e_shnum == ELF::SHN_UNDEF) {
+ if (Header->e_shstrndx == ELF::SHN_HIRESERVE)
+ return SectionHeaderTable->sh_link;
+ if (Header->e_shstrndx >= getNumSections())
+ return 0;
+ }
+ return Header->e_shstrndx;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+template<typename T>
+inline const T *
+ELFObjectFile<target_endianness, is64Bits>::getEntry(uint16_t Section,
+ uint32_t Entry) const {
+ return getEntry<T>(getSection(Section), Entry);
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+template<typename T>
+inline const T *
+ELFObjectFile<target_endianness, is64Bits>::getEntry(const Elf_Shdr * Section,
+ uint32_t Entry) const {
+ return reinterpret_cast<const T *>(
+ base()
+ + Section->sh_offset
+ + (Entry * Section->sh_entsize));
+}
+
+template<support::endianness target_endianness, bool is64Bits>
const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Sym *
ELFObjectFile<target_endianness, is64Bits>::getSymbol(DataRefImpl Symb) const {
- const Elf_Shdr *sec = SymbolTableSections[Symb.d.b];
- return reinterpret_cast<const Elf_Sym *>(
- base()
- + sec->sh_offset
- + (Symb.d.a * sec->sh_entsize));
+ return getEntry<Elf_Sym>(SymbolTableSections[Symb.d.b], Symb.d.a);
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Rel *
+ELFObjectFile<target_endianness, is64Bits>::getRel(DataRefImpl Rel) const {
+ return getEntry<Elf_Rel>(Rel.w.b, Rel.w.c);
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Rela *
+ELFObjectFile<target_endianness, is64Bits>::getRela(DataRefImpl Rela) const {
+ return getEntry<Elf_Rela>(Rela.w.b, Rela.w.c);
}
template<support::endianness target_endianness, bool is64Bits>
const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Shdr *
ELFObjectFile<target_endianness, is64Bits>::getSection(DataRefImpl Symb) const {
const Elf_Shdr *sec = getSection(Symb.d.b);
- if (sec->sh_type != ELF::SHT_SYMTAB)
+ if (sec->sh_type != ELF::SHT_SYMTAB && sec->sh_type != ELF::SHT_DYNSYM)
// FIXME: Proper error handling.
report_fatal_error("Invalid symbol table section!");
return sec;
@@ -681,10 +1334,10 @@ ELFObjectFile<target_endianness, is64Bits>::getSection(DataRefImpl Symb) const {
template<support::endianness target_endianness, bool is64Bits>
const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Shdr *
-ELFObjectFile<target_endianness, is64Bits>::getSection(uint16_t index) const {
- if (index == 0 || index >= ELF::SHN_LORESERVE)
+ELFObjectFile<target_endianness, is64Bits>::getSection(uint32_t index) const {
+ if (index == 0)
return 0;
- if (!SectionHeaderTable || index >= Header->e_shnum)
+ if (!SectionHeaderTable || index >= getNumSections())
// FIXME: Proper error handling.
report_fatal_error("Invalid section index!");
@@ -695,7 +1348,7 @@ ELFObjectFile<target_endianness, is64Bits>::getSection(uint16_t index) const {
template<support::endianness target_endianness, bool is64Bits>
const char *ELFObjectFile<target_endianness, is64Bits>
- ::getString(uint16_t section,
+ ::getString(uint32_t section,
ELF::Elf32_Word offset) const {
return getString(getSection(section), offset);
}
@@ -711,6 +1364,24 @@ const char *ELFObjectFile<target_endianness, is64Bits>
return (const char *)base() + section->sh_offset + offset;
}
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getSymbolName(const Elf_Sym *symb,
+ StringRef &Result) const {
+ if (symb->st_name == 0) {
+ const Elf_Shdr *section = getSection(symb);
+ if (!section)
+ Result = "";
+ else
+ Result = getString(dot_shstrtab_sec, section->sh_name);
+ return object_error::success;
+ }
+
+  // Otherwise look the name up in the symbol table's string table (.strtab).
+ Result = getString(dot_strtab_sec, symb->st_name);
+ return object_error::success;
+}
+
// EI_CLASS, EI_DATA.
static std::pair<unsigned char, unsigned char>
getElfArchType(MemoryBuffer *Object) {
diff --git a/lib/Object/MachOObject.cpp b/lib/Object/MachOObject.cpp
index 9890feb..9cdac86 100644
--- a/lib/Object/MachOObject.cpp
+++ b/lib/Object/MachOObject.cpp
@@ -9,6 +9,7 @@
#include "llvm/Object/MachOObject.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/SwapByteOrder.h"
@@ -244,6 +245,18 @@ void MachOObject::ReadDysymtabLoadCommand(const LoadCommandInfo &LCI,
}
template<>
+void SwapStruct(macho::LinkeditDataLoadCommand &Value) {
+ SwapValue(Value.Type);
+ SwapValue(Value.Size);
+ SwapValue(Value.DataOffset);
+ SwapValue(Value.DataSize);
+}
+void MachOObject::ReadLinkeditDataLoadCommand(const LoadCommandInfo &LCI,
+ InMemoryStruct<macho::LinkeditDataLoadCommand> &Res) const {
+ ReadInMemoryStruct(*this, Buffer->getBuffer(), LCI.Offset, Res);
+}
+
+template<>
void SwapStruct(macho::IndirectSymbolTableEntry &Value) {
SwapValue(Value.Index);
}
@@ -343,6 +356,31 @@ void MachOObject::ReadSymbol64TableEntry(uint64_t SymbolTableOffset,
ReadInMemoryStruct(*this, Buffer->getBuffer(), Offset, Res);
}
+
+void MachOObject::ReadULEB128s(uint64_t Index,
+ SmallVectorImpl<uint64_t> &Out) const {
+ const char *ptr = Buffer->getBufferStart() + Index;
+ uint64_t data = 0;
+ uint64_t delta = 0;
+ uint32_t shift = 0;
+ while (true) {
+ assert(ptr < Buffer->getBufferEnd() && "index out of bounds");
+ assert(shift < 64 && "too big for uint64_t");
+
+ uint8_t byte = *ptr++;
+    delta |= (uint64_t(byte & 0x7F) << shift);
+ shift += 7;
+ if (byte < 0x80) {
+ if (delta == 0)
+ break;
+ data += delta;
+ Out.push_back(data);
+ delta = 0;
+ shift = 0;
+ }
+ }
+}
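The zero-terminated, delta-accumulating loop matches the encoding used by Mach-O
ULEB128 address lists such as the LC_FUNCTION_STARTS payload (an assumption from the
shape of the data, not stated in the patch). A worked example: the byte stream
{ 0x90, 0x01, 0x10, 0x00 } decodes as ULEB128 deltas 0x90 and 0x10; the running sum
appends 0x90 and 0xA0 to Out, and the final zero delta terminates the list.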
+
/* ** */
// Object Dumping Facilities
void MachOObject::dump() const { print(dbgs()); dbgs() << '\n'; }
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index 26a6e13..507df58 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -13,11 +13,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Triple.h"
+#include "llvm/Object/MachO.h"
#include "llvm/Object/MachOFormat.h"
-#include "llvm/Object/MachOObject.h"
-#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/MachO.h"
#include <cctype>
#include <cstring>
@@ -27,56 +25,24 @@ using namespace llvm;
using namespace object;
namespace llvm {
+namespace object {
-typedef MachOObject::LoadCommandInfo LoadCommandInfo;
-
-class MachOObjectFile : public ObjectFile {
-public:
- MachOObjectFile(MemoryBuffer *Object, MachOObject *MOO, error_code &ec)
+MachOObjectFile::MachOObjectFile(MemoryBuffer *Object, MachOObject *MOO,
+ error_code &ec)
: ObjectFile(Binary::isMachO, Object, ec),
MachOObj(MOO),
- RegisteredStringTable(std::numeric_limits<uint32_t>::max()) {}
-
- virtual symbol_iterator begin_symbols() const;
- virtual symbol_iterator end_symbols() const;
- virtual section_iterator begin_sections() const;
- virtual section_iterator end_sections() const;
-
- virtual uint8_t getBytesInAddress() const;
- virtual StringRef getFileFormatName() const;
- virtual unsigned getArch() const;
-
-protected:
- virtual error_code getSymbolNext(DataRefImpl Symb, SymbolRef &Res) const;
- virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const;
- virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const;
- virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const;
- virtual error_code getSymbolNMTypeChar(DataRefImpl Symb, char &Res) const;
- virtual error_code isSymbolInternal(DataRefImpl Symb, bool &Res) const;
-
- virtual error_code getSectionNext(DataRefImpl Sec, SectionRef &Res) const;
- virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const;
- virtual error_code getSectionAddress(DataRefImpl Sec, uint64_t &Res) const;
- virtual error_code getSectionSize(DataRefImpl Sec, uint64_t &Res) const;
- virtual error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const;
- virtual error_code isSectionText(DataRefImpl Sec, bool &Res) const;
- virtual error_code sectionContainsSymbol(DataRefImpl DRI, DataRefImpl S,
- bool &Result) const;
-
-private:
- MachOObject *MachOObj;
- mutable uint32_t RegisteredStringTable;
-
- void moveToNextSection(DataRefImpl &DRI) const;
- void getSymbolTableEntry(DataRefImpl DRI,
- InMemoryStruct<macho::SymbolTableEntry> &Res) const;
- void getSymbol64TableEntry(DataRefImpl DRI,
- InMemoryStruct<macho::Symbol64TableEntry> &Res) const;
- void moveToNextSymbol(DataRefImpl &DRI) const;
- void getSection(DataRefImpl DRI, InMemoryStruct<macho::Section> &Res) const;
- void getSection64(DataRefImpl DRI,
- InMemoryStruct<macho::Section64> &Res) const;
-};
+ RegisteredStringTable(std::numeric_limits<uint32_t>::max()) {
+ DataRefImpl DRI;
+ DRI.d.a = DRI.d.b = 0;
+ moveToNextSection(DRI);
+ uint32_t LoadCommandCount = MachOObj->getHeader().NumLoadCommands;
+ while (DRI.d.a < LoadCommandCount) {
+ Sections.push_back(DRI);
+ DRI.d.b++;
+ moveToNextSection(DRI);
+ }
+}
+
ObjectFile *ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer) {
error_code ec;
@@ -158,6 +124,27 @@ error_code MachOObjectFile::getSymbolName(DataRefImpl DRI,
return object_error::success;
}
+error_code MachOObjectFile::getSymbolOffset(DataRefImpl DRI,
+ uint64_t &Result) const {
+ uint64_t SectionOffset;
+ uint8_t SectionIndex;
+ if (MachOObj->is64Bit()) {
+ InMemoryStruct<macho::Symbol64TableEntry> Entry;
+ getSymbol64TableEntry(DRI, Entry);
+ Result = Entry->Value;
+ SectionIndex = Entry->SectionIndex;
+ } else {
+ InMemoryStruct<macho::SymbolTableEntry> Entry;
+ getSymbolTableEntry(DRI, Entry);
+ Result = Entry->Value;
+ SectionIndex = Entry->SectionIndex;
+ }
+ getSectionAddress(Sections[SectionIndex-1], SectionOffset);
+ Result -= SectionOffset;
+
+ return object_error::success;
+}
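A worked example (illustrative values only, assuming a section-defined symbol): for
a symbol value of 0x1F80 with SectionIndex == 2, and a second section whose address
is 0x1000, the offset comes out as 0x1F80 - 0x1000 == 0xF80. SectionIndex is 1-based
in Mach-O, hence the Sections[SectionIndex-1] lookup; an absolute or undefined
symbol (SectionIndex == 0, NO_SECT) would index out of range, so callers are assumed
to pass section-defined symbols.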
+
error_code MachOObjectFile::getSymbolAddress(DataRefImpl DRI,
uint64_t &Result) const {
if (MachOObj->is64Bit()) {
@@ -227,7 +214,51 @@ error_code MachOObjectFile::isSymbolInternal(DataRefImpl DRI,
return object_error::success;
}
-ObjectFile::symbol_iterator MachOObjectFile::begin_symbols() const {
+error_code MachOObjectFile::isSymbolGlobal(DataRefImpl Symb, bool &Res) const {
+
+ if (MachOObj->is64Bit()) {
+ InMemoryStruct<macho::Symbol64TableEntry> Entry;
+ getSymbol64TableEntry(Symb, Entry);
+ Res = Entry->Type & MachO::NlistMaskExternal;
+ } else {
+ InMemoryStruct<macho::SymbolTableEntry> Entry;
+ getSymbolTableEntry(Symb, Entry);
+ Res = Entry->Type & MachO::NlistMaskExternal;
+ }
+ return object_error::success;
+}
+
+error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
+ SymbolRef::SymbolType &Res) const {
+ uint8_t n_type;
+ if (MachOObj->is64Bit()) {
+ InMemoryStruct<macho::Symbol64TableEntry> Entry;
+ getSymbol64TableEntry(Symb, Entry);
+ n_type = Entry->Type;
+ } else {
+ InMemoryStruct<macho::SymbolTableEntry> Entry;
+ getSymbolTableEntry(Symb, Entry);
+ n_type = Entry->Type;
+ }
+ Res = SymbolRef::ST_Other;
+
+ // If this is a STAB debugging symbol, we can do nothing more.
+ if (n_type & MachO::NlistMaskStab)
+ return object_error::success;
+
+ switch (n_type & MachO::NlistMaskType) {
+ case MachO::NListTypeUndefined :
+ Res = SymbolRef::ST_External;
+ break;
+ case MachO::NListTypeSection :
+ Res = SymbolRef::ST_Function;
+ break;
+ }
+ return object_error::success;
+}
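A sketch (not part of the patch) of the n_type bit layout this switch decodes,
using the standard Mach-O nlist masks:

    #include <cstdint>

    // Assumed values, matching <mach-o/nlist.h>: N_STAB 0xe0, N_TYPE 0x0e,
    // N_EXT 0x01; within N_TYPE, N_UNDF is 0x0 and N_SECT is 0xe.
    bool isStab(uint8_t n_type)      { return (n_type & 0xe0) != 0; }
    bool isExternal(uint8_t n_type)  { return (n_type & 0x01) != 0; }
    uint8_t typeBits(uint8_t n_type) { return n_type & 0x0e; }
    // e.g. n_type == 0x0f: external symbol defined in a section;
    //      n_type == 0x01: external undefined reference.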
+
+symbol_iterator MachOObjectFile::begin_symbols() const {
// DRI.d.a = segment number; DRI.d.b = symbol index.
DataRefImpl DRI;
DRI.d.a = DRI.d.b = 0;
@@ -235,7 +266,7 @@ ObjectFile::symbol_iterator MachOObjectFile::begin_symbols() const {
return symbol_iterator(SymbolRef(DRI, this));
}
-ObjectFile::symbol_iterator MachOObjectFile::end_symbols() const {
+symbol_iterator MachOObjectFile::end_symbols() const {
DataRefImpl DRI;
DRI.d.a = MachOObj->getHeader().NumLoadCommands;
DRI.d.b = 0;
@@ -283,6 +314,13 @@ MachOObjectFile::getSection(DataRefImpl DRI,
MachOObj->ReadSection(LCI, DRI.d.b, Res);
}
+std::size_t MachOObjectFile::getSectionIndex(DataRefImpl Sec) const {
+ SectionList::const_iterator loc =
+ std::find(Sections.begin(), Sections.end(), Sec);
+ assert(loc != Sections.end() && "Sec is not a valid section!");
+ return std::distance(Sections.begin(), loc);
+}
+
void
MachOObjectFile::getSection64(DataRefImpl DRI,
InMemoryStruct<macho::Section64> &Res) const {
@@ -371,6 +409,20 @@ error_code MachOObjectFile::getSectionContents(DataRefImpl DRI,
return object_error::success;
}
+error_code MachOObjectFile::getSectionAlignment(DataRefImpl DRI,
+ uint64_t &Result) const {
+ if (is64BitLoadCommand(MachOObj, DRI)) {
+ InMemoryStruct<macho::Section64> Sect;
+ getSection64(DRI, Sect);
+ Result = uint64_t(1) << Sect->Align;
+ } else {
+ InMemoryStruct<macho::Section> Sect;
+ getSection(DRI, Sect);
+ Result = uint64_t(1) << Sect->Align;
+ }
+ return object_error::success;
+}
+
error_code MachOObjectFile::isSectionText(DataRefImpl DRI,
bool &Result) const {
if (is64BitLoadCommand(MachOObj, DRI)) {
@@ -385,35 +437,185 @@ error_code MachOObjectFile::isSectionText(DataRefImpl DRI,
return object_error::success;
}
+error_code MachOObjectFile::isSectionData(DataRefImpl DRI,
+ bool &Result) const {
+ // FIXME: Unimplemented.
+ Result = false;
+ return object_error::success;
+}
+
+error_code MachOObjectFile::isSectionBSS(DataRefImpl DRI,
+ bool &Result) const {
+ // FIXME: Unimplemented.
+ Result = false;
+ return object_error::success;
+}
+
error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec,
DataRefImpl Symb,
bool &Result) const {
+ SymbolRef::SymbolType ST;
+ getSymbolType(Symb, ST);
+ if (ST == SymbolRef::ST_External) {
+ Result = false;
+ return object_error::success;
+ }
+
+ uint64_t SectBegin, SectEnd;
+ getSectionAddress(Sec, SectBegin);
+ getSectionSize(Sec, SectEnd);
+ SectEnd += SectBegin;
+
if (MachOObj->is64Bit()) {
InMemoryStruct<macho::Symbol64TableEntry> Entry;
getSymbol64TableEntry(Symb, Entry);
- Result = Entry->SectionIndex == 1 + Sec.d.a + Sec.d.b;
+    uint64_t SymAddr = Entry->Value;
+ Result = (SymAddr >= SectBegin) && (SymAddr < SectEnd);
} else {
InMemoryStruct<macho::SymbolTableEntry> Entry;
getSymbolTableEntry(Symb, Entry);
- Result = Entry->SectionIndex == 1 + Sec.d.a + Sec.d.b;
+    uint64_t SymAddr = Entry->Value;
+ Result = (SymAddr >= SectBegin) && (SymAddr < SectEnd);
}
+
return object_error::success;
}
-ObjectFile::section_iterator MachOObjectFile::begin_sections() const {
+relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+ DataRefImpl ret;
+ ret.d.a = 0;
+ ret.d.b = getSectionIndex(Sec);
+ return relocation_iterator(RelocationRef(ret, this));
+}
+relocation_iterator MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+ uint32_t last_reloc;
+ if (is64BitLoadCommand(MachOObj, Sec)) {
+ InMemoryStruct<macho::Section64> Sect;
+ getSection64(Sec, Sect);
+ last_reloc = Sect->NumRelocationTableEntries;
+ } else {
+ InMemoryStruct<macho::Section> Sect;
+ getSection(Sec, Sect);
+ last_reloc = Sect->NumRelocationTableEntries;
+ }
+ DataRefImpl ret;
+ ret.d.a = last_reloc;
+ ret.d.b = getSectionIndex(Sec);
+ return relocation_iterator(RelocationRef(ret, this));
+}
+
+section_iterator MachOObjectFile::begin_sections() const {
DataRefImpl DRI;
DRI.d.a = DRI.d.b = 0;
moveToNextSection(DRI);
return section_iterator(SectionRef(DRI, this));
}
-ObjectFile::section_iterator MachOObjectFile::end_sections() const {
+section_iterator MachOObjectFile::end_sections() const {
DataRefImpl DRI;
DRI.d.a = MachOObj->getHeader().NumLoadCommands;
DRI.d.b = 0;
return section_iterator(SectionRef(DRI, this));
}
+/*===-- Relocations -------------------------------------------------------===*/
+
+void MachOObjectFile::
+getRelocation(DataRefImpl Rel,
+ InMemoryStruct<macho::RelocationEntry> &Res) const {
+ uint32_t relOffset;
+ if (MachOObj->is64Bit()) {
+ InMemoryStruct<macho::Section64> Sect;
+ getSection64(Sections[Rel.d.b], Sect);
+ relOffset = Sect->RelocationTableOffset;
+ } else {
+ InMemoryStruct<macho::Section> Sect;
+ getSection(Sections[Rel.d.b], Sect);
+ relOffset = Sect->RelocationTableOffset;
+ }
+ MachOObj->ReadRelocationEntry(relOffset, Rel.d.a, Res);
+}
+error_code MachOObjectFile::getRelocationNext(DataRefImpl Rel,
+ RelocationRef &Res) const {
+ ++Rel.d.a;
+ Res = RelocationRef(Rel, this);
+ return object_error::success;
+}
+error_code MachOObjectFile::getRelocationAddress(DataRefImpl Rel,
+ uint64_t &Res) const {
+ const uint8_t* sectAddress = base();
+ if (MachOObj->is64Bit()) {
+ InMemoryStruct<macho::Section64> Sect;
+ getSection64(Sections[Rel.d.b], Sect);
+ sectAddress += Sect->Offset;
+ } else {
+ InMemoryStruct<macho::Section> Sect;
+ getSection(Sections[Rel.d.b], Sect);
+ sectAddress += Sect->Offset;
+ }
+ InMemoryStruct<macho::RelocationEntry> RE;
+ getRelocation(Rel, RE);
+ Res = reinterpret_cast<uintptr_t>(sectAddress + RE->Word0);
+ return object_error::success;
+}
+error_code MachOObjectFile::getRelocationSymbol(DataRefImpl Rel,
+ SymbolRef &Res) const {
+ InMemoryStruct<macho::RelocationEntry> RE;
+ getRelocation(Rel, RE);
+ uint32_t SymbolIdx = RE->Word1 & 0xffffff;
+ bool isExtern = (RE->Word1 >> 27) & 1;
+
+ DataRefImpl Sym;
+ Sym.d.a = Sym.d.b = 0;
+ moveToNextSymbol(Sym);
+ if (isExtern) {
+ for (unsigned i = 0; i < SymbolIdx; i++) {
+ Sym.d.b++;
+ moveToNextSymbol(Sym);
+ assert(Sym.d.a < MachOObj->getHeader().NumLoadCommands &&
+ "Relocation symbol index out of range!");
+ }
+ }
+ Res = SymbolRef(Sym, this);
+ return object_error::success;
+}
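For reference (an assumption from the standard Mach-O relocation_info layout, not
spelled out in the patch): a non-scattered entry packs Word1 as bits 0-23
r_symbolnum, bit 24 r_pcrel, bits 25-26 r_length, bit 27 r_extern, and bits 28-31
r_type, which is what the (Word1 & 0xffffff) and ((Word1 >> 27) & 1) extractions
above pull apart.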
+error_code MachOObjectFile::getRelocationType(DataRefImpl Rel,
+ uint32_t &Res) const {
+ InMemoryStruct<macho::RelocationEntry> RE;
+ getRelocation(Rel, RE);
+ Res = RE->Word1;
+ return object_error::success;
+}
+error_code MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
+ return object_error::success;
+}
+error_code MachOObjectFile::getRelocationAdditionalInfo(DataRefImpl Rel,
+ int64_t &Res) const {
+ InMemoryStruct<macho::RelocationEntry> RE;
+ getRelocation(Rel, RE);
+ bool isExtern = (RE->Word1 >> 27) & 1;
+ Res = 0;
+ if (!isExtern) {
+ const uint8_t* sectAddress = base();
+ if (MachOObj->is64Bit()) {
+ InMemoryStruct<macho::Section64> Sect;
+ getSection64(Sections[Rel.d.b], Sect);
+ sectAddress += Sect->Offset;
+ } else {
+ InMemoryStruct<macho::Section> Sect;
+ getSection(Sections[Rel.d.b], Sect);
+ sectAddress += Sect->Offset;
+ }
+ Res = reinterpret_cast<uintptr_t>(sectAddress);
+ }
+ return object_error::success;
+}
+error_code MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
+ SmallVectorImpl<char> &Result) const {
+ return object_error::success;
+}
+
/*===-- Miscellaneous -----------------------------------------------------===*/
uint8_t MachOObjectFile::getBytesInAddress() const {
@@ -465,5 +667,5 @@ unsigned MachOObjectFile::getArch() const {
}
}
+} // end namespace object
} // end namespace llvm
-
diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index 9a373ad..2ea8db9 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp
@@ -27,8 +27,8 @@ void LLVMDisposeObjectFile(LLVMObjectFileRef ObjectFile) {
}
LLVMSectionIteratorRef LLVMGetSections(LLVMObjectFileRef ObjectFile) {
- ObjectFile::section_iterator SI = unwrap(ObjectFile)->begin_sections();
- return wrap(new ObjectFile::section_iterator(SI));
+ section_iterator SI = unwrap(ObjectFile)->begin_sections();
+ return wrap(new section_iterator(SI));
}
void LLVMDisposeSectionIterator(LLVMSectionIteratorRef SI) {
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index a7798df..69d8ed0 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -45,6 +45,7 @@ ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
case sys::Mach_O_DynamicLinker_FileType:
case sys::Mach_O_Bundle_FileType:
case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
+ case sys::Mach_O_DSYMCompanion_FileType:
return createMachOObjectFile(Object);
case sys::COFF_FileType:
return createCOFFObjectFile(Object);
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index c64da6e..f238894 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -832,6 +832,7 @@ APFloat::incrementSignificand()
/* Our callers should never cause us to overflow. */
assert(carry == 0);
+ (void)carry;
}
/* Add the significand of the RHS. Returns the carry flag. */
@@ -926,6 +927,7 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend)
APFloat extendedAddend(*addend);
status = extendedAddend.convert(extendedSemantics, rmTowardZero, &ignored);
assert(status == opOK);
+ (void)status;
lost_fraction = addOrSubtractSignificand(extendedAddend, false);
/* Restore our state. */
@@ -1190,7 +1192,7 @@ APFloat::normalize(roundingMode rounding_mode,
if (omsb) {
/* OMSB is numbered from 1. We want to place it in the integer
- bit numbered PRECISON if possible, with a compensating change in
+ bit numbered PRECISION if possible, with a compensating change in
the exponent. */
exponentChange = omsb - semantics->precision;
@@ -1389,6 +1391,7 @@ APFloat::addOrSubtractSignificand(const APFloat &rhs, bool subtract)
/* The code above is intended to ensure that no borrow is
necessary. */
assert(!carry);
+ (void)carry;
} else {
if (bits > 0) {
APFloat temp_rhs(rhs);
@@ -1402,6 +1405,7 @@ APFloat::addOrSubtractSignificand(const APFloat &rhs, bool subtract)
/* We have a guard bit; generating a carry cannot happen. */
assert(!carry);
+ (void)carry;
}
return lost_fraction;
@@ -2098,7 +2102,7 @@ APFloat::convertToInteger(APSInt &result,
opStatus status = convertToInteger(
parts.data(), bitWidth, result.isSigned(), rounding_mode, isExact);
// Keeps the original signed-ness.
- result = APInt(bitWidth, (unsigned)parts.size(), parts.data());
+ result = APInt(bitWidth, parts);
return status;
}
@@ -2121,7 +2125,7 @@ APFloat::convertFromUnsignedParts(const integerPart *src,
dstCount = partCount();
precision = semantics->precision;
- /* We want the most significant PRECISON bits of SRC. There may not
+ /* We want the most significant PRECISION bits of SRC. There may not
be that many; extract what we can. */
if (precision <= omsb) {
exponent = omsb - 1;
@@ -2192,7 +2196,7 @@ APFloat::convertFromZeroExtendedInteger(const integerPart *parts,
roundingMode rounding_mode)
{
unsigned int partCount = partCountForBits(width);
- APInt api = APInt(width, partCount, parts);
+ APInt api = APInt(width, makeArrayRef(parts, partCount));
sign = false;
if (isSigned && APInt::tcExtractBit(parts, width - 1)) {
@@ -2746,7 +2750,7 @@ APFloat::convertF80LongDoubleAPFloatToAPInt() const
words[0] = mysignificand;
words[1] = ((uint64_t)(sign & 1) << 15) |
(myexponent & 0x7fffLL);
- return APInt(80, 2, words);
+ return APInt(80, words);
}
APInt
@@ -2791,7 +2795,7 @@ APFloat::convertPPCDoubleDoubleAPFloatToAPInt() const
words[1] = ((uint64_t)(sign2 & 1) << 63) |
((myexponent2 & 0x7ff) << 52) |
(mysignificand2 & 0xfffffffffffffLL);
- return APInt(128, 2, words);
+ return APInt(128, words);
}
APInt
@@ -2827,7 +2831,7 @@ APFloat::convertQuadrupleAPFloatToAPInt() const
((myexponent & 0x7fff) << 48) |
(mysignificand2 & 0xffffffffffffLL);
- return APInt(128, 2, words);
+ return APInt(128, words);
}
APInt
@@ -3239,8 +3243,9 @@ APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) {
significand[i] = ~((integerPart) 0);
// ...and then clear the top bits for internal consistency.
- significand[N-1] &=
- (((integerPart) 1) << ((Sem.precision % integerPartWidth) - 1)) - 1;
+ if (Sem.precision % integerPartWidth != 0)
+ significand[N-1] &=
+ (((integerPart) 1) << (Sem.precision % integerPartWidth)) - 1;
return Val;
}
@@ -3270,7 +3275,7 @@ APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) {
Val.exponent = Sem.minExponent;
Val.zeroSignificand();
Val.significandParts()[partCountForBits(Sem.precision)-1] |=
- (((integerPart) 1) << ((Sem.precision % integerPartWidth) - 1));
+ (((integerPart) 1) << ((Sem.precision - 1) % integerPartWidth));
return Val;
}
@@ -3413,8 +3418,8 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
// Decompose the number into an APInt and an exponent.
int exp = exponent - ((int) semantics->precision - 1);
APInt significand(semantics->precision,
- partCountForBits(semantics->precision),
- significandParts());
+ makeArrayRef(significandParts(),
+ partCountForBits(semantics->precision)));
// Set FormatPrecision if zero. We want to do this before we
// truncate trailing zeros, as those are part of the precision.
@@ -3451,7 +3456,7 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
// <= semantics->precision + e * 137 / 59
// (log_2(5) ~ 2.321928 < 2.322034 ~ 137/59)
- unsigned precision = semantics->precision + 137 * texp / 59;
+ unsigned precision = semantics->precision + (137 * texp + 136) / 59;
// Multiply significand by 5^e.
// N * 5^0101 == N * 5^(1*1) * 5^(0*2) * 5^(1*4) * 5^(0*8)
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 76265d4..3774c52 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -48,18 +48,20 @@ inline static uint64_t* getMemory(unsigned numWords) {
inline static unsigned getDigit(char cdigit, uint8_t radix) {
unsigned r;
- if (radix == 16) {
+ if (radix == 16 || radix == 36) {
r = cdigit - '0';
if (r <= 9)
return r;
r = cdigit - 'A';
- if (r <= 5)
+ if (r <= radix - 11U)
return r + 10;
r = cdigit - 'a';
- if (r <= 5)
+ if (r <= radix - 11U)
return r + 10;
+
+ radix = 10;
}
r = cdigit - '0';
@@ -83,25 +85,33 @@ void APInt::initSlowCase(const APInt& that) {
memcpy(pVal, that.pVal, getNumWords() * APINT_WORD_SIZE);
}
-
-APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
- : BitWidth(numBits), VAL(0) {
+void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
assert(BitWidth && "Bitwidth too small");
- assert(bigVal && "Null pointer detected!");
+ assert(bigVal.data() && "Null pointer detected!");
if (isSingleWord())
VAL = bigVal[0];
else {
// Get memory, cleared to 0
pVal = getClearedMemory(getNumWords());
// Calculate the number of words to copy
- unsigned words = std::min<unsigned>(numWords, getNumWords());
+ unsigned words = std::min<unsigned>(bigVal.size(), getNumWords());
// Copy the words from bigVal to pVal
- memcpy(pVal, bigVal, words * APINT_WORD_SIZE);
+ memcpy(pVal, bigVal.data(), words * APINT_WORD_SIZE);
}
// Make sure unused high bits are cleared
clearUnusedBits();
}
+APInt::APInt(unsigned numBits, ArrayRef<uint64_t> bigVal)
+ : BitWidth(numBits), VAL(0) {
+ initFromArray(bigVal);
+}
+
+APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
+ : BitWidth(numBits), VAL(0) {
+ initFromArray(makeArrayRef(bigVal, numWords));
+}
+
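A minimal usage sketch (not part of the patch), assuming the ArrayRef overload
declared alongside these definitions:

    #include "llvm/ADT/APInt.h"
    #include "llvm/ADT/ArrayRef.h"
    #include <cstdint>
    using namespace llvm;

    void apintCtorExample() {                 // illustrative name
      uint64_t words[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
      APInt A(128, makeArrayRef(words, 2));   // preferred form
      APInt B(128, 2, words);                 // legacy form; forwards to initFromArray
      (void)A; (void)B;
    }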
APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix)
: BitWidth(numbits), VAL(0) {
assert(BitWidth && "Bitwidth too small");
@@ -376,6 +386,7 @@ APInt& APInt::operator*=(const APInt& RHS) {
clearAllBits();
unsigned wordsToCopy = destWords >= getNumWords() ? getNumWords() : destWords;
memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
+ clearUnusedBits();
// delete dest array and return
delete[] dest;
@@ -461,7 +472,7 @@ APInt APInt::operator*(const APInt& RHS) const {
return APInt(BitWidth, VAL * RHS.VAL);
APInt Result(*this);
Result *= RHS;
- return Result.clearUnusedBits();
+ return Result;
}
APInt APInt::operator+(const APInt& RHS) const {
@@ -613,8 +624,9 @@ void APInt::flipBit(unsigned bitPosition) {
unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
assert(!str.empty() && "Invalid string length");
- assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
- "Radix should be 2, 8, 10, or 16!");
+ assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
+ radix == 36) &&
+ "Radix should be 2, 8, 10, 16, or 36!");
size_t slen = str.size();
@@ -636,6 +648,8 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
if (radix == 16)
return slen * 4 + isNegative;
+ // FIXME: base 36
+
// This is grossly inefficient but accurate. We could probably do something
// with a computation of roughly slen*64/20 and then adjust by the value of
// the first few digits. But, I'm not sure how accurate that could be.
@@ -644,7 +658,9 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
// be too large. This avoids the assertion in the constructor. This
// calculation doesn't work appropriately for the numbers 0-9, so just use 4
// bits in that case.
- unsigned sufficient = slen == 1 ? 4 : slen * 64/18;
+ unsigned sufficient
+ = radix == 10? (slen == 1 ? 4 : slen * 64/18)
+ : (slen == 1 ? 7 : slen * 16/3);
// Convert to the actual binary value.
APInt tmp(sufficient, StringRef(p, slen), radix);
@@ -2107,8 +2123,9 @@ APInt APInt::sshl_ov(unsigned ShAmt, bool &Overflow) const {
void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
// Check our assumptions here
assert(!str.empty() && "Invalid string length");
- assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
- "Radix should be 2, 8, 10, or 16!");
+ assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
+ radix == 36) &&
+ "Radix should be 2, 8, 10, 16, or 36!");
StringRef::iterator p = str.begin();
size_t slen = str.size();
@@ -2165,7 +2182,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
bool Signed, bool formatAsCLiteral) const {
-  assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2) &&
-         "Radix should be 2, 8, 10, or 16!");
+  assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 ||
+          Radix == 36) &&
+         "Radix should be 2, 8, 10, 16, or 36!");
const char *Prefix = "";
@@ -2195,7 +2213,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
return;
}
- static const char Digits[] = "0123456789ABCDEF";
+ static const char Digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
if (isSingleWord()) {
char Buffer[65];
@@ -2249,7 +2267,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
// For the 2, 8 and 16 bit cases, we can just shift instead of divide
// because the number of bits per digit (1, 3 and 4 respectively) divides
  // equally. We just shift until the value is zero.
- if (Radix != 10) {
+ if (Radix == 2 || Radix == 8 || Radix == 16) {
// Just shift tmp right for each digit width until it becomes zero
unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
unsigned MaskAmt = Radix - 1;
@@ -2260,7 +2278,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
Tmp = Tmp.lshr(ShiftAmt);
}
} else {
- APInt divisor(4, 10);
+ APInt divisor(Radix == 10? 4 : 8, Radix);
while (Tmp != 0) {
APInt APdigit(1, 0);
APInt tmp2(Tmp.getBitWidth(), 0);
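A worked example (not part of the patch) of the new radix: with base 36,
APInt(16, StringRef("zz"), 36) parses to 35 * 36 + 35 == 1295, since getDigit maps
both 'Z' and 'z' to 35; printing back with Radix == 36 uses the extended
"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" digit table and takes the generic divide
loop, because 36 is not a power of two.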
diff --git a/lib/Support/Atomic.cpp b/lib/Support/Atomic.cpp
index c7b4bff..94760cc 100644
--- a/lib/Support/Atomic.cpp
+++ b/lib/Support/Atomic.cpp
@@ -22,7 +22,7 @@ using namespace llvm;
#endif
void sys::MemoryFence() {
-#if LLVM_MULTITHREADED==0
+#if LLVM_HAS_ATOMICS == 0
return;
#else
# if defined(__GNUC__)
@@ -38,7 +38,7 @@ void sys::MemoryFence() {
sys::cas_flag sys::CompareAndSwap(volatile sys::cas_flag* ptr,
sys::cas_flag new_value,
sys::cas_flag old_value) {
-#if LLVM_MULTITHREADED==0
+#if LLVM_HAS_ATOMICS == 0
sys::cas_flag result = *ptr;
if (result == old_value)
*ptr = new_value;
@@ -53,7 +53,7 @@ sys::cas_flag sys::CompareAndSwap(volatile sys::cas_flag* ptr,
}
sys::cas_flag sys::AtomicIncrement(volatile sys::cas_flag* ptr) {
-#if LLVM_MULTITHREADED==0
+#if LLVM_HAS_ATOMICS == 0
++(*ptr);
return *ptr;
#elif defined(__GNUC__)
@@ -66,7 +66,7 @@ sys::cas_flag sys::AtomicIncrement(volatile sys::cas_flag* ptr) {
}
sys::cas_flag sys::AtomicDecrement(volatile sys::cas_flag* ptr) {
-#if LLVM_MULTITHREADED==0
+#if LLVM_HAS_ATOMICS == 0
--(*ptr);
return *ptr;
#elif defined(__GNUC__)
@@ -79,7 +79,7 @@ sys::cas_flag sys::AtomicDecrement(volatile sys::cas_flag* ptr) {
}
sys::cas_flag sys::AtomicAdd(volatile sys::cas_flag* ptr, sys::cas_flag val) {
-#if LLVM_MULTITHREADED==0
+#if LLVM_HAS_ATOMICS == 0
*ptr += val;
return *ptr;
#elif defined(__GNUC__)
diff --git a/lib/Support/BlockFrequency.cpp b/lib/Support/BlockFrequency.cpp
new file mode 100644
index 0000000..a63bf83
--- /dev/null
+++ b/lib/Support/BlockFrequency.cpp
@@ -0,0 +1,126 @@
+//====--------------- lib/Support/BlockFrequency.cpp -----------*- C++ -*-====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the BlockFrequency class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+/// mult96bit - Multiply FREQ by N and store result in W array.
+void mult96bit(uint64_t freq, uint32_t N, uint64_t W[2]) {
+ uint64_t u0 = freq & UINT32_MAX;
+ uint64_t u1 = freq >> 32;
+
+ // Represent 96-bit value as w[2]:w[1]:w[0];
+ uint32_t w[3] = { 0, 0, 0 };
+
+ uint64_t t = u0 * N;
+ uint64_t k = t >> 32;
+ w[0] = t;
+ t = u1 * N + k;
+ w[1] = t;
+ w[2] = t >> 32;
+
+ // W[1] - higher bits.
+ // W[0] - lower bits.
+ W[0] = w[0] + ((uint64_t) w[1] << 32);
+ W[1] = w[2];
+}
+
+
+/// div96bit - Divide 96-bit value stored in W array by D. Return 64-bit frequency.
+uint64_t div96bit(uint64_t W[2], uint32_t D) {
+ uint64_t y = W[0];
+ uint64_t x = W[1];
+ int i;
+
+ for (i = 1; i <= 64 && x; ++i) {
+    uint64_t t = uint64_t((int64_t)x >> 63);
+ x = (x << 1) | (y >> 63);
+ y = y << 1;
+ if ((x | t) >= D) {
+ x -= D;
+ ++y;
+ }
+ }
+
+ return y << (64 - i + 1);
+}
+
+}
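On compilers with a 128-bit integer type, the mult96bit/div96bit pair can be
cross-checked against a direct computation. A sketch (not part of the patch),
assuming the GCC/Clang __uint128_t extension:

    #include <cstdint>

    // Reference value for freq * n / d computed without intermediate overflow,
    // for comparison against mult96bit + div96bit.
    uint64_t scaleRef(uint64_t freq, uint32_t n, uint32_t d) {
      return (uint64_t)(((__uint128_t)freq * n) / d);
    }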
+
+
+BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
+ uint32_t n = Prob.getNumerator();
+ uint32_t d = Prob.getDenominator();
+
+  assert(n <= d && "Probability must be less than or equal to 1.");
+
+  // If the multiplication could overflow, fall back to 96-bit operations.
+ if (n > 0 && Frequency > UINT64_MAX / n) {
+ // 96-bit value represented as W[1]:W[0].
+ uint64_t W[2];
+
+    // The probability is at most 1, so the result is guaranteed to fit in
+    // 64 bits.
+ mult96bit(Frequency, n, W);
+ Frequency = div96bit(W, d);
+ return *this;
+ }
+
+ Frequency *= n;
+ Frequency /= d;
+ return *this;
+}
+
+const BlockFrequency
+BlockFrequency::operator*(const BranchProbability &Prob) const {
+ BlockFrequency Freq(Frequency);
+ Freq *= Prob;
+ return Freq;
+}
+
+BlockFrequency &BlockFrequency::operator+=(const BlockFrequency &Freq) {
+ uint64_t Before = Freq.Frequency;
+ Frequency += Freq.Frequency;
+
+ // If overflow, set frequency to the maximum value.
+ if (Frequency < Before)
+ Frequency = UINT64_MAX;
+
+ return *this;
+}
+
+const BlockFrequency
+BlockFrequency::operator+(const BlockFrequency &Other) const {
+  BlockFrequency Freq(Frequency);
+  Freq += Other;
+ return Freq;
+}
+
+void BlockFrequency::print(raw_ostream &OS) const {
+ OS << Frequency;
+}
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const BlockFrequency &Freq) {
+ Freq.print(OS);
+ return OS;
+}
+
+}
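A minimal usage sketch (not part of the patch), assuming the
BlockFrequency/BranchProbability headers introduced with this code:

    #include "llvm/Support/BlockFrequency.h"
    #include "llvm/Support/BranchProbability.h"
    using namespace llvm;

    void blockFreqExample() {             // illustrative name
      BlockFrequency Entry(1024);
      BranchProbability Taken(3, 4);      // 3/4
      BlockFrequency Out = Entry * Taken; // 1024 * 3 / 4 == 768
      Out += Entry;                       // 1792; saturates at UINT64_MAX on overflow
      (void)Out;
    }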
diff --git a/lib/Support/BranchProbability.cpp b/lib/Support/BranchProbability.cpp
index 97342da..49d04ed 100644
--- a/lib/Support/BranchProbability.cpp
+++ b/lib/Support/BranchProbability.cpp
@@ -24,9 +24,8 @@ BranchProbability::BranchProbability(uint32_t n, uint32_t d) {
D = d;
}
-raw_ostream &BranchProbability::print(raw_ostream &OS) const {
+void BranchProbability::print(raw_ostream &OS) const {
OS << N << " / " << D << " = " << ((double)N / D);
- return OS;
}
void BranchProbability::dump() const {
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 867d930..63a833c 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -9,11 +9,13 @@ add_llvm_library(LLVMSupport
APInt.cpp
APSInt.cpp
Allocator.cpp
+ BlockFrequency.cpp
BranchProbability.cpp
circular_raw_ostream.cpp
CommandLine.cpp
ConstantRange.cpp
CrashRecoveryContext.cpp
+ DataExtractor.cpp
Debug.cpp
DeltaAlgorithm.cpp
DAGDeltaAlgorithm.cpp
@@ -42,7 +44,6 @@ add_llvm_library(LLVMSupport
StringPool.cpp
StringRef.cpp
SystemUtils.cpp
- TargetRegistry.cpp
Timer.cpp
ToolOutputFile.cpp
Triple.cpp
@@ -72,6 +73,7 @@ add_llvm_library(LLVMSupport
SearchForAddressOfSpecialSymbol.cpp
Signals.cpp
system_error.cpp
+ TargetRegistry.cpp
ThreadLocal.cpp
Threading.cpp
TimeValue.cpp
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 2914337..238adcc 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -23,7 +23,6 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/Path.h"
#include "llvm/ADT/OwningPtr.h"
@@ -45,6 +44,7 @@ TEMPLATE_INSTANTIATION(class basic_parser<bool>);
TEMPLATE_INSTANTIATION(class basic_parser<boolOrDefault>);
TEMPLATE_INSTANTIATION(class basic_parser<int>);
TEMPLATE_INSTANTIATION(class basic_parser<unsigned>);
+TEMPLATE_INSTANTIATION(class basic_parser<unsigned long long>);
TEMPLATE_INSTANTIATION(class basic_parser<double>);
TEMPLATE_INSTANTIATION(class basic_parser<float>);
TEMPLATE_INSTANTIATION(class basic_parser<std::string>);
@@ -63,6 +63,7 @@ void parser<bool>::anchor() {}
void parser<boolOrDefault>::anchor() {}
void parser<int>::anchor() {}
void parser<unsigned>::anchor() {}
+void parser<unsigned long long>::anchor() {}
void parser<double>::anchor() {}
void parser<float>::anchor() {}
void parser<std::string>::anchor() {}
@@ -1006,6 +1007,16 @@ bool parser<unsigned>::parse(Option &O, StringRef ArgName,
return false;
}
+// parser<unsigned long long> implementation
+//
+bool parser<unsigned long long>::parse(Option &O, StringRef ArgName,
+ StringRef Arg, unsigned long long &Value){
+
+ if (Arg.getAsInteger(0, Value))
+ return O.error("'" + Arg + "' value invalid for uint argument!");
+ return false;
+}
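With the new specialization, 64-bit numeric options work like the existing unsigned
ones. A minimal usage sketch (not part of the patch; the option name is
illustrative):

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<unsigned long long>
    StartAddr("start-addr",
              llvm::cl::desc("Start address (0x-prefixed hex, octal, or decimal)"),
              llvm::cl::init(0)); // getAsInteger(0, ...) auto-detects the base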
+
// parser<double>/parser<float> implementation
//
static bool parseDouble(Option &O, StringRef Arg, double &Value) {
@@ -1151,6 +1162,7 @@ PRINT_OPT_DIFF(bool)
PRINT_OPT_DIFF(boolOrDefault)
PRINT_OPT_DIFF(int)
PRINT_OPT_DIFF(unsigned)
+PRINT_OPT_DIFF(unsigned long long)
PRINT_OPT_DIFF(double)
PRINT_OPT_DIFF(float)
PRINT_OPT_DIFF(char)
@@ -1330,10 +1342,7 @@ void cl::PrintOptionValues() {
static void (*OverrideVersionPrinter)() = 0;
-static int TargetArraySortFn(const void *LHS, const void *RHS) {
- typedef std::pair<const char *, const Target*> pair_ty;
- return strcmp(((const pair_ty*)LHS)->first, ((const pair_ty*)RHS)->first);
-}
+static std::vector<void (*)()>* ExtraVersionPrinters = 0;
namespace {
class VersionPrinter {
@@ -1361,37 +1370,27 @@ public:
<< " Built " << __DATE__ << " (" << __TIME__ << ").\n"
#endif
<< " Host: " << sys::getHostTriple() << '\n'
- << " Host CPU: " << CPU << '\n'
- << '\n'
- << " Registered Targets:\n";
-
- std::vector<std::pair<const char *, const Target*> > Targets;
- size_t Width = 0;
- for (TargetRegistry::iterator it = TargetRegistry::begin(),
- ie = TargetRegistry::end(); it != ie; ++it) {
- Targets.push_back(std::make_pair(it->getName(), &*it));
- Width = std::max(Width, strlen(Targets.back().first));
- }
- if (!Targets.empty())
- qsort(&Targets[0], Targets.size(), sizeof(Targets[0]),
- TargetArraySortFn);
-
- for (unsigned i = 0, e = Targets.size(); i != e; ++i) {
- OS << " " << Targets[i].first;
- OS.indent(Width - strlen(Targets[i].first)) << " - "
- << Targets[i].second->getShortDescription() << '\n';
- }
- if (Targets.empty())
- OS << " (none)\n";
+ << " Host CPU: " << CPU << '\n';
}
void operator=(bool OptionWasSpecified) {
if (!OptionWasSpecified) return;
- if (OverrideVersionPrinter == 0) {
- print();
+ if (OverrideVersionPrinter != 0) {
+ (*OverrideVersionPrinter)();
exit(1);
}
- (*OverrideVersionPrinter)();
+ print();
+
+ // Iterate over any registered extra printers and call them to add further
+ // information.
+ if (ExtraVersionPrinters != 0) {
+ outs() << '\n';
+ for (std::vector<void (*)()>::iterator I = ExtraVersionPrinters->begin(),
+ E = ExtraVersionPrinters->end();
+ I != E; ++I)
+ (*I)();
+ }
+
exit(1);
}
};
@@ -1424,3 +1423,10 @@ void cl::PrintVersionMessage() {
void cl::SetVersionPrinter(void (*func)()) {
OverrideVersionPrinter = func;
}
+
+void cl::AddExtraVersionPrinter(void (*func)()) {
+ if (ExtraVersionPrinters == 0)
+ ExtraVersionPrinters = new std::vector<void (*)()>;
+
+ ExtraVersionPrinters->push_back(func);
+}
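A minimal usage sketch (not part of the patch; names are illustrative) for a tool
that wants extra lines appended to --version output:

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/raw_ostream.h"

    static void printMyToolVersion() {
      llvm::outs() << "  MyTool extensions built in\n";
    }

    int main(int argc, char **argv) {
      // Register before parsing so --version picks it up.
      llvm::cl::AddExtraVersionPrinter(printMyToolVersion);
      llvm::cl::ParseCommandLineOptions(argc, argv);
      return 0;
    }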
diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp
index 81382d0..c29cb53 100644
--- a/lib/Support/ConstantRange.cpp
+++ b/lib/Support/ConstantRange.cpp
@@ -21,11 +21,10 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Constants.h"
+#include "llvm/InstrTypes.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Instructions.h"
using namespace llvm;
/// Initialize a full (the default) or empty set for the specified type.
@@ -56,56 +55,56 @@ ConstantRange ConstantRange::makeICmpRegion(unsigned Pred,
uint32_t W = CR.getBitWidth();
switch (Pred) {
- default: assert(!"Invalid ICmp predicate to makeICmpRegion()");
- case ICmpInst::ICMP_EQ:
+ default: assert(0 && "Invalid ICmp predicate to makeICmpRegion()");
+ case CmpInst::ICMP_EQ:
return CR;
- case ICmpInst::ICMP_NE:
+ case CmpInst::ICMP_NE:
if (CR.isSingleElement())
return ConstantRange(CR.getUpper(), CR.getLower());
return ConstantRange(W);
- case ICmpInst::ICMP_ULT: {
+ case CmpInst::ICMP_ULT: {
APInt UMax(CR.getUnsignedMax());
if (UMax.isMinValue())
return ConstantRange(W, /* empty */ false);
return ConstantRange(APInt::getMinValue(W), UMax);
}
- case ICmpInst::ICMP_SLT: {
+ case CmpInst::ICMP_SLT: {
APInt SMax(CR.getSignedMax());
if (SMax.isMinSignedValue())
return ConstantRange(W, /* empty */ false);
return ConstantRange(APInt::getSignedMinValue(W), SMax);
}
- case ICmpInst::ICMP_ULE: {
+ case CmpInst::ICMP_ULE: {
APInt UMax(CR.getUnsignedMax());
if (UMax.isMaxValue())
return ConstantRange(W);
return ConstantRange(APInt::getMinValue(W), UMax + 1);
}
- case ICmpInst::ICMP_SLE: {
+ case CmpInst::ICMP_SLE: {
APInt SMax(CR.getSignedMax());
if (SMax.isMaxSignedValue())
return ConstantRange(W);
return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
}
- case ICmpInst::ICMP_UGT: {
+ case CmpInst::ICMP_UGT: {
APInt UMin(CR.getUnsignedMin());
if (UMin.isMaxValue())
return ConstantRange(W, /* empty */ false);
return ConstantRange(UMin + 1, APInt::getNullValue(W));
}
- case ICmpInst::ICMP_SGT: {
+ case CmpInst::ICMP_SGT: {
APInt SMin(CR.getSignedMin());
if (SMin.isMaxSignedValue())
return ConstantRange(W, /* empty */ false);
return ConstantRange(SMin + 1, APInt::getSignedMinValue(W));
}
- case ICmpInst::ICMP_UGE: {
+ case CmpInst::ICMP_UGE: {
APInt UMin(CR.getUnsignedMin());
if (UMin.isMinValue())
return ConstantRange(W);
return ConstantRange(UMin, APInt::getNullValue(W));
}
- case ICmpInst::ICMP_SGE: {
+ case CmpInst::ICMP_SGE: {
APInt SMin(CR.getSignedMin());
if (SMin.isMinSignedValue())
return ConstantRange(W);
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index 899c389..263114c 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -12,6 +12,7 @@
#include "llvm/Config/config.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/ThreadLocal.h"
+#include "llvm/Support/ErrorHandling.h"
#include <setjmp.h>
#include <cstdio>
using namespace llvm;
@@ -123,7 +124,56 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
#ifdef LLVM_ON_WIN32
-// FIXME: No real Win32 implementation currently.
+#include "Windows/Windows.h"
+
+// On Windows, we can make use of vectored exception handling to
+// catch most crashing situations. Note that this does mean
+// we will be alerted of exceptions *before* structured exception
+// handling has the opportunity to catch them. That is unlikely to
+// cause problems, however, because SEH is not used anywhere in the
+// project.
+//
+// Vectored exception handling is built on top of SEH, and so it
+// works on a per-thread basis.
+//
+// The vectored exception handler functionality was added in Windows
+// XP, so if support for older versions of Windows is required,
+// it will have to be added.
+//
+// If we want to support as far back as Win2k, we could use the
+// SetUnhandledExceptionFilter API, but there's a risk of that
+// being entirely overwritten (it's not a chain).
+
+static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
+{
+ // Lookup the current thread local recovery object.
+ const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+
+ if (!CRCI) {
+ // Something has gone horribly wrong, so let's just tell everyone
+ // to keep searching
+ CrashRecoveryContext::Disable();
+ return EXCEPTION_CONTINUE_SEARCH;
+ }
+
+ // TODO: We can capture the stack backtrace here and store it on the
+ // implementation if we so choose.
+
+ // Handle the crash
+ const_cast<CrashRecoveryContextImpl*>(CRCI)->HandleCrash();
+
+ // Note that we don't actually get here because HandleCrash calls
+ // longjmp, which means the HandleCrash function never returns.
+ llvm_unreachable("Handled the crash, should have longjmp'ed out of here");
+ return EXCEPTION_CONTINUE_SEARCH;
+}
+
+// Because the Enable and Disable calls are static, it means that
+// there may not actually be an Impl available, or even a current
+// CrashRecoveryContext at all. So we make use of a thread-local
+// exception table. The handles contained in here will either be
+// non-NULL, valid VEH handles, or NULL.
+static sys::ThreadLocal<const void> sCurrentExceptionHandle;
void CrashRecoveryContext::Enable() {
sys::ScopedLock L(gCrashRecoveryContexMutex);
@@ -132,6 +182,13 @@ void CrashRecoveryContext::Enable() {
return;
gCrashRecoveryEnabled = true;
+
+ // We can set up vectored exception handling now. We will install our
+ // handler as the front of the list, though there's no assurances that
+ // it will remain at the front (another call could install itself before
+ // our handler). This 1) isn't likely, and 2) shouldn't cause problems.
+ PVOID handle = ::AddVectoredExceptionHandler(1, ExceptionHandler);
+ sCurrentExceptionHandle.set(handle);
}
void CrashRecoveryContext::Disable() {
@@ -141,6 +198,15 @@ void CrashRecoveryContext::Disable() {
return;
gCrashRecoveryEnabled = false;
+
+ PVOID currentHandle = const_cast<PVOID>(sCurrentExceptionHandle.get());
+ if (currentHandle) {
+ // Now we can remove the vectored exception handler from the chain
+ ::RemoveVectoredExceptionHandler(currentHandle);
+
+ // Reset the handle in our thread-local set.
+ sCurrentExceptionHandle.set(NULL);
+ }
}
#else
diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp
new file mode 100644
index 0000000..b946c1d
--- /dev/null
+++ b/lib/Support/DataExtractor.cpp
@@ -0,0 +1,175 @@
+//===-- DataExtractor.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/SwapByteOrder.h"
+using namespace llvm;
+
+template <typename T>
+static T getU(uint32_t *offset_ptr, const DataExtractor *de,
+ bool isLittleEndian, const char *Data) {
+ T val = 0;
+ uint32_t offset = *offset_ptr;
+ if (de->isValidOffsetForDataOfSize(offset, sizeof(val))) {
+ std::memcpy(&val, &Data[offset], sizeof(val));
+ if (sys::isLittleEndianHost() != isLittleEndian)
+ val = sys::SwapByteOrder(val);
+
+ // Advance the offset
+ *offset_ptr += sizeof(val);
+ }
+ return val;
+}
+
+template <typename T>
+static T *getUs(uint32_t *offset_ptr, T *dst, uint32_t count,
+ const DataExtractor *de, bool isLittleEndian, const char *Data){
+ uint32_t offset = *offset_ptr;
+
+ if (count > 0 && de->isValidOffsetForDataOfSize(offset, sizeof(*dst)*count)) {
+ for (T *value_ptr = dst, *end = dst + count; value_ptr != end;
+ ++value_ptr, offset += sizeof(*dst))
+ *value_ptr = getU<T>(offset_ptr, de, isLittleEndian, Data);
+ // Advance the offset
+ *offset_ptr = offset;
+ // Return a non-NULL pointer to the converted data as an indicator of
+ // success
+ return dst;
+ }
+ return NULL;
+}
+
+uint8_t DataExtractor::getU8(uint32_t *offset_ptr) const {
+ return getU<uint8_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint8_t *
+DataExtractor::getU8(uint32_t *offset_ptr, uint8_t *dst, uint32_t count) const {
+ return getUs<uint8_t>(offset_ptr, dst, count, this, IsLittleEndian,
+ Data.data());
+}
+
+
+uint16_t DataExtractor::getU16(uint32_t *offset_ptr) const {
+ return getU<uint16_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint16_t *DataExtractor::getU16(uint32_t *offset_ptr, uint16_t *dst,
+ uint32_t count) const {
+ return getUs<uint16_t>(offset_ptr, dst, count, this, IsLittleEndian,
+ Data.data());
+}
+
+uint32_t DataExtractor::getU32(uint32_t *offset_ptr) const {
+ return getU<uint32_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint32_t *DataExtractor::getU32(uint32_t *offset_ptr, uint32_t *dst,
+ uint32_t count) const {
+ return getUs<uint32_t>(offset_ptr, dst, count, this, IsLittleEndian,
+                         Data.data());
+}
+
+uint64_t DataExtractor::getU64(uint32_t *offset_ptr) const {
+ return getU<uint64_t>(offset_ptr, this, IsLittleEndian, Data.data());
+}
+
+uint64_t *DataExtractor::getU64(uint32_t *offset_ptr, uint64_t *dst,
+ uint32_t count) const {
+ return getUs<uint64_t>(offset_ptr, dst, count, this, IsLittleEndian,
+ Data.data());
+}
+
+uint64_t
+DataExtractor::getUnsigned(uint32_t *offset_ptr, uint32_t byte_size) const {
+ switch (byte_size) {
+ case 1:
+ return getU8(offset_ptr);
+ case 2:
+ return getU16(offset_ptr);
+ case 4:
+ return getU32(offset_ptr);
+ case 8:
+ return getU64(offset_ptr);
+ }
+ llvm_unreachable("getUnsigned unhandled case!");
+}
+
+int64_t
+DataExtractor::getSigned(uint32_t *offset_ptr, uint32_t byte_size) const {
+ switch (byte_size) {
+ case 1:
+ return (int8_t)getU8(offset_ptr);
+ case 2:
+ return (int16_t)getU16(offset_ptr);
+ case 4:
+ return (int32_t)getU32(offset_ptr);
+ case 8:
+ return (int64_t)getU64(offset_ptr);
+ }
+ llvm_unreachable("getSigned unhandled case!");
+}
+
+const char *DataExtractor::getCStr(uint32_t *offset_ptr) const {
+ uint32_t offset = *offset_ptr;
+ StringRef::size_type pos = Data.find('\0', offset);
+ if (pos != StringRef::npos) {
+ *offset_ptr = pos + 1;
+ return Data.data() + offset;
+ }
+ return NULL;
+}
+
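A minimal usage sketch (not part of the patch) combining the fixed-width and string
readers, assuming a little-endian buffer:

    #include "llvm/Support/DataExtractor.h"
    using namespace llvm;

    void dataExtractorExample() {            // illustrative name
      const char bytes[] = "\x2a\x00hi";     // u16 LE 42, then C string "hi"
      DataExtractor DE(StringRef(bytes, 5), /*IsLittleEndian=*/true,
                       /*AddressSize=*/4);
      uint32_t off = 0;
      uint16_t v = DE.getU16(&off);          // 42; off advances to 2
      const char *s = DE.getCStr(&off);      // "hi"; off advances past the NUL
      (void)v; (void)s;
    }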
+uint64_t DataExtractor::getULEB128(uint32_t *offset_ptr) const {
+ uint64_t result = 0;
+ if (Data.empty())
+ return 0;
+
+ unsigned shift = 0;
+ uint32_t offset = *offset_ptr;
+ uint8_t byte = 0;
+
+ while (isValidOffset(offset)) {
+ byte = Data[offset++];
+    result |= uint64_t(byte & 0x7f) << shift;
+ shift += 7;
+ if ((byte & 0x80) == 0)
+ break;
+ }
+
+ *offset_ptr = offset;
+ return result;
+}
+
+int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const {
+ int64_t result = 0;
+ if (Data.empty())
+ return 0;
+
+ unsigned shift = 0;
+ uint32_t offset = *offset_ptr;
+ uint8_t byte = 0;
+
+ while (isValidOffset(offset)) {
+ byte = Data[offset++];
+    result |= uint64_t(byte & 0x7f) << shift;
+ shift += 7;
+ if ((byte & 0x80) == 0)
+ break;
+ }
+
+ // Sign bit of byte is 2nd high order bit (0x40)
+ if (shift < 64 && (byte & 0x40))
+    result |= -(int64_t(1) << shift);
+
+ *offset_ptr = offset;
+ return result;
+}
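A worked example (not part of the patch): SLEB128 0x7f is a single byte, so the
loop ends with shift == 7 and the 0x40 sign bit set; the decoder then ORs in
-(1 << 7) and returns 0x7f - 0x80 == -1. Similarly { 0xff, 0x7e } decodes to
(0x7f | (0x7e << 7)) - (1 << 14) == 16255 - 16384 == -129.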
diff --git a/lib/Support/Disassembler.cpp b/lib/Support/Disassembler.cpp
index 6362aff..c6d73bc 100644
--- a/lib/Support/Disassembler.cpp
+++ b/lib/Support/Disassembler.cpp
@@ -1,4 +1,4 @@
-//===- lib/System/Disassembler.cpp ------------------------------*- C++ -*-===//
+//===- lib/Support/Disassembler.cpp -----------------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 0813321..95a9550 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -82,6 +82,19 @@ const char *llvm::dwarf::TagString(unsigned Tag) {
case DW_TAG_arg_variable: return "DW_TAG_arg_variable";
case DW_TAG_return_variable: return "DW_TAG_return_variable";
case DW_TAG_vector_type: return "DW_TAG_vector_type";
+ case DW_TAG_rvalue_reference_type: return "DW_TAG_rvalue_reference_type";
+ case DW_TAG_template_alias: return "DW_TAG_template_alias";
+ case DW_TAG_MIPS_loop: return "DW_TAG_MIPS_loop";
+ case DW_TAG_type_unit: return "DW_TAG_type_unit";
+ case DW_TAG_format_label: return "DW_TAG_format_label";
+ case DW_TAG_function_template: return "DW_TAG_function_template";
+ case DW_TAG_class_template: return "DW_TAG_class_template";
+ case DW_TAG_GNU_template_template_param:
+ return "DW_TAG_GNU_template_template_param";
+ case DW_TAG_GNU_template_parameter_pack:
+ return "DW_TAG_GNU_template_parameter_pack";
+ case DW_TAG_GNU_formal_parameter_pack:
+ return "DW_TAG_GNU_formal_parameter_pack";
}
return 0;
}
@@ -186,7 +199,30 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) {
case DW_AT_elemental: return "DW_AT_elemental";
case DW_AT_pure: return "DW_AT_pure";
case DW_AT_recursive: return "DW_AT_recursive";
+ case DW_AT_signature: return "DW_AT_signature";
+ case DW_AT_main_subprogram: return "DW_AT_main_subprogram";
+ case DW_AT_data_bit_offset: return "DW_AT_data_bit_offset";
+ case DW_AT_const_expr: return "DW_AT_const_expr";
+ case DW_AT_enum_class: return "DW_AT_enum_class";
+ case DW_AT_linkage_name: return "DW_AT_linkage_name";
+ case DW_AT_MIPS_loop_begin: return "DW_AT_MIPS_loop_begin";
+ case DW_AT_MIPS_tail_loop_begin: return "DW_AT_MIPS_tail_loop_begin";
+ case DW_AT_MIPS_epilog_begin: return "DW_AT_MIPS_epilog_begin";
+ case DW_AT_MIPS_loop_unroll_factor: return "DW_AT_MIPS_loop_unroll_factor";
+ case DW_AT_MIPS_software_pipeline_depth:
+ return "DW_AT_MIPS_software_pipeline_depth";
case DW_AT_MIPS_linkage_name: return "DW_AT_MIPS_linkage_name";
+ case DW_AT_MIPS_stride: return "DW_AT_MIPS_stride";
+ case DW_AT_MIPS_abstract_name: return "DW_AT_MIPS_abstract_name";
+ case DW_AT_MIPS_clone_origin: return "DW_AT_MIPS_clone_origin";
+ case DW_AT_MIPS_has_inlines: return "DW_AT_MIPS_has_inlines";
+ case DW_AT_MIPS_stride_byte: return "DW_AT_MIPS_stride_byte";
+ case DW_AT_MIPS_stride_elem: return "DW_AT_MIPS_stride_elem";
+ case DW_AT_MIPS_ptr_dopetype: return "DW_AT_MIPS_ptr_dopetype";
+ case DW_AT_MIPS_allocatable_dopetype:
+ return "DW_AT_MIPS_allocatable_dopetype";
+ case DW_AT_MIPS_assumed_shape_dopetype:
+ return "DW_AT_MIPS_assumed_shape_dopetype";
case DW_AT_sf_names: return "DW_AT_sf_names";
case DW_AT_src_info: return "DW_AT_src_info";
case DW_AT_mac_info: return "DW_AT_mac_info";
@@ -194,6 +230,8 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) {
case DW_AT_body_begin: return "DW_AT_body_begin";
case DW_AT_body_end: return "DW_AT_body_end";
case DW_AT_GNU_vector: return "DW_AT_GNU_vector";
+ case DW_AT_GNU_template_name: return "DW_AT_GNU_template_name";
+ case DW_AT_MIPS_assumed_size: return "DW_AT_MIPS_assumed_size";
case DW_AT_lo_user: return "DW_AT_lo_user";
case DW_AT_hi_user: return "DW_AT_hi_user";
case DW_AT_APPLE_optimized: return "DW_AT_APPLE_optimized";
@@ -237,6 +275,10 @@ const char *llvm::dwarf::FormEncodingString(unsigned Encoding) {
case DW_FORM_ref8: return "DW_FORM_ref8";
case DW_FORM_ref_udata: return "DW_FORM_ref_udata";
case DW_FORM_indirect: return "DW_FORM_indirect";
+ case DW_FORM_sec_offset: return "DW_FORM_sec_offset";
+ case DW_FORM_exprloc: return "DW_FORM_exprloc";
+ case DW_FORM_flag_present: return "DW_FORM_flag_present";
+ case DW_FORM_ref_sig8: return "DW_FORM_ref_sig8";
}
return 0;
}
@@ -397,6 +439,8 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
case DW_OP_form_tls_address: return "DW_OP_form_tls_address";
case DW_OP_call_frame_cfa: return "DW_OP_call_frame_cfa";
case DW_OP_bit_piece: return "DW_OP_bit_piece";
+ case DW_OP_implicit_value: return "DW_OP_implicit_value";
+ case DW_OP_stack_value: return "DW_OP_stack_value";
case DW_OP_lo_user: return "DW_OP_lo_user";
case DW_OP_hi_user: return "DW_OP_hi_user";
}
@@ -416,6 +460,7 @@ const char *llvm::dwarf::AttributeEncodingString(unsigned Encoding) {
case DW_ATE_unsigned: return "DW_ATE_unsigned";
case DW_ATE_unsigned_char: return "DW_ATE_unsigned_char";
case DW_ATE_imaginary_float: return "DW_ATE_imaginary_float";
+ case DW_ATE_UTF: return "DW_ATE_UTF";
case DW_ATE_packed_decimal: return "DW_ATE_packed_decimal";
case DW_ATE_numeric_string: return "DW_ATE_numeric_string";
case DW_ATE_edited: return "DW_ATE_edited";
@@ -602,6 +647,7 @@ const char *llvm::dwarf::LNExtendedString(unsigned Encoding) {
case DW_LNE_end_sequence: return "DW_LNE_end_sequence";
case DW_LNE_set_address: return "DW_LNE_set_address";
case DW_LNE_define_file: return "DW_LNE_define_file";
+ case DW_LNE_set_discriminator: return "DW_LNE_set_discriminator";
case DW_LNE_lo_user: return "DW_LNE_lo_user";
case DW_LNE_hi_user: return "DW_LNE_hi_user";
}
@@ -651,6 +697,9 @@ const char *llvm::dwarf::CallFrameString(unsigned Encoding) {
case DW_CFA_val_offset: return "DW_CFA_val_offset";
case DW_CFA_val_offset_sf: return "DW_CFA_val_offset_sf";
case DW_CFA_val_expression: return "DW_CFA_val_expression";
+ case DW_CFA_MIPS_advance_loc8: return "DW_CFA_MIPS_advance_loc8";
+ case DW_CFA_GNU_window_save: return "DW_CFA_GNU_window_save";
+ case DW_CFA_GNU_args_size: return "DW_CFA_GNU_args_size";
case DW_CFA_lo_user: return "DW_CFA_lo_user";
case DW_CFA_hi_user: return "DW_CFA_hi_user";
}
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index 455c380..fb02c07 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -9,28 +9,26 @@
//
// This header file implements the operating system DynamicLibrary concept.
//
-// FIXME: This file leaks the ExplicitSymbols and OpenedHandles vector, and is
-// not thread safe!
+// FIXME: This file leaks ExplicitSymbols and OpenedHandles!
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Config/config.h"
#include <cstdio>
#include <cstring>
-#include <map>
-#include <vector>
// Collection of symbol name/value pairs to be searched prior to any libraries.
-static std::map<std::string, void*> *ExplicitSymbols = 0;
+static llvm::StringMap<void *> *ExplicitSymbols = 0;
namespace {
struct ExplicitSymbolsDeleter {
~ExplicitSymbolsDeleter() {
- if (ExplicitSymbols)
- delete ExplicitSymbols;
+ delete ExplicitSymbols;
}
};
@@ -38,13 +36,22 @@ struct ExplicitSymbolsDeleter {
static ExplicitSymbolsDeleter Dummy;
-void llvm::sys::DynamicLibrary::AddSymbol(const char* symbolName,
+
+static llvm::sys::SmartMutex<true>& getMutex() {
+ static llvm::sys::SmartMutex<true> HandlesMutex;
+ return HandlesMutex;
+}
+
+void llvm::sys::DynamicLibrary::AddSymbol(StringRef symbolName,
void *symbolValue) {
+ SmartScopedLock<true> lock(getMutex());
if (ExplicitSymbols == 0)
- ExplicitSymbols = new std::map<std::string, void*>();
+ ExplicitSymbols = new llvm::StringMap<void*>();
(*ExplicitSymbols)[symbolName] = symbolValue;
}
+char llvm::sys::DynamicLibrary::Invalid = 0;
+
#ifdef LLVM_ON_WIN32
#include "Windows/DynamicLibrary.inc"
@@ -61,66 +68,78 @@ using namespace llvm::sys;
//=== independent code.
//===----------------------------------------------------------------------===//
-static std::vector<void *> *OpenedHandles = 0;
-
-
-static SmartMutex<true>& getMutex() {
- static SmartMutex<true> HandlesMutex;
- return HandlesMutex;
-}
+static DenseSet<void *> *OpenedHandles = 0;
+DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
+ std::string *errMsg) {
+ SmartScopedLock<true> lock(getMutex());
-bool DynamicLibrary::LoadLibraryPermanently(const char *Filename,
- std::string *ErrMsg) {
- void *H = dlopen(Filename, RTLD_LAZY|RTLD_GLOBAL);
- if (H == 0) {
- if (ErrMsg) *ErrMsg = dlerror();
- return true;
+ void *handle = dlopen(filename, RTLD_LAZY|RTLD_GLOBAL);
+ if (handle == 0) {
+ if (errMsg) *errMsg = dlerror();
+ return DynamicLibrary();
}
+
#ifdef __CYGWIN__
// Cygwin searches symbols only in the main executable, using the
// handle from dlopen(NULL, RTLD_GLOBAL).
- if (Filename == NULL)
- H = RTLD_DEFAULT;
+ if (filename == NULL)
+ handle = RTLD_DEFAULT;
#endif
- SmartScopedLock<true> Lock(getMutex());
+
if (OpenedHandles == 0)
- OpenedHandles = new std::vector<void *>();
- OpenedHandles->push_back(H);
- return false;
+ OpenedHandles = new DenseSet<void *>();
+
+ // If we've already loaded this library, dlclose() the handle in order to
+ // keep the internal refcount at +1.
+ if (!OpenedHandles->insert(handle).second)
+ dlclose(handle);
+
+ return DynamicLibrary(handle);
+}
+
+void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+ if (!isValid())
+ return NULL;
+ return dlsym(Data, symbolName);
}
+
#else
using namespace llvm;
using namespace llvm::sys;
-bool DynamicLibrary::LoadLibraryPermanently(const char *Filename,
- std::string *ErrMsg) {
- if (ErrMsg) *ErrMsg = "dlopen() not supported on this platform";
- return true;
+DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
+ std::string *errMsg) {
+ if (errMsg) *errMsg = "dlopen() not supported on this platform";
+ return DynamicLibrary();
}
+
+void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+ return NULL;
+}
+
#endif
namespace llvm {
void *SearchForAddressOfSpecialSymbol(const char* symbolName);
}
-void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
+void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
+ SmartScopedLock<true> Lock(getMutex());
+
// First check symbols added via AddSymbol().
if (ExplicitSymbols) {
- std::map<std::string, void *>::iterator I =
- ExplicitSymbols->find(symbolName);
- std::map<std::string, void *>::iterator E = ExplicitSymbols->end();
+ StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
- if (I != E)
- return I->second;
+ if (i != ExplicitSymbols->end())
+ return i->second;
}
#if HAVE_DLFCN_H
// Now search the libraries.
- SmartScopedLock<true> Lock(getMutex());
if (OpenedHandles) {
- for (std::vector<void *>::iterator I = OpenedHandles->begin(),
+ for (DenseSet<void *>::iterator I = OpenedHandles->begin(),
E = OpenedHandles->end(); I != E; ++I) {
//lt_ptr ptr = lt_dlsym(*I, symbolName);
void *ptr = dlsym(*I, symbolName);
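The dlclose()-on-duplicate logic above works because dlopen() bumps a per-library reference count on every call and returns the same handle for an already-loaded library; keeping one entry per handle in the set pins the count at exactly +1. A standalone sketch of the trick (POSIX dlfcn API, illustrative names):

    #include <dlfcn.h>
    #include <set>

    static std::set<void *> Opened;

    // Open a library so that the set holds exactly one reference to it, no
    // matter how many times the same path is requested.
    void *openOnce(const char *Path) {
      void *H = dlopen(Path, RTLD_LAZY | RTLD_GLOBAL);
      if (!H)
        return 0;
      // A repeat dlopen() returns the old handle with the refcount bumped;
      // drop the extra reference right away.
      if (!Opened.insert(H).second)
        dlclose(H);
      return H;
    }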
diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp
index 1568342..17b8271 100644
--- a/lib/Support/FoldingSet.cpp
+++ b/lib/Support/FoldingSet.cpp
@@ -64,10 +64,8 @@ void FoldingSetNodeID::AddPointer(const void *Ptr) {
// depend on the host. It doesn't matter, however, because hashing on
// pointer values is inherently unstable. Nothing should depend on the
// ordering of nodes in the folding set.
- intptr_t PtrI = (intptr_t)Ptr;
- Bits.push_back(unsigned(PtrI));
- if (sizeof(intptr_t) > sizeof(unsigned))
- Bits.push_back(unsigned(uint64_t(PtrI) >> 32));
+ Bits.append(reinterpret_cast<unsigned *>(&Ptr),
+ reinterpret_cast<unsigned *>(&Ptr+1));
}
void FoldingSetNodeID::AddInteger(signed I) {
Bits.push_back(I);
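The rewritten AddPointer() hashes the pointer's own bytes by viewing the pointer object as an array of unsigned words. This assumes sizeof(void*) is a multiple of sizeof(unsigned), which holds on the 32- and 64-bit hosts LLVM supports; a minimal sketch of the same reinterpretation:

    #include <vector>

    void appendPointerWords(std::vector<unsigned> &Bits, const void *Ptr) {
      // Reinterpret the pointer object itself (not the pointee) as words.
      const unsigned *Words = reinterpret_cast<const unsigned *>(&Ptr);
      const unsigned *End = reinterpret_cast<const unsigned *>(&Ptr + 1);
      Bits.insert(Bits.end(), Words, End);
    }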
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index c525a12..a19e4b4 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -213,13 +213,13 @@ std::string sys::getHostCPUName() {
case 30: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz.
// As found in a Summer 2010 model iMac.
case 37: // Intel Core i7, laptop version.
+ case 44: // Intel Core i7 processor and Intel Xeon processor. All
+ // processors are manufactured using the 32 nm process.
return "corei7";
// SandyBridge:
case 42: // Intel Core i7 processor. All processors are manufactured
// using the 32 nm process.
- case 44: // Intel Core i7 processor and Intel Xeon processor. All
- // processors are manufactured using the 32 nm process.
case 45:
return "corei7-avx";
diff --git a/lib/Support/IncludeFile.cpp b/lib/Support/IncludeFile.cpp
index 5da8826..e67acb3 100644
--- a/lib/Support/IncludeFile.cpp
+++ b/lib/Support/IncludeFile.cpp
@@ -1,4 +1,4 @@
-//===- lib/System/IncludeFile.cpp - Ensure Linking Of Implementation -----===//
+//===- lib/Support/IncludeFile.cpp - Ensure Linking Of Implementation ----===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Support/Memory.cpp b/lib/Support/Memory.cpp
index a9689b2..2a1642a 100644
--- a/lib/Support/Memory.cpp
+++ b/lib/Support/Memory.cpp
@@ -16,6 +16,10 @@
#include "llvm/Support/Valgrind.h"
#include "llvm/Config/config.h"
+#if defined(__mips__)
+#include <sys/cachectl.h>
+#endif
+
namespace llvm {
using namespace sys;
}
@@ -66,6 +70,8 @@ void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr,
char *Start = (char*) Addr;
char *End = Start + Len;
__clear_cache(Start, End);
+# elif defined(__mips__)
+ cacheflush((char*)Addr, Len, BCACHE);
# endif
#endif // end apple
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index d264be9..0771af5 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -275,16 +275,16 @@ static bool shouldUseMmap(int FD,
error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
OwningPtr<MemoryBuffer> &result,
- size_t FileSize, size_t MapSize,
- off_t Offset,
+ uint64_t FileSize, uint64_t MapSize,
+ int64_t Offset,
bool RequiresNullTerminator) {
static int PageSize = sys::Process::GetPageSize();
// Default is to map the full file.
- if (MapSize == size_t(-1)) {
+ if (MapSize == uint64_t(-1)) {
// If we don't know the file size, use fstat to find out. fstat on an open
// file descriptor is cheaper than stat on a random path.
- if (FileSize == size_t(-1)) {
+ if (FileSize == uint64_t(-1)) {
struct stat FileInfo;
// TODO: This should use fstat64 when available.
if (fstat(FD, &FileInfo) == -1) {
diff --git a/lib/Support/MemoryObject.cpp b/lib/Support/MemoryObject.cpp
index 91e3ecd..b20ab89 100644
--- a/lib/Support/MemoryObject.cpp
+++ b/lib/Support/MemoryObject.cpp
@@ -19,8 +19,11 @@ int MemoryObject::readBytes(uint64_t address,
uint64_t* copied) const {
uint64_t current = address;
uint64_t limit = getBase() + getExtent();
-
- while (current - address < size && current < limit) {
+
+ if (current + size > limit)
+ return -1;
+
+ while (current - address < size) {
if (readByte(current, &buf[(current - address)]))
return -1;
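The hoisted test changes the contract slightly: a request that would run past the extent now fails up front instead of copying a partial prefix. Note that the check as written computes current + size, which could in principle wrap for extreme inputs; a more defensive variant of the same precheck (an alternative sketch, not what the code above does) avoids the addition:

    #include <stdint.h>

    // True iff [addr, addr+size) lies inside [base, base+extent), written
    // without computing addr + size so the test itself cannot overflow.
    static bool rangeInBounds(uint64_t addr, uint64_t size,
                              uint64_t base, uint64_t extent) {
      uint64_t offset = addr - base;
      return addr >= base && offset <= extent && size <= extent - offset;
    }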
diff --git a/lib/Support/Mutex.cpp b/lib/Support/Mutex.cpp
index b408973..8874e94 100644
--- a/lib/Support/Mutex.cpp
+++ b/lib/Support/Mutex.cpp
@@ -152,6 +152,6 @@ MutexImpl::tryacquire()
#elif defined( LLVM_ON_WIN32)
#include "Windows/Mutex.inc"
#else
-#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in System/Mutex.cpp
+#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in Support/Mutex.cpp
#endif
#endif
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index 8fbaf2d..e5b7cd3 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -121,7 +121,7 @@ sys::IdentifyFileType(const char *magic, unsigned length) {
case 7: return Mach_O_DynamicLinker_FileType;
case 8: return Mach_O_Bundle_FileType;
case 9: return Mach_O_DynamicallyLinkedSharedLibStub_FileType;
- case 10: break; // FIXME: MH_DSYM companion file with only debug.
+ case 10: return Mach_O_DSYMCompanion_FileType;
}
break;
}
diff --git a/lib/Support/PathV2.cpp b/lib/Support/PathV2.cpp
index 896c94c..bebe442 100644
--- a/lib/Support/PathV2.cpp
+++ b/lib/Support/PathV2.cpp
@@ -490,6 +490,36 @@ bool is_separator(char value) {
}
}
+void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result) {
+ result.clear();
+
+ // Check whether the temporary directory is specified by an environment
+ // variable.
+ const char *EnvironmentVariable;
+#ifdef LLVM_ON_WIN32
+ EnvironmentVariable = "TEMP";
+#else
+ EnvironmentVariable = "TMPDIR";
+#endif
+ if (char *RequestedDir = getenv(EnvironmentVariable)) {
+ result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+ return;
+ }
+
+ // Fall back to a system default.
+ const char *DefaultResult;
+#ifdef LLVM_ON_WIN32
+ (void)erasedOnReboot;
+ DefaultResult = "C:\\TEMP";
+#else
+ if (erasedOnReboot)
+ DefaultResult = "/tmp";
+ else
+ DefaultResult = "/var/tmp";
+#endif
+ result.append(DefaultResult, DefaultResult + strlen(DefaultResult));
+}
+
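The lookup order here, environment variable first and platform default second, is the conventional pattern for temp-directory discovery. A compact sketch of the same logic outside LLVM's SmallVector API (function name is illustrative):

    #include <cstdlib>
    #include <string>

    std::string tempDirectory(bool erasedOnReboot) {
    #ifdef _WIN32
      const char *Env = std::getenv("TEMP");
      const char *Fallback = "C:\\TEMP";
    #else
      const char *Env = std::getenv("TMPDIR");
      const char *Fallback = erasedOnReboot ? "/tmp" : "/var/tmp";
    #endif
      return Env ? std::string(Env) : std::string(Fallback);
    }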
bool has_root_name(const Twine &path) {
SmallString<128> path_storage;
StringRef p = path.toStringRef(path_storage);
@@ -626,7 +656,7 @@ error_code create_directories(const Twine &path, bool &existed) {
if (error_code ec = fs::exists(parent, parent_exists)) return ec;
if (!parent_exists)
- return create_directories(parent, existed);
+ if (error_code ec = create_directories(parent, existed)) return ec;
return create_directory(p, existed);
}
@@ -682,14 +712,12 @@ bool is_other(file_status status) {
!is_symlink(status);
}
-void directory_entry::replace_filename(const Twine &filename, file_status st,
- file_status symlink_st) {
+void directory_entry::replace_filename(const Twine &filename, file_status st) {
SmallString<128> path(Path.begin(), Path.end());
path::remove_filename(path);
path::append(path, filename);
Path = path.str();
Status = st;
- SymlinkStatus = symlink_st;
}
error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index 082b701..ef33073 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -62,7 +62,7 @@ extern "C" {
CRASH_REPORTER_CLIENT_HIDDEN
struct crashreporter_annotations_t gCRAnnotations
__attribute__((section("__DATA," CRASHREPORTER_ANNOTATIONS_SECTION)))
- = { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0 };
+ = { CRASHREPORTER_ANNOTATIONS_VERSION, 0, 0, 0, 0, 0, 0 };
}
#elif defined (__APPLE__) && HAVE_CRASHREPORTER_INFO
static const char *__crashreporter_info__ = 0;
diff --git a/lib/Support/RWMutex.cpp b/lib/Support/RWMutex.cpp
index fc02f9c..d0b1e10 100644
--- a/lib/Support/RWMutex.cpp
+++ b/lib/Support/RWMutex.cpp
@@ -152,6 +152,6 @@ RWMutexImpl::writer_release()
#elif defined( LLVM_ON_WIN32)
#include "Windows/RWMutex.inc"
#else
-#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in System/Mutex.cpp
+#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in Support/Mutex.cpp
#endif
#endif
diff --git a/lib/Support/SearchForAddressOfSpecialSymbol.cpp b/lib/Support/SearchForAddressOfSpecialSymbol.cpp
index d638301..2d23902 100644
--- a/lib/Support/SearchForAddressOfSpecialSymbol.cpp
+++ b/lib/Support/SearchForAddressOfSpecialSymbol.cpp
@@ -28,21 +28,6 @@ static void *DoSearch(const char* symbolName) {
#ifdef __APPLE__
{
- EXPLICIT_SYMBOL(__ashldi3);
- EXPLICIT_SYMBOL(__ashrdi3);
- EXPLICIT_SYMBOL(__cmpdi2);
- EXPLICIT_SYMBOL(__divdi3);
- EXPLICIT_SYMBOL(__fixdfdi);
- EXPLICIT_SYMBOL(__fixsfdi);
- EXPLICIT_SYMBOL(__fixunsdfdi);
- EXPLICIT_SYMBOL(__fixunssfdi);
- EXPLICIT_SYMBOL(__floatdidf);
- EXPLICIT_SYMBOL(__floatdisf);
- EXPLICIT_SYMBOL(__lshrdi3);
- EXPLICIT_SYMBOL(__moddi3);
- EXPLICIT_SYMBOL(__udivdi3);
- EXPLICIT_SYMBOL(__umoddi3);
-
// __eprintf is sometimes used for assert() handling on x86.
//
// FIXME: Currently disabled when using Clang, as we don't always have our
diff --git a/lib/Support/StringExtras.cpp b/lib/Support/StringExtras.cpp
index eb2fa08..49c5ac4 100644
--- a/lib/Support/StringExtras.cpp
+++ b/lib/Support/StringExtras.cpp
@@ -51,11 +51,10 @@ std::pair<StringRef, StringRef> llvm::getToken(StringRef Source,
void llvm::SplitString(StringRef Source,
SmallVectorImpl<StringRef> &OutFragments,
StringRef Delimiters) {
- StringRef S2, S;
- tie(S2, S) = getToken(Source, Delimiters);
- while (!S2.empty()) {
- OutFragments.push_back(S2);
- tie(S2, S) = getToken(S, Delimiters);
+ std::pair<StringRef, StringRef> S = getToken(Source, Delimiters);
+ while (!S.first.empty()) {
+ OutFragments.push_back(S.first);
+ S = getToken(S.second, Delimiters);
}
}
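The new loop pulls one token per iteration: getToken() returns the next fragment plus the unconsumed tail and skips leading delimiters, so runs of delimiters never yield empty fragments. A short usage sketch against the declarations above:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringExtras.h"
    #include "llvm/ADT/StringRef.h"
    using namespace llvm;

    void demo() {
      // "a,b,,c" split on "," yields {"a", "b", "c"}: the empty fragment
      // between the consecutive commas is skipped.
      SmallVector<StringRef, 4> Parts;
      SplitString("a,b,,c", Parts, ",");
    }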
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index 8c3fc09..b5b4f94 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -46,12 +46,12 @@ int StringRef::compare_lower(StringRef RHS) const {
/// compare_numeric - Compare strings, handle embedded numbers.
int StringRef::compare_numeric(StringRef RHS) const {
for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
- if (Data[I] == RHS.Data[I])
- continue;
+ // Check for sequences of digits.
if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
- // The longer sequence of numbers is larger. This doesn't really handle
- // prefixed zeros well.
- for (size_t J = I+1; J != E+1; ++J) {
+ // The longer sequence of numbers is considered larger.
+      // This doesn't really handle leading zeros well.
+ size_t J;
+ for (J = I + 1; J != E + 1; ++J) {
bool ld = J < Length && ascii_isdigit(Data[J]);
bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
if (ld != rd)
@@ -59,8 +59,15 @@ int StringRef::compare_numeric(StringRef RHS) const {
if (!rd)
break;
}
+ // The two number sequences have the same length (J-I), just memcmp them.
+ if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
+ return Res < 0 ? -1 : 1;
+ // Identical number sequences, continue search after the numbers.
+ I = J - 1;
+ continue;
}
- return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
+ if (Data[I] != RHS.Data[I])
+ return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
}
if (Length == RHS.Length)
return 0;
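The effect of the rewrite shows up on embedded numbers: digit runs of different lengths order by length, equal-length runs order by memcmp, and identical runs are stepped over so the scan continues after them. Illustrative expectations (a sketch; assumes llvm/ADT/StringRef.h):

    #include "llvm/ADT/StringRef.h"
    #include <cassert>
    using namespace llvm;

    void checks() {
      // Different-length digit runs order by length: 9 < 10.
      assert(StringRef("a9").compare_numeric("a10") < 0);
      // Equal-length runs fall back to memcmp; identical runs are skipped.
      assert(StringRef("x07y").compare_numeric("x07z") < 0);
      // Leading zeros are not normalized: "007" is the longer run, so it wins.
      assert(StringRef("007").compare_numeric("7") > 0);
    }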
diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp
index 293a5d7..7497bfe 100644
--- a/lib/Support/TargetRegistry.cpp
+++ b/lib/Support/TargetRegistry.cpp
@@ -7,9 +7,13 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
#include <cassert>
+#include <vector>
using namespace llvm;
// Clients are responsible for avoid race conditions in registration.
@@ -90,3 +94,29 @@ const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) {
return TheTarget;
}
+static int TargetArraySortFn(const void *LHS, const void *RHS) {
+ typedef std::pair<StringRef, const Target*> pair_ty;
+ return ((const pair_ty*)LHS)->first.compare(((const pair_ty*)RHS)->first);
+}
+
+void TargetRegistry::printRegisteredTargetsForVersion() {
+ std::vector<std::pair<StringRef, const Target*> > Targets;
+ size_t Width = 0;
+ for (TargetRegistry::iterator I = TargetRegistry::begin(),
+ E = TargetRegistry::end();
+ I != E; ++I) {
+ Targets.push_back(std::make_pair(I->getName(), &*I));
+ Width = std::max(Width, Targets.back().first.size());
+ }
+ array_pod_sort(Targets.begin(), Targets.end(), TargetArraySortFn);
+
+ raw_ostream &OS = outs();
+ OS << " Registered Targets:\n";
+ for (unsigned i = 0, e = Targets.size(); i != e; ++i) {
+ OS << " " << Targets[i].first;
+ OS.indent(Width - Targets[i].first.size()) << " - "
+ << Targets[i].second->getShortDescription() << '\n';
+ }
+ if (Targets.empty())
+ OS << " (none)\n";
+}
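array_pod_sort() is LLVM's thin wrapper over qsort() for POD element types, which is why TargetArraySortFn takes opaque const void* arguments and casts them back. The comparator shape in isolation, with plain std::qsort and invented element names:

    #include <cstddef>
    #include <cstdlib>
    #include <cstring>

    struct Entry { const char *Name; int Id; };

    // qsort-style comparator: opaque pointers in, three-way int out.
    static int compareEntries(const void *LHS, const void *RHS) {
      return std::strcmp(static_cast<const Entry *>(LHS)->Name,
                         static_cast<const Entry *>(RHS)->Name);
    }

    void sortEntries(Entry *Entries, size_t N) {
      std::qsort(Entries, N, sizeof(Entry), compareEntries);
    }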
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp
index 6b43048..fdb251c 100644
--- a/lib/Support/ThreadLocal.cpp
+++ b/lib/Support/ThreadLocal.cpp
@@ -79,6 +79,6 @@ void ThreadLocalImpl::removeInstance() {
#elif defined( LLVM_ON_WIN32)
#include "Windows/ThreadLocal.inc"
#else
-#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 was set in System/ThreadLocal.cpp
+#warning Neither LLVM_ON_UNIX nor LLVM_ON_WIN32 set in Support/ThreadLocal.cpp
#endif
#endif
diff --git a/lib/Support/Threading.cpp b/lib/Support/Threading.cpp
index 2957956..8f0bb93 100644
--- a/lib/Support/Threading.cpp
+++ b/lib/Support/Threading.cpp
@@ -24,7 +24,7 @@ static bool multithreaded_mode = false;
static sys::Mutex* global_lock = 0;
bool llvm::llvm_start_multithreaded() {
-#ifdef LLVM_MULTITHREADED
+#if ENABLE_THREADS != 0
assert(!multithreaded_mode && "Already multithreaded!");
multithreaded_mode = true;
global_lock = new sys::Mutex(true);
@@ -39,7 +39,7 @@ bool llvm::llvm_start_multithreaded() {
}
void llvm::llvm_stop_multithreaded() {
-#ifdef LLVM_MULTITHREADED
+#if ENABLE_THREADS != 0
assert(multithreaded_mode && "Not currently multithreaded!");
// We fence here to ensure that all threaded operations are complete BEFORE we
@@ -63,7 +63,7 @@ void llvm::llvm_release_global_lock() {
if (multithreaded_mode) global_lock->release();
}
-#if defined(LLVM_MULTITHREADED) && defined(HAVE_PTHREAD_H)
+#if ENABLE_THREADS != 0 && defined(HAVE_PTHREAD_H)
#include <pthread.h>
struct ThreadInfo {
@@ -102,13 +102,44 @@ void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
error:
::pthread_attr_destroy(&Attr);
}
+#elif ENABLE_THREADS!=0 && defined(LLVM_ON_WIN32)
+#include "Windows/Windows.h"
+#include <process.h>
-#else
+struct ThreadInfo {
+ void (*func)(void*);
+ void *param;
+};
-// No non-pthread implementation, currently.
+static unsigned __stdcall ThreadCallback(void *param) {
+ struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
+ info->func(info->param);
+
+ return 0;
+}
void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
unsigned RequestedStackSize) {
+ struct ThreadInfo param = { Fn, UserData };
+
+ HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
+ RequestedStackSize, ThreadCallback,
+ &param, 0, NULL);
+
+ if (hThread) {
+ // We actually don't care whether the wait succeeds or fails, in
+ // the same way we don't care whether the pthread_join call succeeds
+ // or fails. There's not much we could do if this were to fail. But
+ // on success, this call will wait until the thread finishes executing
+ // before returning.
+ (void)::WaitForSingleObject(hThread, INFINITE);
+ ::CloseHandle(hThread);
+ }
+}
+#else
+// Fallback for hosts with neither Win32 threads nor pthreads: run inline.
+void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
+ unsigned RequestedStackSize) {
(void) RequestedStackSize;
Fn(UserData);
}
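The Win32 branch follows the usual create-join shape: _beginthreadex() (preferred over CreateThread so the CRT sets up per-thread state) yields a HANDLE that is joined with WaitForSingleObject() and then closed. A reduced Windows-only sketch with an illustrative worker:

    #include <windows.h>
    #include <process.h>

    static unsigned __stdcall worker(void *arg) {
      // ... do the work with arg ...
      return 0;
    }

    void runJoined(void *arg) {
      HANDLE h = (HANDLE)::_beginthreadex(NULL, 0, worker, arg, 0, NULL);
      if (h) {
        ::WaitForSingleObject(h, INFINITE);  // join
        ::CloseHandle(h);                    // release the kernel handle
      }
    }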
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 7e094ee..c61af37 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -8,16 +8,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Triple.h"
-
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Twine.h"
-#include <cassert>
#include <cstring>
using namespace llvm;
-//
-
const char *Triple::getArchTypeName(ArchType Kind) {
switch (Kind) {
case InvalidArch: return "<invalid>";
@@ -29,6 +24,8 @@ const char *Triple::getArchTypeName(ArchType Kind) {
case cellspu: return "cellspu";
case mips: return "mips";
case mipsel: return "mipsel";
+ case mips64: return "mips64";
+ case mips64el:return "mips64el";
case msp430: return "msp430";
case ppc64: return "powerpc64";
case ppc: return "powerpc";
@@ -43,6 +40,8 @@ const char *Triple::getArchTypeName(ArchType Kind) {
case mblaze: return "mblaze";
case ptx32: return "ptx32";
case ptx64: return "ptx64";
+ case le32: return "le32";
+ case amdil: return "amdil";
}
return "<invalid>";
@@ -77,6 +76,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case ptx32: return "ptx";
case ptx64: return "ptx";
+ case le32: return "le32";
+ case amdil: return "amdil";
}
}
@@ -102,6 +103,7 @@ const char *Triple::getOSTypeName(OSType Kind) {
case DragonFly: return "dragonfly";
case FreeBSD: return "freebsd";
case IOS: return "ios";
+ case KFreeBSD: return "kfreebsd";
case Linux: return "linux";
case Lv2: return "lv2";
case MacOSX: return "macosx";
@@ -114,6 +116,7 @@ const char *Triple::getOSTypeName(OSType Kind) {
case Haiku: return "haiku";
case Minix: return "minix";
case RTEMS: return "rtems";
+ case NativeClient: return "nacl";
}
return "<invalid>";
@@ -144,10 +147,16 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
return mips;
if (Name == "mipsel")
return mipsel;
+ if (Name == "mips64")
+ return mips64;
+ if (Name == "mips64el")
+ return mips64el;
if (Name == "msp430")
return msp430;
if (Name == "ppc64")
return ppc64;
+ if (Name == "ppc32")
+ return ppc;
if (Name == "ppc")
return ppc;
if (Name == "mblaze")
@@ -172,6 +181,10 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
return ptx32;
if (Name == "ptx64")
return ptx64;
+ if (Name == "le32")
+ return le32;
+ if (Name == "amdil")
+ return amdil;
return UnknownArch;
}
@@ -207,13 +220,16 @@ Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) {
// This is derived from the driver driver.
if (Str == "arm" || Str == "armv4t" || Str == "armv5" || Str == "xscale" ||
- Str == "armv6" || Str == "armv7")
+ Str == "armv6" || Str == "armv7" || Str == "armv7f" || Str == "armv7k" ||
+ Str == "armv7s")
return Triple::arm;
if (Str == "ptx32")
return Triple::ptx32;
if (Str == "ptx64")
return Triple::ptx64;
+ if (Str == "amdil")
+ return Triple::amdil;
return Triple::UnknownArch;
}
@@ -249,6 +265,10 @@ const char *Triple::getArchNameForAssembler() {
return "ptx32";
if (Str == "ptx64")
return "ptx64";
+ if (Str == "le32")
+ return "le32";
+ if (Str == "amdil")
+ return "amdil";
return NULL;
}
@@ -288,6 +308,10 @@ Triple::ArchType Triple::ParseArch(StringRef ArchName) {
else if (ArchName == "mipsel" || ArchName == "mipsallegrexel" ||
ArchName == "psp")
return mipsel;
+ else if (ArchName == "mips64" || ArchName == "mips64eb")
+ return mips64;
+ else if (ArchName == "mips64el")
+ return mips64el;
else if (ArchName == "sparc")
return sparc;
else if (ArchName == "sparcv9")
@@ -302,6 +326,10 @@ Triple::ArchType Triple::ParseArch(StringRef ArchName) {
return ptx32;
else if (ArchName == "ptx64")
return ptx64;
+ else if (ArchName == "le32")
+ return le32;
+ else if (ArchName == "amdil")
+ return amdil;
else
return UnknownArch;
}
@@ -330,6 +358,8 @@ Triple::OSType Triple::ParseOS(StringRef OSName) {
return FreeBSD;
else if (OSName.startswith("ios"))
return IOS;
+ else if (OSName.startswith("kfreebsd"))
+ return KFreeBSD;
else if (OSName.startswith("linux"))
return Linux;
else if (OSName.startswith("lv2"))
@@ -354,6 +384,8 @@ Triple::OSType Triple::ParseOS(StringRef OSName) {
return Minix;
else if (OSName.startswith("rtems"))
return RTEMS;
+ else if (OSName.startswith("nacl"))
+ return NativeClient;
else
return UnknownOS;
}
diff --git a/lib/Support/Twine.cpp b/lib/Support/Twine.cpp
index d62123c..3d04bc3 100644
--- a/lib/Support/Twine.cpp
+++ b/lib/Support/Twine.cpp
@@ -16,7 +16,7 @@ using namespace llvm;
std::string Twine::str() const {
// If we're storing only a std::string, just return it.
if (LHSKind == StdStringKind && RHSKind == EmptyKind)
- return *static_cast<const std::string*>(LHS);
+ return *LHS.stdString;
// Otherwise, flatten and copy the contents first.
SmallString<256> Vec;
@@ -40,9 +40,9 @@ StringRef Twine::toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const {
switch (getLHSKind()) {
case CStringKind:
// Already null terminated, yay!
- return StringRef(static_cast<const char*>(LHS));
+ return StringRef(LHS.cString);
case StdStringKind: {
- const std::string *str = static_cast<const std::string*>(LHS);
+ const std::string *str = LHS.stdString;
return StringRef(str->c_str(), str->size());
}
default:
@@ -55,48 +55,51 @@ StringRef Twine::toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const {
return StringRef(Out.data(), Out.size());
}
-void Twine::printOneChild(raw_ostream &OS, const void *Ptr,
+void Twine::printOneChild(raw_ostream &OS, Child Ptr,
NodeKind Kind) const {
switch (Kind) {
case Twine::NullKind: break;
case Twine::EmptyKind: break;
case Twine::TwineKind:
- static_cast<const Twine*>(Ptr)->print(OS);
+ Ptr.twine->print(OS);
break;
case Twine::CStringKind:
- OS << static_cast<const char*>(Ptr);
+ OS << Ptr.cString;
break;
case Twine::StdStringKind:
- OS << *static_cast<const std::string*>(Ptr);
+ OS << *Ptr.stdString;
break;
case Twine::StringRefKind:
- OS << *static_cast<const StringRef*>(Ptr);
+ OS << *Ptr.stringRef;
+ break;
+ case Twine::CharKind:
+ OS << Ptr.character;
break;
case Twine::DecUIKind:
- OS << (unsigned)(uintptr_t)Ptr;
+ OS << Ptr.decUI;
break;
case Twine::DecIKind:
- OS << (int)(intptr_t)Ptr;
+ OS << Ptr.decI;
break;
case Twine::DecULKind:
- OS << *static_cast<const unsigned long*>(Ptr);
+ OS << *Ptr.decUL;
break;
case Twine::DecLKind:
- OS << *static_cast<const long*>(Ptr);
+ OS << *Ptr.decL;
break;
case Twine::DecULLKind:
- OS << *static_cast<const unsigned long long*>(Ptr);
+ OS << *Ptr.decULL;
break;
case Twine::DecLLKind:
- OS << *static_cast<const long long*>(Ptr);
+ OS << *Ptr.decLL;
break;
case Twine::UHexKind:
- OS.write_hex(*static_cast<const uint64_t*>(Ptr));
+ OS.write_hex(*Ptr.uHex);
break;
}
}
-void Twine::printOneChildRepr(raw_ostream &OS, const void *Ptr,
+void Twine::printOneChildRepr(raw_ostream &OS, Child Ptr,
NodeKind Kind) const {
switch (Kind) {
case Twine::NullKind:
@@ -105,40 +108,43 @@ void Twine::printOneChildRepr(raw_ostream &OS, const void *Ptr,
OS << "empty"; break;
case Twine::TwineKind:
OS << "rope:";
- static_cast<const Twine*>(Ptr)->printRepr(OS);
+ Ptr.twine->printRepr(OS);
break;
case Twine::CStringKind:
OS << "cstring:\""
- << static_cast<const char*>(Ptr) << "\"";
+ << Ptr.cString << "\"";
break;
case Twine::StdStringKind:
OS << "std::string:\""
- << static_cast<const std::string*>(Ptr) << "\"";
+ << Ptr.stdString << "\"";
break;
case Twine::StringRefKind:
OS << "stringref:\""
- << static_cast<const StringRef*>(Ptr) << "\"";
+ << Ptr.stringRef << "\"";
+ break;
+ case Twine::CharKind:
+ OS << "char:\"" << Ptr.character << "\"";
break;
case Twine::DecUIKind:
- OS << "decUI:\"" << (unsigned)(uintptr_t)Ptr << "\"";
+ OS << "decUI:\"" << Ptr.decUI << "\"";
break;
case Twine::DecIKind:
- OS << "decI:\"" << (int)(intptr_t)Ptr << "\"";
+ OS << "decI:\"" << Ptr.decI << "\"";
break;
case Twine::DecULKind:
- OS << "decUL:\"" << *static_cast<const unsigned long*>(Ptr) << "\"";
+ OS << "decUL:\"" << *Ptr.decUL << "\"";
break;
case Twine::DecLKind:
- OS << "decL:\"" << *static_cast<const long*>(Ptr) << "\"";
+ OS << "decL:\"" << *Ptr.decL << "\"";
break;
case Twine::DecULLKind:
- OS << "decULL:\"" << *static_cast<const unsigned long long*>(Ptr) << "\"";
+ OS << "decULL:\"" << *Ptr.decULL << "\"";
break;
case Twine::DecLLKind:
- OS << "decLL:\"" << *static_cast<const long long*>(Ptr) << "\"";
+ OS << "decLL:\"" << *Ptr.decLL << "\"";
break;
case Twine::UHexKind:
- OS << "uhex:\"" << static_cast<const uint64_t*>(Ptr) << "\"";
+ OS << "uhex:\"" << Ptr.uHex << "\"";
break;
}
}
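Replacing const void* with the Child union trades casts for tagged-union member access: the NodeKind tag says which member is live, and each case reads only that member. The shape in miniature, independent of Twine:

    #include <iostream>

    struct Node {
      enum Kind { IntKind, CStrKind } TheKind;   // tag: which member is live
      union Child {
        int IntVal;
        const char *CStr;
      } TheChild;
    };

    void printNode(std::ostream &OS, const Node &N) {
      switch (N.TheKind) {                       // dispatch on the tag, no casts
      case Node::IntKind:  OS << N.TheChild.IntVal; break;
      case Node::CStrKind: OS << N.TheChild.CStr;   break;
      }
    }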
diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc
index 5fd0e5e..dda3ce2 100644
--- a/lib/Support/Unix/Host.inc
+++ b/lib/Support/Unix/Host.inc
@@ -22,6 +22,7 @@
#include <sys/utsname.h>
#include <cctype>
#include <string>
+#include <cstdlib> // ::getenv
using namespace llvm;
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index f295b92..85c7c40 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -252,8 +252,8 @@ Path::GetUserHomeDirectory() {
Path
Path::GetCurrentDirectory() {
char pathname[MAXPATHLEN];
- if (!getcwd(pathname,MAXPATHLEN)) {
- assert (false && "Could not query current working directory.");
+ if (!getcwd(pathname, MAXPATHLEN)) {
+ assert(false && "Could not query current working directory.");
return Path();
}
diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc
index 03ff283..bbbc344 100644
--- a/lib/Support/Unix/PathV2.inc
+++ b/lib/Support/Unix/PathV2.inc
@@ -42,6 +42,9 @@
#if HAVE_STDIO_H
#include <stdio.h>
#endif
+#if HAVE_LIMITS_H
+#include <limits.h>
+#endif
using namespace llvm;
@@ -342,19 +345,22 @@ error_code status(const Twine &path, file_status &result) {
}
error_code unique_file(const Twine &model, int &result_fd,
- SmallVectorImpl<char> &result_path) {
+ SmallVectorImpl<char> &result_path,
+ bool makeAbsolute) {
SmallString<128> Model;
model.toVector(Model);
// Null terminate.
Model.c_str();
- // Make model absolute by prepending a temp directory if it's not already.
- bool absolute = path::is_absolute(Twine(Model));
- if (!absolute) {
- SmallString<128> TDir;
- if (error_code ec = TempDir(TDir)) return ec;
- path::append(TDir, Twine(Model));
- Model.swap(TDir);
+ if (makeAbsolute) {
+ // Make model absolute by prepending a temp directory if it's not already.
+ bool absolute = path::is_absolute(Twine(Model));
+ if (!absolute) {
+ SmallString<128> TDir;
+ if (error_code ec = TempDir(TDir)) return ec;
+ path::append(TDir, Twine(Model));
+ Model.swap(TDir);
+ }
}
// Replace '%' with random chars. From here on, DO NOT modify model. It may be
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 5cdb11c..da440fd 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -293,3 +293,7 @@ const char *Process::OutputBold(bool bg) {
const char *Process::ResetColor() {
return "\033[0m";
}
+
+void Process::SetWorkingDirectory(std::string Path) {
+ ::chdir(Path.c_str());
+}
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index fc5f580..83da82a 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -39,7 +39,7 @@ using namespace sys;
//=== and must not be UNIX code.
//===----------------------------------------------------------------------===//
-static std::vector<HMODULE> OpenedHandles;
+static DenseSet<HMODULE> *OpenedHandles;
extern "C" {
@@ -63,30 +63,43 @@ extern "C" {
#endif
stricmp(ModuleName, "msvcrt20") != 0 &&
stricmp(ModuleName, "msvcrt40") != 0) {
- OpenedHandles.push_back((HMODULE)ModuleBase);
+ OpenedHandles->insert((HMODULE)ModuleBase);
}
return TRUE;
}
}
-bool DynamicLibrary::LoadLibraryPermanently(const char *filename,
- std::string *ErrMsg) {
- if (filename) {
- HMODULE a_handle = LoadLibrary(filename);
+DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
+ std::string *errMsg) {
+ SmartScopedLock<true> lock(getMutex());
- if (a_handle == 0)
- return MakeErrMsg(ErrMsg, std::string(filename) + ": Can't open : ");
+ if (!filename) {
+ // When no file is specified, enumerate all DLLs and EXEs in the process.
+ if (OpenedHandles == 0)
+ OpenedHandles = new DenseSet<HMODULE>();
- OpenedHandles.push_back(a_handle);
- } else {
- // When no file is specified, enumerate all DLLs and EXEs in the
- // process.
EnumerateLoadedModules(GetCurrentProcess(), ELM_Callback, 0);
+ // Dummy library that represents "search all handles".
+ // This is mostly to ensure that the return value still shows up as "valid".
+ return DynamicLibrary(&OpenedHandles);
}
+
+ HMODULE a_handle = LoadLibrary(filename);
- // Because we don't remember the handle, we will never free it; hence,
- // it is loaded permanently.
- return false;
+ if (a_handle == 0) {
+ MakeErrMsg(errMsg, std::string(filename) + ": Can't open : ");
+ return DynamicLibrary();
+ }
+
+ if (OpenedHandles == 0)
+ OpenedHandles = new DenseSet<HMODULE>();
+
+ // If we've already loaded this library, FreeLibrary() the handle in order to
+ // keep the internal refcount at +1.
+ if (!OpenedHandles->insert(a_handle).second)
+ FreeLibrary(a_handle);
+
+ return DynamicLibrary(a_handle);
}
// Stack probing routines are in the support library (e.g. libgcc), but we don't
@@ -101,21 +114,24 @@ bool DynamicLibrary::LoadLibraryPermanently(const char *filename,
#undef EXPLICIT_SYMBOL2
void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
+ SmartScopedLock<true> Lock(getMutex());
+
// First check symbols added via AddSymbol().
if (ExplicitSymbols) {
- std::map<std::string, void *>::iterator I =
- ExplicitSymbols->find(symbolName);
- std::map<std::string, void *>::iterator E = ExplicitSymbols->end();
- if (I != E)
- return I->second;
+ StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
+
+ if (i != ExplicitSymbols->end())
+ return i->second;
}
// Now search the libraries.
- for (std::vector<HMODULE>::iterator I = OpenedHandles.begin(),
- E = OpenedHandles.end(); I != E; ++I) {
- FARPROC ptr = GetProcAddress((HMODULE)*I, symbolName);
- if (ptr) {
- return (void *)(intptr_t)ptr;
+ if (OpenedHandles) {
+ for (DenseSet<HMODULE>::iterator I = OpenedHandles->begin(),
+ E = OpenedHandles->end(); I != E; ++I) {
+ FARPROC ptr = GetProcAddress((HMODULE)*I, symbolName);
+ if (ptr) {
+ return (void *)(intptr_t)ptr;
+ }
}
}
@@ -134,4 +150,14 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
return 0;
}
+
+void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+ if (!isValid())
+ return NULL;
+ if (Data == &OpenedHandles)
+ return SearchForAddressOfSymbol(symbolName);
+ return (void *)(intptr_t)GetProcAddress((HMODULE)Data, symbolName);
+}
+
+
}
diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc
index 9f69e73..fcc7283 100644
--- a/lib/Support/Windows/Memory.inc
+++ b/lib/Support/Windows/Memory.inc
@@ -32,11 +32,16 @@ MemoryBlock Memory::AllocateRWX(size_t NumBytes,
static const size_t pageSize = Process::GetPageSize();
size_t NumPages = (NumBytes+pageSize-1)/pageSize;
- //FIXME: support NearBlock if ever needed on Win64.
+ PVOID start = NearBlock ? static_cast<unsigned char *>(NearBlock->base()) +
+ NearBlock->size() : NULL;
- void *pa = VirtualAlloc(NULL, NumPages*pageSize, MEM_COMMIT,
+ void *pa = VirtualAlloc(start, NumPages*pageSize, MEM_RESERVE | MEM_COMMIT,
PAGE_EXECUTE_READWRITE);
if (pa == NULL) {
+ if (NearBlock) {
+ // Try again without the NearBlock hint
+ return AllocateRWX(NumBytes, NULL, ErrMsg);
+ }
MakeErrMsg(ErrMsg, "Can't allocate RWX Memory: ");
return MemoryBlock();
}
@@ -54,20 +59,62 @@ bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
return false;
}
+static DWORD getProtection(const void *addr) {
+ MEMORY_BASIC_INFORMATION info;
+ if (sizeof(info) == ::VirtualQuery(addr, &info, sizeof(info))) {
+ return info.Protect;
+ }
+ return 0;
+}
+
bool Memory::setWritable(MemoryBlock &M, std::string *ErrMsg) {
+ if (!setRangeWritable(M.Address, M.Size)) {
+ return MakeErrMsg(ErrMsg, "Cannot set memory to writeable: ");
+ }
return true;
}
bool Memory::setExecutable(MemoryBlock &M, std::string *ErrMsg) {
- return false;
+ if (!setRangeExecutable(M.Address, M.Size)) {
+ return MakeErrMsg(ErrMsg, "Cannot set memory to executable: ");
+ }
+ return true;
}
bool Memory::setRangeWritable(const void *Addr, size_t Size) {
- return true;
+ DWORD prot = getProtection(Addr);
+ if (!prot)
+ return false;
+
+ if (prot == PAGE_EXECUTE || prot == PAGE_EXECUTE_READ) {
+ prot = PAGE_EXECUTE_READWRITE;
+ } else if (prot == PAGE_NOACCESS || prot == PAGE_READONLY) {
+ prot = PAGE_READWRITE;
+ }
+
+ DWORD oldProt;
+ sys::Memory::InvalidateInstructionCache(Addr, Size);
+ return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
+ == TRUE;
}
bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
- return false;
+ DWORD prot = getProtection(Addr);
+ if (!prot)
+ return false;
+
+ if (prot == PAGE_NOACCESS) {
+ prot = PAGE_EXECUTE;
+ } else if (prot == PAGE_READONLY) {
+ prot = PAGE_EXECUTE_READ;
+ } else if (prot == PAGE_READWRITE) {
+ prot = PAGE_EXECUTE_READWRITE;
+ }
+
+ DWORD oldProt;
+ sys::Memory::InvalidateInstructionCache(Addr, Size);
+ return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
+ == TRUE;
}
}
diff --git a/lib/Support/Windows/PathV2.inc b/lib/Support/Windows/PathV2.inc
index af71b73..bc597b2 100644
--- a/lib/Support/Windows/PathV2.inc
+++ b/lib/Support/Windows/PathV2.inc
@@ -445,13 +445,35 @@ error_code file_size(const Twine &path, uint64_t &result) {
return success;
}
+static bool isReservedName(StringRef path) {
+ // This list of reserved names comes from MSDN, at:
+ // http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx
+ static const char *sReservedNames[] = { "nul", "con", "prn", "aux",
+ "com1", "com2", "com3", "com4", "com5", "com6",
+ "com7", "com8", "com9", "lpt1", "lpt2", "lpt3",
+ "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9" };
+
+ // First, check to see if this is a device namespace, which always
+ // starts with \\.\, since device namespaces are not legal file paths.
+ if (path.startswith("\\\\.\\"))
+ return true;
+
+ // Then compare against the list of ancient reserved names
+ for (size_t i = 0; i < sizeof(sReservedNames) / sizeof(const char *); ++i) {
+ if (path.equals_lower(sReservedNames[i]))
+ return true;
+ }
+
+ // The path isn't what we consider reserved.
+ return false;
+}
+
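isReservedName() accepts any device-namespace path (the \\.\ prefix) and the bare legacy DOS device names, case-insensitively; names with extensions such as "con.txt" are not caught by the equals_lower() comparison. The individual checks it relies on, as a small sketch:

    #include "llvm/ADT/StringRef.h"
    #include <cassert>
    using namespace llvm;

    void expectations() {
      assert(StringRef("\\\\.\\C:").startswith("\\\\.\\")); // device namespace
      assert(StringRef("NUL").equals_lower("nul"));         // case-insensitive hit
      assert(!StringRef("con.txt").equals_lower("con"));    // extension defeats it
    }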
error_code status(const Twine &path, file_status &result) {
SmallString<128> path_storage;
SmallVector<wchar_t, 128> path_utf16;
StringRef path8 = path.toStringRef(path_storage);
- // FIXME: We should detect as many "special file name" as possible.
- if (path8.compare_lower("nul") == 0) {
+ if (isReservedName(path8)) {
result = file_status(file_type::character_file);
return success;
}
@@ -501,7 +523,8 @@ handle_status_error:
}
error_code unique_file(const Twine &model, int &result_fd,
- SmallVectorImpl<char> &result_path) {
+ SmallVectorImpl<char> &result_path,
+ bool makeAbsolute) {
// Use result_path as temp storage.
result_path.set_size(0);
StringRef m = model.toStringRef(result_path);
@@ -509,17 +532,19 @@ error_code unique_file(const Twine &model, int &result_fd,
SmallVector<wchar_t, 128> model_utf16;
if (error_code ec = UTF8ToUTF16(m, model_utf16)) return ec;
- // Make model absolute by prepending a temp directory if it's not already.
- bool absolute = path::is_absolute(m);
-
- if (!absolute) {
- SmallVector<wchar_t, 64> temp_dir;
- if (error_code ec = TempDir(temp_dir)) return ec;
- // Handle c: by removing it.
- if (model_utf16.size() > 2 && model_utf16[1] == L':') {
- model_utf16.erase(model_utf16.begin(), model_utf16.begin() + 2);
+ if (makeAbsolute) {
+ // Make model absolute by prepending a temp directory if it's not already.
+ bool absolute = path::is_absolute(m);
+
+ if (!absolute) {
+ SmallVector<wchar_t, 64> temp_dir;
+ if (error_code ec = TempDir(temp_dir)) return ec;
+ // Handle c: by removing it.
+ if (model_utf16.size() > 2 && model_utf16[1] == L':') {
+ model_utf16.erase(model_utf16.begin(), model_utf16.begin() + 2);
+ }
+ model_utf16.insert(model_utf16.begin(), temp_dir.begin(), temp_dir.end());
}
- model_utf16.insert(model_utf16.begin(), temp_dir.begin(), temp_dir.end());
}
// Replace '%' with random chars. From here on, DO NOT modify model. It may be
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 06a7f00..fe54eb1 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -15,6 +15,7 @@
#include <psapi.h>
#include <malloc.h>
#include <io.h>
+#include <direct.h>
#ifdef __MINGW32__
#if (HAVE_LIBPSAPI != 1)
@@ -219,4 +220,8 @@ const char *Process::ResetColor() {
return 0;
}
+void Process::SetWorkingDirectory(std::string Path) {
+ ::_chdir(Path.c_str());
+}
+
}
diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 471f8fa..26b9bba 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc
@@ -18,39 +18,115 @@
#include "Windows.h"
-// FIXME: Windows does not have reader-writer locks pre-Vista. If you want
-// real reader-writer locks, you a threads implementation for Windows.
-
namespace llvm {
using namespace sys;
+// Windows has slim reader/writer (SRW) lock support on Vista and higher, so we
+// will attempt to load the APIs. If they exist, we will use them, and
+// if not, we will fall back on critical sections. When we drop support
+// for XP, we can stop lazy-loading these APIs and just use them directly.
+#if defined(__MINGW32__)
+ // Taken from WinNT.h
+ typedef struct _RTL_SRWLOCK {
+ PVOID Ptr;
+ } RTL_SRWLOCK, *PRTL_SRWLOCK;
+
+ // Taken from WinBase.h
+ typedef RTL_SRWLOCK SRWLOCK, *PSRWLOCK;
+#endif
+
+static VOID (WINAPI *fpInitializeSRWLock)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpAcquireSRWLockExclusive)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpAcquireSRWLockShared)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpReleaseSRWLockExclusive)(PSRWLOCK lock) = NULL;
+static VOID (WINAPI *fpReleaseSRWLockShared)(PSRWLOCK lock) = NULL;
+
+static bool sHasSRW = false;
+
+static bool loadSRW() {
+ static bool sChecked = false;
+ if (!sChecked) {
+ sChecked = true;
+
+ HMODULE hLib = ::LoadLibrary(TEXT("Kernel32"));
+ if (hLib) {
+ fpInitializeSRWLock =
+ (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+ "InitializeSRWLock");
+ fpAcquireSRWLockExclusive =
+ (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+ "AcquireSRWLockExclusive");
+ fpAcquireSRWLockShared =
+ (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+ "AcquireSRWLockShared");
+ fpReleaseSRWLockExclusive =
+ (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+ "ReleaseSRWLockExclusive");
+ fpReleaseSRWLockShared =
+ (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
+ "ReleaseSRWLockShared");
+ ::FreeLibrary(hLib);
+
+ if (fpInitializeSRWLock != NULL) {
+ sHasSRW = true;
+ }
+ }
+ }
+ return sHasSRW;
+}
+
RWMutexImpl::RWMutexImpl() {
- data_ = calloc(1, sizeof(CRITICAL_SECTION));
- InitializeCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ if (loadSRW()) {
+ data_ = calloc(1, sizeof(SRWLOCK));
+ fpInitializeSRWLock(static_cast<PSRWLOCK>(data_));
+ } else {
+ data_ = calloc(1, sizeof(CRITICAL_SECTION));
+ InitializeCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ }
}
RWMutexImpl::~RWMutexImpl() {
- DeleteCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
- free(data_);
+ if (sHasSRW) {
+ // Nothing to do in the case of slim reader/writers
+ } else {
+ DeleteCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ free(data_);
+ }
}
bool RWMutexImpl::reader_acquire() {
- EnterCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ if (sHasSRW) {
+ fpAcquireSRWLockShared(static_cast<PSRWLOCK>(data_));
+ } else {
+ EnterCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ }
return true;
}
bool RWMutexImpl::reader_release() {
- LeaveCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ if (sHasSRW) {
+ fpReleaseSRWLockShared(static_cast<PSRWLOCK>(data_));
+ } else {
+ LeaveCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ }
return true;
}
bool RWMutexImpl::writer_acquire() {
- EnterCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ if (sHasSRW) {
+ fpAcquireSRWLockExclusive(static_cast<PSRWLOCK>(data_));
+ } else {
+ EnterCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ }
return true;
}
bool RWMutexImpl::writer_release() {
- LeaveCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ if (sHasSRW) {
+ fpReleaseSRWLockExclusive(static_cast<PSRWLOCK>(data_));
+ } else {
+ LeaveCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
+ }
return true;
}
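The loader above is the classic pattern for using Vista-only APIs while still running on XP: resolve the entry points once with GetProcAddress(), remember whether they all exist, and branch to a fallback when they do not. The same shape reduced to one optional API (Windows-only; CancelIoEx is just a convenient Vista-era example, not something this file uses):

    #include <windows.h>

    typedef BOOL (WINAPI *CancelIoExFn)(HANDLE, LPOVERLAPPED);
    static CancelIoExFn fpCancelIoEx = NULL;

    static bool haveCancelIoEx() {
      static bool Checked = false;
      if (!Checked) {
        Checked = true;
        // Kernel32 is mapped into every process already, so GetModuleHandle
        // suffices and no matching FreeLibrary is needed.
        if (HMODULE h = ::GetModuleHandle(TEXT("Kernel32")))
          fpCancelIoEx = (CancelIoExFn)::GetProcAddress(h, "CancelIoEx");
      }
      return fpCancelIoEx != NULL;
    }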
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index 14f3f21..0d4b8a2 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -23,14 +23,133 @@
#endif
#include <psapi.h>
-#ifdef __MINGW32__
+#ifdef _MSC_VER
+ #pragma comment(lib, "psapi.lib")
+ #pragma comment(lib, "dbghelp.lib")
+#elif __MINGW32__
#if ((HAVE_LIBIMAGEHLP != 1) || (HAVE_LIBPSAPI != 1))
#error "libimagehlp.a & libpsapi.a should be present"
#endif
-#else
- #pragma comment(lib, "psapi.lib")
- #pragma comment(lib, "dbghelp.lib")
-#endif
+  // The version of g++ that comes with MinGW does *not* properly understand
+  // the ll format specifier for printf. However, MinGW passes the format
+  // specifiers on to the MSVCRT entirely, and the CRT understands the ll
+  // specifier. So these warnings are spurious in this case. Since we compile
+  // with -Wall, the spurious warnings would otherwise be emitted, so we turn
+  // them off for just this file. However, MinGW also does not support push
+  // and pop for diagnostics, so we have to manually turn them back on at the
+  // end of the file.
+ #pragma GCC diagnostic ignored "-Wformat"
+ #pragma GCC diagnostic ignored "-Wformat-extra-args"
+
+ #if !defined(__MINGW64_VERSION_MAJOR)
+ // MinGW.org does not have updated support for the 64-bit versions of the
+ // DebugHlp APIs. So we will have to load them manually. The structures and
+ // method signatures were pulled from DbgHelp.h in the Windows Platform SDK,
+ // and adjusted for brevity.
+ typedef struct _IMAGEHLP_LINE64 {
+ DWORD SizeOfStruct;
+ PVOID Key;
+ DWORD LineNumber;
+ PCHAR FileName;
+ DWORD64 Address;
+ } IMAGEHLP_LINE64, *PIMAGEHLP_LINE64;
+
+ typedef struct _IMAGEHLP_SYMBOL64 {
+ DWORD SizeOfStruct;
+ DWORD64 Address;
+ DWORD Size;
+ DWORD Flags;
+ DWORD MaxNameLength;
+ CHAR Name[1];
+ } IMAGEHLP_SYMBOL64, *PIMAGEHLP_SYMBOL64;
+
+ typedef struct _tagADDRESS64 {
+ DWORD64 Offset;
+ WORD Segment;
+ ADDRESS_MODE Mode;
+ } ADDRESS64, *LPADDRESS64;
+
+ typedef struct _KDHELP64 {
+ DWORD64 Thread;
+ DWORD ThCallbackStack;
+ DWORD ThCallbackBStore;
+ DWORD NextCallback;
+ DWORD FramePointer;
+ DWORD64 KiCallUserMode;
+ DWORD64 KeUserCallbackDispatcher;
+ DWORD64 SystemRangeStart;
+ DWORD64 KiUserExceptionDispatcher;
+ DWORD64 StackBase;
+ DWORD64 StackLimit;
+ DWORD64 Reserved[5];
+ } KDHELP64, *PKDHELP64;
+
+ typedef struct _tagSTACKFRAME64 {
+ ADDRESS64 AddrPC;
+ ADDRESS64 AddrReturn;
+ ADDRESS64 AddrFrame;
+ ADDRESS64 AddrStack;
+ ADDRESS64 AddrBStore;
+ PVOID FuncTableEntry;
+ DWORD64 Params[4];
+ BOOL Far;
+ BOOL Virtual;
+ DWORD64 Reserved[3];
+ KDHELP64 KdHelp;
+ } STACKFRAME64, *LPSTACKFRAME64;
+
+typedef BOOL (__stdcall *PREAD_PROCESS_MEMORY_ROUTINE64)(HANDLE hProcess,
+ DWORD64 qwBaseAddress, PVOID lpBuffer, DWORD nSize,
+ LPDWORD lpNumberOfBytesRead);
+
+typedef PVOID (__stdcall *PFUNCTION_TABLE_ACCESS_ROUTINE64)( HANDLE ahProcess,
+ DWORD64 AddrBase);
+
+typedef DWORD64 (__stdcall *PGET_MODULE_BASE_ROUTINE64)(HANDLE hProcess,
+ DWORD64 Address);
+
+typedef DWORD64 (__stdcall *PTRANSLATE_ADDRESS_ROUTINE64)(HANDLE hProcess,
+ HANDLE hThread, LPADDRESS64 lpaddr);
+
+typedef BOOL (WINAPI *fpStackWalk64)(DWORD, HANDLE, HANDLE, LPSTACKFRAME64,
+ PVOID, PREAD_PROCESS_MEMORY_ROUTINE64,
+ PFUNCTION_TABLE_ACCESS_ROUTINE64,
+ PGET_MODULE_BASE_ROUTINE64,
+ PTRANSLATE_ADDRESS_ROUTINE64);
+static fpStackWalk64 StackWalk64;
+
+typedef DWORD64 (WINAPI *fpSymGetModuleBase64)(HANDLE, DWORD64);
+static fpSymGetModuleBase64 SymGetModuleBase64;
+
+typedef BOOL (WINAPI *fpSymGetSymFromAddr64)(HANDLE, DWORD64,
+ PDWORD64, PIMAGEHLP_SYMBOL64);
+static fpSymGetSymFromAddr64 SymGetSymFromAddr64;
+
+typedef BOOL (WINAPI *fpSymGetLineFromAddr64)(HANDLE, DWORD64,
+ PDWORD, PIMAGEHLP_LINE64);
+static fpSymGetLineFromAddr64 SymGetLineFromAddr64;
+
+typedef PVOID (WINAPI *fpSymFunctionTableAccess64)(HANDLE, DWORD64);
+static fpSymFunctionTableAccess64 SymFunctionTableAccess64;
+
+static bool load64BitDebugHelp(void) {
+ HMODULE hLib = ::LoadLibrary("Dbghelp.dll");
+ if (hLib) {
+ StackWalk64 = (fpStackWalk64)
+ ::GetProcAddress(hLib, "StackWalk64");
+ SymGetModuleBase64 = (fpSymGetModuleBase64)
+ ::GetProcAddress(hLib, "SymGetModuleBase64");
+ SymGetSymFromAddr64 = (fpSymGetSymFromAddr64)
+ ::GetProcAddress(hLib, "SymGetSymFromAddr64");
+ SymGetLineFromAddr64 = (fpSymGetLineFromAddr64)
+ ::GetProcAddress(hLib, "SymGetLineFromAddr64");
+ SymFunctionTableAccess64 = (fpSymFunctionTableAccess64)
+ ::GetProcAddress(hLib, "SymFunctionTableAccess64");
+ }
+ return StackWalk64 != NULL;
+}
+ #endif // !defined(__MINGW64_VERSION_MAJOR)
+#endif // __MINGW32__
// Forward declare.
static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep);
@@ -90,6 +209,18 @@ static int CRTReportHook(int ReportType, char *Message, int *Return) {
#endif
static void RegisterHandler() {
+#if __MINGW32__ && !defined(__MINGW64_VERSION_MAJOR)
+ // On MinGW.org, we need to load up the symbols explicitly, because the
+ // Win32 framework they include does not have support for the 64-bit
+ // versions of the APIs we need. If we cannot load up the APIs (which
+ // would be unexpected as they should exist on every version of Windows
+ // we support), we will bail out since there would be nothing to report.
+ if (!load64BitDebugHelp()) {
+ assert(false && "These APIs should always be available");
+ return;
+ }
+#endif
+
if (RegisteredUnhandledExceptionFilter) {
EnterCriticalSection(&CriticalSection);
return;
@@ -213,20 +344,28 @@ void llvm::sys::RunInterruptHandlers() {
static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
Cleanup();
-#ifdef _WIN64
- // TODO: provide a x64 friendly version of the following
-#else
-
// Initialize the STACKFRAME structure.
- STACKFRAME StackFrame;
+ STACKFRAME64 StackFrame;
memset(&StackFrame, 0, sizeof(StackFrame));
+ DWORD machineType;
+#if defined(_M_X64)
+ machineType = IMAGE_FILE_MACHINE_AMD64;
+ StackFrame.AddrPC.Offset = ep->ContextRecord->Rip;
+ StackFrame.AddrPC.Mode = AddrModeFlat;
+ StackFrame.AddrStack.Offset = ep->ContextRecord->Rsp;
+ StackFrame.AddrStack.Mode = AddrModeFlat;
+ StackFrame.AddrFrame.Offset = ep->ContextRecord->Rbp;
+ StackFrame.AddrFrame.Mode = AddrModeFlat;
+#elif defined(_M_IX86)
+ machineType = IMAGE_FILE_MACHINE_I386;
StackFrame.AddrPC.Offset = ep->ContextRecord->Eip;
StackFrame.AddrPC.Mode = AddrModeFlat;
StackFrame.AddrStack.Offset = ep->ContextRecord->Esp;
StackFrame.AddrStack.Mode = AddrModeFlat;
StackFrame.AddrFrame.Offset = ep->ContextRecord->Ebp;
StackFrame.AddrFrame.Mode = AddrModeFlat;
+#endif
HANDLE hProcess = GetCurrentProcess();
HANDLE hThread = GetCurrentThread();
@@ -236,9 +375,9 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
SymInitialize(hProcess, NULL, TRUE);
while (true) {
- if (!StackWalk(IMAGE_FILE_MACHINE_I386, hProcess, hThread, &StackFrame,
- ep->ContextRecord, NULL, SymFunctionTableAccess,
- SymGetModuleBase, NULL)) {
+ if (!StackWalk64(machineType, hProcess, hThread, &StackFrame,
+ ep->ContextRecord, NULL, SymFunctionTableAccess64,
+ SymGetModuleBase64, NULL)) {
break;
}
@@ -246,54 +385,66 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
break;
// Print the PC in hexadecimal.
- DWORD PC = StackFrame.AddrPC.Offset;
- fprintf(stderr, "%08lX", PC);
+ DWORD64 PC = StackFrame.AddrPC.Offset;
+#if defined(_M_X64)
+ fprintf(stderr, "0x%016llX", PC);
+#elif defined(_M_IX86)
+ fprintf(stderr, "0x%08lX", static_cast<DWORD>(PC));
+#endif
// Print the parameters. Assume there are four.
+#if defined(_M_X64)
+ fprintf(stderr, " (0x%016llX 0x%016llX 0x%016llX 0x%016llX)",
+ StackFrame.Params[0],
+ StackFrame.Params[1],
+ StackFrame.Params[2],
+ StackFrame.Params[3]);
+#elif defined(_M_IX86)
fprintf(stderr, " (0x%08lX 0x%08lX 0x%08lX 0x%08lX)",
- StackFrame.Params[0],
- StackFrame.Params[1], StackFrame.Params[2], StackFrame.Params[3]);
-
+ static_cast<DWORD>(StackFrame.Params[0]),
+ static_cast<DWORD>(StackFrame.Params[1]),
+ static_cast<DWORD>(StackFrame.Params[2]),
+ static_cast<DWORD>(StackFrame.Params[3]));
+#endif
// Verify the PC belongs to a module in this process.
- if (!SymGetModuleBase(hProcess, PC)) {
+ if (!SymGetModuleBase64(hProcess, PC)) {
fputs(" <unknown module>\n", stderr);
continue;
}
// Print the symbol name.
char buffer[512];
- IMAGEHLP_SYMBOL *symbol = reinterpret_cast<IMAGEHLP_SYMBOL *>(buffer);
- memset(symbol, 0, sizeof(IMAGEHLP_SYMBOL));
- symbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL);
- symbol->MaxNameLength = 512 - sizeof(IMAGEHLP_SYMBOL);
+ IMAGEHLP_SYMBOL64 *symbol = reinterpret_cast<IMAGEHLP_SYMBOL64 *>(buffer);
+ memset(symbol, 0, sizeof(IMAGEHLP_SYMBOL64));
+ symbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL64);
+ symbol->MaxNameLength = 512 - sizeof(IMAGEHLP_SYMBOL64);
- DWORD dwDisp;
- if (!SymGetSymFromAddr(hProcess, PC, &dwDisp, symbol)) {
+ DWORD64 dwDisp;
+ if (!SymGetSymFromAddr64(hProcess, PC, &dwDisp, symbol)) {
fputc('\n', stderr);
continue;
}
buffer[511] = 0;
if (dwDisp > 0)
- fprintf(stderr, ", %s()+%04lu bytes(s)", symbol->Name, dwDisp);
+ fprintf(stderr, ", %s() + 0x%llX bytes(s)", symbol->Name, dwDisp);
else
fprintf(stderr, ", %s", symbol->Name);
// Print the source file and line number information.
- IMAGEHLP_LINE line;
+ IMAGEHLP_LINE64 line;
+ DWORD dwLineDisp;
memset(&line, 0, sizeof(line));
line.SizeOfStruct = sizeof(line);
- if (SymGetLineFromAddr(hProcess, PC, &dwDisp, &line)) {
+ if (SymGetLineFromAddr64(hProcess, PC, &dwLineDisp, &line)) {
fprintf(stderr, ", %s, line %lu", line.FileName, line.LineNumber);
- if (dwDisp > 0)
- fprintf(stderr, "+%04lu byte(s)", dwDisp);
+ if (dwLineDisp > 0)
+ fprintf(stderr, " + 0x%lX byte(s)", dwLineDisp);
}
fputc('\n', stderr);
}
-#endif
-
if (ExitOnUnhandledExceptions)
_exit(-3);
@@ -326,3 +477,12 @@ static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType) {
LeaveCriticalSection(&CriticalSection);
return FALSE;
}
+
+#if __MINGW32__
+ // We turned these warnings off for this file so that MinGW-g++ doesn't
+ // complain about the ll format specifiers used. Now we are turning the
+ // warnings back on. If MinGW starts to support diagnostic stacks, we can
+ // replace this with a pop.
+ #pragma GCC diagnostic warning "-Wformat"
+ #pragma GCC diagnostic warning "-Wformat-extra-args"
+#endif
diff --git a/lib/Support/Windows/Windows.h b/lib/Support/Windows/Windows.h
index 4a1553b..67b6f01 100644
--- a/lib/Support/Windows/Windows.h
+++ b/lib/Support/Windows/Windows.h
@@ -19,9 +19,9 @@
// mingw-w64 tends to define it as 0x0502 in its headers.
#undef _WIN32_WINNT
-// Require at least Windows 2000 API.
-#define _WIN32_WINNT 0x0500
-#define _WIN32_IE 0x0500 // MinGW at it again.
+// Require at least the Windows XP (5.1) API.
+#define _WIN32_WINNT 0x0501
+#define _WIN32_IE 0x0600 // MinGW at it again.
#define WIN32_LEAN_AND_MEAN
#include "llvm/Config/config.h" // Get build system configuration settings
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index 5a71fa3..4927e9a 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -84,7 +84,7 @@ void raw_ostream::SetBuffered() {
}
void raw_ostream::SetBufferAndMode(char *BufferStart, size_t Size,
- BufferKind Mode) {
+ BufferKind Mode) {
assert(((Mode == Unbuffered && BufferStart == 0 && Size == 0) ||
(Mode != Unbuffered && BufferStart && Size)) &&
"stream must be unbuffered or have at least one byte");
@@ -121,7 +121,8 @@ raw_ostream &raw_ostream::operator<<(unsigned long N) {
raw_ostream &raw_ostream::operator<<(long N) {
if (N < 0) {
*this << '-';
- N = -N;
+ // Avoid undefined behavior on LONG_MIN with a cast.
+ N = -(unsigned long)N;
}
return this->operator<<(static_cast<unsigned long>(N));
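Negating LONG_MIN directly is signed overflow, hence undefined behavior; converting to unsigned first makes the negation well-defined (modulo 2^N) and produces the right magnitude for every input. The idiom in isolation:

    // Well-defined for every long, including LONG_MIN: unsigned negation
    // wraps modulo 2^N instead of overflowing.
    unsigned long magnitude(long N) {
      return N < 0 ? -(unsigned long)N : (unsigned long)N;
    }
    // e.g. magnitude(-42) == 42UL, and magnitude(LONG_MIN) is exact.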
@@ -284,7 +285,7 @@ raw_ostream &raw_ostream::write(unsigned char C) {
raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
// Group exceptional cases into a single branch.
- if (BUILTIN_EXPECT(OutBufCur+Size > OutBufEnd, false)) {
+ if (BUILTIN_EXPECT(size_t(OutBufEnd - OutBufCur) < Size, false)) {
if (BUILTIN_EXPECT(!OutBufStart, false)) {
if (BufferMode == Unbuffered) {
write_impl(Ptr, Size);
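The rewritten condition compares the remaining buffer space against Size rather than computing OutBufCur + Size: subtracting two pointers into the same buffer is always well-defined, while the addition can wrap for a huge Size and make the overflow check itself pass. The guard in isolation, with the same roles as the members above:

    #include <cstddef>

    // 'End - Cur' is a valid pointer difference; 'Cur + Size' may wrap when
    // Size is enormous, so compare the space that is actually left.
    static inline bool fitsInBuffer(const char *Cur, const char *End,
                                    size_t Size) {
      return static_cast<size_t>(End - Cur) >= Size;
    }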
diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt
new file mode 100644
index 0000000..0db4134
--- /dev/null
+++ b/lib/TableGen/CMakeLists.txt
@@ -0,0 +1,16 @@
+## FIXME: This only requires RTTI because tblgen uses it. Fix that.
+set(LLVM_REQUIRES_RTTI 1)
+set(LLVM_REQUIRES_EH 1)
+
+add_llvm_library(LLVMTableGen
+ Error.cpp
+ Main.cpp
+ Record.cpp
+ TableGenBackend.cpp
+ TGLexer.cpp
+ TGParser.cpp
+ )
+
+add_llvm_library_dependencies(LLVMTableGen
+ LLVMSupport
+ )
diff --git a/lib/TableGen/Error.cpp b/lib/TableGen/Error.cpp
new file mode 100644
index 0000000..5b2cbbf
--- /dev/null
+++ b/lib/TableGen/Error.cpp
@@ -0,0 +1,39 @@
+//===- Error.cpp - tblgen error handling helper routines --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains error handling helper routines to pretty-print diagnostic
+// messages from tblgen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/Error.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+SourceMgr SrcMgr;
+
+void PrintError(SMLoc ErrorLoc, const Twine &Msg) {
+ SrcMgr.PrintMessage(ErrorLoc, Msg, "error");
+}
+
+void PrintError(const char *Loc, const Twine &Msg) {
+ SrcMgr.PrintMessage(SMLoc::getFromPointer(Loc), Msg, "error");
+}
+
+void PrintError(const Twine &Msg) {
+ errs() << "error:" << Msg << "\n";
+}
+
+void PrintError(const TGError &Error) {
+ PrintError(Error.getLoc(), Error.getMessage());
+}
+
+} // end namespace llvm
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
new file mode 100644
index 0000000..01bc55e
--- /dev/null
+++ b/lib/TableGen/Main.cpp
@@ -0,0 +1,124 @@
+//===- Main.cpp - Top-Level TableGen implementation -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen is a tool which can be used to build up a description of something,
+// then invoke one or more "tablegen backends" to emit information about the
+// description in some predefined format. In practice, this is used by the LLVM
+// code generators to automate generation of a code generator through a
+// high-level description of the target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TGParser.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/TableGenAction.h"
+#include <algorithm>
+#include <cstdio>
+using namespace llvm;
+
+namespace {
+ cl::opt<std::string>
+ OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename"),
+ cl::init("-"));
+
+ cl::opt<std::string>
+ DependFilename("d", cl::desc("Dependency filename"), cl::value_desc("filename"),
+ cl::init(""));
+
+ cl::opt<std::string>
+ InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));
+
+ cl::list<std::string>
+ IncludeDirs("I", cl::desc("Directory of include files"),
+ cl::value_desc("directory"), cl::Prefix);
+}
+
+namespace llvm {
+
+int TableGenMain(char *argv0, TableGenAction &Action) {
+ RecordKeeper Records;
+
+ try {
+ // Parse the input file.
+ OwningPtr<MemoryBuffer> File;
+ if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
+ errs() << "Could not open input file '" << InputFilename << "': "
+           << ec.message() << "\n";
+ return 1;
+ }
+ MemoryBuffer *F = File.take();
+
+ // Tell SrcMgr about this buffer, which is what TGParser will pick up.
+ SrcMgr.AddNewSourceBuffer(F, SMLoc());
+
+ // Record the location of the include directory so that the lexer can find
+ // it later.
+ SrcMgr.setIncludeDirs(IncludeDirs);
+
+ TGParser Parser(SrcMgr, Records);
+
+ if (Parser.ParseFile())
+ return 1;
+
+ std::string Error;
+ tool_output_file Out(OutputFilename.c_str(), Error);
+ if (!Error.empty()) {
+ errs() << argv0 << ": error opening " << OutputFilename
+ << ":" << Error << "\n";
+ return 1;
+ }
+ if (!DependFilename.empty()) {
+ if (OutputFilename == "-") {
+ errs() << argv0 << ": the option -d must be used together with -o\n";
+ return 1;
+ }
+ tool_output_file DepOut(DependFilename.c_str(), Error);
+ if (!Error.empty()) {
+ errs() << argv0 << ": error opening " << DependFilename
+ << ":" << Error << "\n";
+ return 1;
+ }
+ DepOut.os() << OutputFilename << ":";
+ const std::vector<std::string> &Dependencies = Parser.getDependencies();
+ for (std::vector<std::string>::const_iterator I = Dependencies.begin(),
+ E = Dependencies.end();
+ I != E; ++I) {
+ DepOut.os() << " " << (*I);
+ }
+ DepOut.os() << "\n";
+ DepOut.keep();
+ }
+
+ if (Action(Out.os(), Records))
+ return 1;
+
+ // Declare success.
+ Out.keep();
+ return 0;
+
+ } catch (const TGError &Error) {
+ PrintError(Error);
+ } catch (const std::string &Error) {
+ PrintError(Error);
+ } catch (const char *Error) {
+ PrintError(Error);
+ } catch (...) {
+    errs() << argv0 << ": Unknown exception occurred.\n";
+ }
+
+ return 1;
+}
+
+}
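+
+// Editor's sketch (illustrative, not part of the vendor import): a minimal
+// driver built on TableGenMain. The name PrintRecordsAction is hypothetical;
+// TableGenAction is the real extension point this file expects.
+#if 0
+struct PrintRecordsAction : public TableGenAction {
+  bool operator()(raw_ostream &OS, RecordKeeper &Records) {
+    OS << Records;  // Dump every parsed class and def.
+    return false;   // Returning false signals success to TableGenMain.
+  }
+};
+
+int main(int argc, char **argv) {
+  cl::ParseCommandLineOptions(argc, argv);
+  PrintRecordsAction Action;
+  return TableGenMain(argv[0], Action);
+}
+#endif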
diff --git a/lib/TableGen/Makefile b/lib/TableGen/Makefile
new file mode 100644
index 0000000..4472438
--- /dev/null
+++ b/lib/TableGen/Makefile
@@ -0,0 +1,18 @@
+##===- lib/TableGen/Makefile -------------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMTableGen
+BUILD_ARCHIVE = 1
+
+## FIXME: This only requires RTTI because tblgen uses it. Fix that.
+REQUIRES_RTTI = 1
+REQUIRES_EH = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
new file mode 100644
index 0000000..b7c51ca
--- /dev/null
+++ b/lib/TableGen/Record.cpp
@@ -0,0 +1,2019 @@
+//===- Record.cpp - Record implementation ---------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the tablegen record classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/Record.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// std::string wrapper for DenseMap purposes
+//===----------------------------------------------------------------------===//
+
+/// TableGenStringKey - This is a wrapper for std::string suitable for
+/// using as a key to a DenseMap. Because there isn't a particularly
+/// good way to indicate tombstone or empty keys for strings, we want
+/// to wrap std::string to indicate that this is a "special" string
+/// not expected to take on certain values (those of the tombstone and
+/// empty keys). This makes things a little safer as it clarifies
+/// that DenseMap is really not appropriate for general strings.
+
+class TableGenStringKey {
+public:
+ TableGenStringKey(const std::string &str) : data(str) {}
+ TableGenStringKey(const char *str) : data(str) {}
+
+ const std::string &str() const { return data; }
+
+private:
+ std::string data;
+};
+
+/// Specialize DenseMapInfo for TableGenStringKey.
+namespace llvm {
+
+template<> struct DenseMapInfo<TableGenStringKey> {
+ static inline TableGenStringKey getEmptyKey() {
+ TableGenStringKey Empty("<<<EMPTY KEY>>>");
+ return Empty;
+ }
+ static inline TableGenStringKey getTombstoneKey() {
+ TableGenStringKey Tombstone("<<<TOMBSTONE KEY>>>");
+ return Tombstone;
+ }
+ static unsigned getHashValue(const TableGenStringKey& Val) {
+ return HashString(Val.str());
+ }
+ static bool isEqual(const TableGenStringKey& LHS,
+ const TableGenStringKey& RHS) {
+ return LHS.str() == RHS.str();
+ }
+};
+
+}
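+
+// Editor's note (illustrative): with this specialization, a DenseMap keyed
+// by strings is safe to use, e.g.:
+//   DenseMap<TableGenStringKey, int> Uses;
+//   ++Uses[TableGenStringKey("NAME")];
+// A plain DenseMap<std::string, int> would have no reserved empty/tombstone
+// values, which is exactly why the wrapper exists.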
+
+//===----------------------------------------------------------------------===//
+// Type implementations
+//===----------------------------------------------------------------------===//
+
+BitRecTy BitRecTy::Shared;
+IntRecTy IntRecTy::Shared;
+StringRecTy StringRecTy::Shared;
+CodeRecTy CodeRecTy::Shared;
+DagRecTy DagRecTy::Shared;
+
+void RecTy::dump() const { print(errs()); }
+
+ListRecTy *RecTy::getListTy() {
+ if (!ListTy)
+ ListTy = new ListRecTy(this);
+ return ListTy;
+}
+
+Init *BitRecTy::convertValue(BitsInit *BI) {
+ if (BI->getNumBits() != 1) return 0; // Only accept if just one bit!
+ return BI->getBit(0);
+}
+
+bool BitRecTy::baseClassOf(const BitsRecTy *RHS) const {
+ return RHS->getNumBits() == 1;
+}
+
+Init *BitRecTy::convertValue(IntInit *II) {
+ int64_t Val = II->getValue();
+ if (Val != 0 && Val != 1) return 0; // Only accept 0 or 1 for a bit!
+
+ return BitInit::get(Val != 0);
+}
+
+Init *BitRecTy::convertValue(TypedInit *VI) {
+ if (dynamic_cast<BitRecTy*>(VI->getType()))
+ return VI; // Accept variable if it is already of bit type!
+ return 0;
+}
+
+BitsRecTy *BitsRecTy::get(unsigned Sz) {
+ static std::vector<BitsRecTy*> Shared;
+ if (Sz >= Shared.size())
+ Shared.resize(Sz + 1);
+ BitsRecTy *&Ty = Shared[Sz];
+ if (!Ty)
+ Ty = new BitsRecTy(Sz);
+ return Ty;
+}
+
+std::string BitsRecTy::getAsString() const {
+ return "bits<" + utostr(Size) + ">";
+}
+
+Init *BitsRecTy::convertValue(UnsetInit *UI) {
+ SmallVector<Init *, 16> NewBits(Size);
+
+ for (unsigned i = 0; i != Size; ++i)
+ NewBits[i] = UnsetInit::get();
+
+ return BitsInit::get(NewBits);
+}
+
+Init *BitsRecTy::convertValue(BitInit *UI) {
+ if (Size != 1) return 0; // Can only convert single bit.
+ return BitsInit::get(UI);
+}
+
+/// canFitInBitfield - Return true if the number of bits is large enough to hold
+/// the integer value.
+static bool canFitInBitfield(int64_t Value, unsigned NumBits) {
+  // For example, with NumBits == 4, we permit Values from [-8 .. 15].
+ return (NumBits >= sizeof(Value) * 8) ||
+ (Value >> NumBits == 0) || (Value >> (NumBits-1) == -1);
+}
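+
+// Editor's worked values (assuming arithmetic right shift):
+//   canFitInBitfield(15, 4) and canFitInBitfield(-8, 4) hold, while
+//   canFitInBitfield(16, 4) and canFitInBitfield(-9, 4) do not.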
+
+/// convertValue from Int initializer to bits type: Split the integer up into the
+/// appropriate bits.
+///
+Init *BitsRecTy::convertValue(IntInit *II) {
+ int64_t Value = II->getValue();
+ // Make sure this bitfield is large enough to hold the integer value.
+ if (!canFitInBitfield(Value, Size))
+ return 0;
+
+ SmallVector<Init *, 16> NewBits(Size);
+
+ for (unsigned i = 0; i != Size; ++i)
+ NewBits[i] = BitInit::get(Value & (1LL << i));
+
+ return BitsInit::get(NewBits);
+}
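+
+// Editor's example: converting IntInit 5 to bits<4> yields bit 0 = 1,
+// bit 1 = 0, bit 2 = 1, bit 3 = 0, which prints most-significant bit
+// first as { 0, 1, 0, 1 }.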
+
+Init *BitsRecTy::convertValue(BitsInit *BI) {
+ // If the number of bits is right, return it. Otherwise we need to expand or
+ // truncate.
+ if (BI->getNumBits() == Size) return BI;
+ return 0;
+}
+
+Init *BitsRecTy::convertValue(TypedInit *VI) {
+ if (BitsRecTy *BRT = dynamic_cast<BitsRecTy*>(VI->getType()))
+ if (BRT->Size == Size) {
+ SmallVector<Init *, 16> NewBits(Size);
+
+ for (unsigned i = 0; i != Size; ++i)
+ NewBits[i] = VarBitInit::get(VI, i);
+ return BitsInit::get(NewBits);
+ }
+
+ if (Size == 1 && dynamic_cast<BitRecTy*>(VI->getType()))
+ return BitsInit::get(VI);
+
+ if (TernOpInit *Tern = dynamic_cast<TernOpInit*>(VI)) {
+ if (Tern->getOpcode() == TernOpInit::IF) {
+ Init *LHS = Tern->getLHS();
+ Init *MHS = Tern->getMHS();
+ Init *RHS = Tern->getRHS();
+
+ IntInit *MHSi = dynamic_cast<IntInit*>(MHS);
+ IntInit *RHSi = dynamic_cast<IntInit*>(RHS);
+
+ if (MHSi && RHSi) {
+ int64_t MHSVal = MHSi->getValue();
+ int64_t RHSVal = RHSi->getValue();
+
+ if (canFitInBitfield(MHSVal, Size) && canFitInBitfield(RHSVal, Size)) {
+ SmallVector<Init *, 16> NewBits(Size);
+
+ for (unsigned i = 0; i != Size; ++i)
+ NewBits[i] =
+ TernOpInit::get(TernOpInit::IF, LHS,
+ IntInit::get((MHSVal & (1LL << i)) ? 1 : 0),
+ IntInit::get((RHSVal & (1LL << i)) ? 1 : 0),
+ VI->getType());
+
+ return BitsInit::get(NewBits);
+ }
+ } else {
+ BitsInit *MHSbs = dynamic_cast<BitsInit*>(MHS);
+ BitsInit *RHSbs = dynamic_cast<BitsInit*>(RHS);
+
+ if (MHSbs && RHSbs) {
+ SmallVector<Init *, 16> NewBits(Size);
+
+ for (unsigned i = 0; i != Size; ++i)
+ NewBits[i] = TernOpInit::get(TernOpInit::IF, LHS,
+ MHSbs->getBit(i),
+ RHSbs->getBit(i),
+ VI->getType());
+
+ return BitsInit::get(NewBits);
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+Init *IntRecTy::convertValue(BitInit *BI) {
+ return IntInit::get(BI->getValue());
+}
+
+Init *IntRecTy::convertValue(BitsInit *BI) {
+ int64_t Result = 0;
+ for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i)
+ if (BitInit *Bit = dynamic_cast<BitInit*>(BI->getBit(i))) {
+ Result |= Bit->getValue() << i;
+ } else {
+ return 0;
+ }
+ return IntInit::get(Result);
+}
+
+Init *IntRecTy::convertValue(TypedInit *TI) {
+ if (TI->getType()->typeIsConvertibleTo(this))
+ return TI; // Accept variable if already of the right type!
+ return 0;
+}
+
+Init *StringRecTy::convertValue(UnOpInit *BO) {
+ if (BO->getOpcode() == UnOpInit::CAST) {
+ Init *L = BO->getOperand()->convertInitializerTo(this);
+ if (L == 0) return 0;
+ if (L != BO->getOperand())
+      return UnOpInit::get(UnOpInit::CAST, L, StringRecTy::get());
+ return BO;
+ }
+
+ return convertValue((TypedInit*)BO);
+}
+
+Init *StringRecTy::convertValue(BinOpInit *BO) {
+ if (BO->getOpcode() == BinOpInit::STRCONCAT) {
+ Init *L = BO->getLHS()->convertInitializerTo(this);
+ Init *R = BO->getRHS()->convertInitializerTo(this);
+ if (L == 0 || R == 0) return 0;
+ if (L != BO->getLHS() || R != BO->getRHS())
+      return BinOpInit::get(BinOpInit::STRCONCAT, L, R, StringRecTy::get());
+ return BO;
+ }
+
+ return convertValue((TypedInit*)BO);
+}
+
+
+Init *StringRecTy::convertValue(TypedInit *TI) {
+ if (dynamic_cast<StringRecTy*>(TI->getType()))
+ return TI; // Accept variable if already of the right type!
+ return 0;
+}
+
+std::string ListRecTy::getAsString() const {
+ return "list<" + Ty->getAsString() + ">";
+}
+
+Init *ListRecTy::convertValue(ListInit *LI) {
+ std::vector<Init*> Elements;
+
+ // Verify that all of the elements of the list are subclasses of the
+ // appropriate class!
+ for (unsigned i = 0, e = LI->getSize(); i != e; ++i)
+ if (Init *CI = LI->getElement(i)->convertInitializerTo(Ty))
+ Elements.push_back(CI);
+ else
+ return 0;
+
+ ListRecTy *LType = dynamic_cast<ListRecTy*>(LI->getType());
+ if (LType == 0) {
+ return 0;
+ }
+
+ return ListInit::get(Elements, this);
+}
+
+Init *ListRecTy::convertValue(TypedInit *TI) {
+ // Ensure that TI is compatible with our class.
+ if (ListRecTy *LRT = dynamic_cast<ListRecTy*>(TI->getType()))
+ if (LRT->getElementType()->typeIsConvertibleTo(getElementType()))
+ return TI;
+ return 0;
+}
+
+Init *CodeRecTy::convertValue(TypedInit *TI) {
+ if (TI->getType()->typeIsConvertibleTo(this))
+ return TI;
+ return 0;
+}
+
+Init *DagRecTy::convertValue(TypedInit *TI) {
+ if (TI->getType()->typeIsConvertibleTo(this))
+ return TI;
+ return 0;
+}
+
+Init *DagRecTy::convertValue(UnOpInit *BO) {
+ if (BO->getOpcode() == UnOpInit::CAST) {
+ Init *L = BO->getOperand()->convertInitializerTo(this);
+ if (L == 0) return 0;
+ if (L != BO->getOperand())
+      return UnOpInit::get(UnOpInit::CAST, L, DagRecTy::get());
+ return BO;
+ }
+ return 0;
+}
+
+Init *DagRecTy::convertValue(BinOpInit *BO) {
+ if (BO->getOpcode() == BinOpInit::CONCAT) {
+ Init *L = BO->getLHS()->convertInitializerTo(this);
+ Init *R = BO->getRHS()->convertInitializerTo(this);
+ if (L == 0 || R == 0) return 0;
+ if (L != BO->getLHS() || R != BO->getRHS())
+      return BinOpInit::get(BinOpInit::CONCAT, L, R, DagRecTy::get());
+ return BO;
+ }
+ return 0;
+}
+
+RecordRecTy *RecordRecTy::get(Record *R) {
+ return &dynamic_cast<RecordRecTy&>(*R->getDefInit()->getType());
+}
+
+std::string RecordRecTy::getAsString() const {
+ return Rec->getName();
+}
+
+Init *RecordRecTy::convertValue(DefInit *DI) {
+ // Ensure that DI is a subclass of Rec.
+ if (!DI->getDef()->isSubClassOf(Rec))
+ return 0;
+ return DI;
+}
+
+Init *RecordRecTy::convertValue(TypedInit *TI) {
+ // Ensure that TI is compatible with Rec.
+ if (RecordRecTy *RRT = dynamic_cast<RecordRecTy*>(TI->getType()))
+ if (RRT->getRecord()->isSubClassOf(getRecord()) ||
+ RRT->getRecord() == getRecord())
+ return TI;
+ return 0;
+}
+
+bool RecordRecTy::baseClassOf(const RecordRecTy *RHS) const {
+ if (Rec == RHS->getRecord() || RHS->getRecord()->isSubClassOf(Rec))
+ return true;
+
+ const std::vector<Record*> &SC = Rec->getSuperClasses();
+ for (unsigned i = 0, e = SC.size(); i != e; ++i)
+ if (RHS->getRecord()->isSubClassOf(SC[i]))
+ return true;
+
+ return false;
+}
+
+
+/// resolveTypes - Find a common type that T1 and T2 convert to.
+/// Return 0 if no such type exists.
+///
+RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
+ if (!T1->typeIsConvertibleTo(T2)) {
+ if (!T2->typeIsConvertibleTo(T1)) {
+ // If one is a Record type, check superclasses
+ RecordRecTy *RecTy1 = dynamic_cast<RecordRecTy*>(T1);
+ if (RecTy1) {
+ // See if T2 inherits from a type T1 also inherits from
+ const std::vector<Record *> &T1SuperClasses =
+ RecTy1->getRecord()->getSuperClasses();
+        for (std::vector<Record *>::const_iterator i = T1SuperClasses.begin(),
+ iend = T1SuperClasses.end();
+ i != iend;
+ ++i) {
+ RecordRecTy *SuperRecTy1 = RecordRecTy::get(*i);
+ RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
+ if (NewType1 != 0) {
+ if (NewType1 != SuperRecTy1) {
+ delete SuperRecTy1;
+ }
+ return NewType1;
+ }
+ }
+ }
+ RecordRecTy *RecTy2 = dynamic_cast<RecordRecTy*>(T2);
+ if (RecTy2) {
+ // See if T1 inherits from a type T2 also inherits from
+ const std::vector<Record *> &T2SuperClasses =
+ RecTy2->getRecord()->getSuperClasses();
+ for (std::vector<Record *>::const_iterator i = T2SuperClasses.begin(),
+ iend = T2SuperClasses.end();
+ i != iend;
+ ++i) {
+ RecordRecTy *SuperRecTy2 = RecordRecTy::get(*i);
+ RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
+ if (NewType2 != 0) {
+ if (NewType2 != SuperRecTy2) {
+ delete SuperRecTy2;
+ }
+ return NewType2;
+ }
+ }
+ }
+ return 0;
+ }
+ return T2;
+ }
+ return T1;
+}
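+
+// Editor's example: given hypothetical records "class A; class B : A;",
+// resolveTypes on the record types of B and A yields A's type, since B
+// converts to its superclass; two unrelated record types resolve to 0.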
+
+
+//===----------------------------------------------------------------------===//
+// Initializer implementations
+//===----------------------------------------------------------------------===//
+
+void Init::dump() const { return print(errs()); }
+
+UnsetInit *UnsetInit::get() {
+ static UnsetInit TheInit;
+ return &TheInit;
+}
+
+BitInit *BitInit::get(bool V) {
+ static BitInit True(true);
+ static BitInit False(false);
+
+ return V ? &True : &False;
+}
+
+static void
+ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef<Init *> Range) {
+ ID.AddInteger(Range.size());
+
+ for (ArrayRef<Init *>::iterator i = Range.begin(),
+ iend = Range.end();
+ i != iend;
+ ++i)
+ ID.AddPointer(*i);
+}
+
+BitsInit *BitsInit::get(ArrayRef<Init *> Range) {
+ typedef FoldingSet<BitsInit> Pool;
+ static Pool ThePool;
+
+ FoldingSetNodeID ID;
+ ProfileBitsInit(ID, Range);
+
+ void *IP = 0;
+ if (BitsInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ BitsInit *I = new BitsInit(Range);
+ ThePool.InsertNode(I, IP);
+
+ return I;
+}
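+
+// Editor's note: because inits are uniqued through the pool above, two
+// structurally identical BitsInits share one object, so pointer equality
+// (BitsInit::get(Bits) == BitsInit::get(Bits)) doubles as value equality.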
+
+void BitsInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileBitsInit(ID, Bits);
+}
+
+Init *
+BitsInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
+ SmallVector<Init *, 16> NewBits(Bits.size());
+
+ for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+ if (Bits[i] >= getNumBits())
+ return 0;
+ NewBits[i] = getBit(Bits[i]);
+ }
+ return BitsInit::get(NewBits);
+}
+
+std::string BitsInit::getAsString() const {
+ std::string Result = "{ ";
+ for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
+ if (i) Result += ", ";
+ if (Init *Bit = getBit(e-i-1))
+ Result += Bit->getAsString();
+ else
+ Result += "*";
+ }
+ return Result + " }";
+}
+
+// resolveReferences - If there are any field references that refer to fields
+// that have been filled in, we can propagate the values now.
+//
+Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ bool Changed = false;
+ SmallVector<Init *, 16> NewBits(getNumBits());
+
+ for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+ Init *B;
+ Init *CurBit = getBit(i);
+
+ do {
+ B = CurBit;
+ CurBit = CurBit->resolveReferences(R, RV);
+ Changed |= B != CurBit;
+ } while (B != CurBit);
+ NewBits[i] = CurBit;
+ }
+
+ if (Changed)
+ return BitsInit::get(NewBits);
+
+ return const_cast<BitsInit *>(this);
+}
+
+IntInit *IntInit::get(int64_t V) {
+ typedef DenseMap<int64_t, IntInit *> Pool;
+ static Pool ThePool;
+
+ IntInit *&I = ThePool[V];
+ if (!I) I = new IntInit(V);
+ return I;
+}
+
+std::string IntInit::getAsString() const {
+ return itostr(Value);
+}
+
+Init *
+IntInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
+ SmallVector<Init *, 16> NewBits(Bits.size());
+
+ for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+ if (Bits[i] >= 64)
+ return 0;
+
+ NewBits[i] = BitInit::get(Value & (INT64_C(1) << Bits[i]));
+ }
+ return BitsInit::get(NewBits);
+}
+
+StringInit *StringInit::get(const std::string &V) {
+ typedef StringMap<StringInit *> Pool;
+ static Pool ThePool;
+
+ StringInit *&I = ThePool[V];
+ if (!I) I = new StringInit(V);
+ return I;
+}
+
+CodeInit *CodeInit::get(const std::string &V) {
+ typedef StringMap<CodeInit *> Pool;
+ static Pool ThePool;
+
+ CodeInit *&I = ThePool[V];
+ if (!I) I = new CodeInit(V);
+ return I;
+}
+
+static void ProfileListInit(FoldingSetNodeID &ID,
+ ArrayRef<Init *> Range,
+ RecTy *EltTy) {
+ ID.AddInteger(Range.size());
+ ID.AddPointer(EltTy);
+
+ for (ArrayRef<Init *>::iterator i = Range.begin(),
+ iend = Range.end();
+ i != iend;
+ ++i)
+ ID.AddPointer(*i);
+}
+
+ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
+ typedef FoldingSet<ListInit> Pool;
+ static Pool ThePool;
+
+  // Unique ListInits in the FoldingSet pool, using FoldingSetNodeID to
+  // profile and hash their contents.
+ FoldingSetNodeID ID;
+ ProfileListInit(ID, Range, EltTy);
+
+ void *IP = 0;
+ if (ListInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ ListInit *I = new ListInit(Range, EltTy);
+ ThePool.InsertNode(I, IP);
+ return I;
+}
+
+void ListInit::Profile(FoldingSetNodeID &ID) const {
+ ListRecTy *ListType = dynamic_cast<ListRecTy *>(getType());
+ assert(ListType && "Bad type for ListInit!");
+ RecTy *EltTy = ListType->getElementType();
+
+ ProfileListInit(ID, Values, EltTy);
+}
+
+Init *
+ListInit::convertInitListSlice(const std::vector<unsigned> &Elements) const {
+ std::vector<Init*> Vals;
+ for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+ if (Elements[i] >= getSize())
+ return 0;
+ Vals.push_back(getElement(Elements[i]));
+ }
+ return ListInit::get(Vals, getType());
+}
+
+Record *ListInit::getElementAsRecord(unsigned i) const {
+ assert(i < Values.size() && "List element index out of range!");
+ DefInit *DI = dynamic_cast<DefInit*>(Values[i]);
+ if (DI == 0) throw "Expected record in list!";
+ return DI->getDef();
+}
+
+Init *ListInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ std::vector<Init*> Resolved;
+ Resolved.reserve(getSize());
+ bool Changed = false;
+
+ for (unsigned i = 0, e = getSize(); i != e; ++i) {
+ Init *E;
+ Init *CurElt = getElement(i);
+
+ do {
+ E = CurElt;
+ CurElt = CurElt->resolveReferences(R, RV);
+ Changed |= E != CurElt;
+ } while (E != CurElt);
+ Resolved.push_back(E);
+ }
+
+ if (Changed)
+ return ListInit::get(Resolved, getType());
+ return const_cast<ListInit *>(this);
+}
+
+Init *ListInit::resolveListElementReference(Record &R, const RecordVal *IRV,
+ unsigned Elt) const {
+ if (Elt >= getSize())
+ return 0; // Out of range reference.
+ Init *E = getElement(Elt);
+ // If the element is set to some value, or if we are resolving a reference
+ // to a specific variable and that variable is explicitly unset, then
+ // replace the VarListElementInit with it.
+ if (IRV || !dynamic_cast<UnsetInit*>(E))
+ return E;
+ return 0;
+}
+
+std::string ListInit::getAsString() const {
+ std::string Result = "[";
+ for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+ if (i) Result += ", ";
+ Result += Values[i]->getAsString();
+ }
+ return Result + "]";
+}
+
+Init *OpInit::resolveBitReference(Record &R, const RecordVal *IRV,
+ unsigned Bit) const {
+ Init *Folded = Fold(&R, 0);
+
+ if (Folded != this) {
+ TypedInit *Typed = dynamic_cast<TypedInit *>(Folded);
+ if (Typed) {
+ return Typed->resolveBitReference(R, IRV, Bit);
+ }
+ }
+
+ return 0;
+}
+
+Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV,
+ unsigned Elt) const {
+ Init *Resolved = resolveReferences(R, IRV);
+ OpInit *OResolved = dynamic_cast<OpInit *>(Resolved);
+ if (OResolved) {
+ Resolved = OResolved->Fold(&R, 0);
+ }
+
+ if (Resolved != this) {
+ TypedInit *Typed = dynamic_cast<TypedInit *>(Resolved);
+ assert(Typed && "Expected typed init for list reference");
+ if (Typed) {
+ Init *New = Typed->resolveListElementReference(R, IRV, Elt);
+ if (New)
+ return New;
+ return VarListElementInit::get(Typed, Elt);
+ }
+ }
+
+ return 0;
+}
+
+UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
+ typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
+
+ typedef DenseMap<Key, UnOpInit *> Pool;
+ static Pool ThePool;
+
+ Key TheKey(std::make_pair(std::make_pair(opc, lhs), Type));
+
+ UnOpInit *&I = ThePool[TheKey];
+ if (!I) I = new UnOpInit(opc, lhs, Type);
+ return I;
+}
+
+Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+ switch (getOpcode()) {
+ default: assert(0 && "Unknown unop");
+ case CAST: {
+ if (getType()->getAsString() == "string") {
+ StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+ if (LHSs) {
+ return LHSs;
+ }
+
+ DefInit *LHSd = dynamic_cast<DefInit*>(LHS);
+ if (LHSd) {
+ return StringInit::get(LHSd->getDef()->getName());
+ }
+ } else {
+ StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+ if (LHSs) {
+ std::string Name = LHSs->getValue();
+
+ // From TGParser::ParseIDValue
+ if (CurRec) {
+ if (const RecordVal *RV = CurRec->getValue(Name)) {
+ if (RV->getType() != getType())
+ throw "type mismatch in cast";
+ return VarInit::get(Name, RV->getType());
+ }
+
+ std::string TemplateArgName = CurRec->getName()+":"+Name;
+ if (CurRec->isTemplateArg(TemplateArgName)) {
+ const RecordVal *RV = CurRec->getValue(TemplateArgName);
+ assert(RV && "Template arg doesn't exist??");
+
+ if (RV->getType() != getType())
+ throw "type mismatch in cast";
+
+ return VarInit::get(TemplateArgName, RV->getType());
+ }
+ }
+
+ if (CurMultiClass) {
+ std::string MCName = CurMultiClass->Rec.getName()+"::"+Name;
+ if (CurMultiClass->Rec.isTemplateArg(MCName)) {
+ const RecordVal *RV = CurMultiClass->Rec.getValue(MCName);
+ assert(RV && "Template arg doesn't exist??");
+
+ if (RV->getType() != getType())
+ throw "type mismatch in cast";
+
+ return VarInit::get(MCName, RV->getType());
+ }
+ }
+
+ if (Record *D = (CurRec->getRecords()).getDef(Name))
+ return DefInit::get(D);
+
+        throw TGError(CurRec->getLoc(), "Undefined reference: '" + Name + "'\n");
+ }
+ }
+ break;
+ }
+ case HEAD: {
+ ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
+ if (LHSl) {
+ if (LHSl->getSize() == 0) {
+ assert(0 && "Empty list in car");
+ return 0;
+ }
+ return LHSl->getElement(0);
+ }
+ break;
+ }
+ case TAIL: {
+ ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
+ if (LHSl) {
+ if (LHSl->getSize() == 0) {
+ assert(0 && "Empty list in cdr");
+ return 0;
+ }
+ // Note the +1. We can't just pass the result of getValues()
+ // directly.
+ ArrayRef<Init *>::iterator begin = LHSl->getValues().begin()+1;
+ ArrayRef<Init *>::iterator end = LHSl->getValues().end();
+ ListInit *Result =
+ ListInit::get(ArrayRef<Init *>(begin, end - begin),
+ LHSl->getType());
+ return Result;
+ }
+ break;
+ }
+ case EMPTY: {
+ ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
+ if (LHSl) {
+ if (LHSl->getSize() == 0) {
+ return IntInit::get(1);
+ } else {
+ return IntInit::get(0);
+ }
+ }
+ StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+ if (LHSs) {
+ if (LHSs->getValue().empty()) {
+ return IntInit::get(1);
+ } else {
+ return IntInit::get(0);
+ }
+ }
+
+ break;
+ }
+ }
+ return const_cast<UnOpInit *>(this);
+}
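+
+// Editor's examples of the folds above, in TableGen syntax:
+//   !head([1, 2, 3])  -->  1
+//   !tail([1, 2, 3])  -->  [2, 3]
+//   !empty([])        -->  1, and !empty("") --> 1
+//   !cast<string>(SomeDef) --> the def's name as a string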
+
+Init *UnOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ Init *lhs = LHS->resolveReferences(R, RV);
+
+ if (LHS != lhs)
+ return (UnOpInit::get(getOpcode(), lhs, getType()))->Fold(&R, 0);
+ return Fold(&R, 0);
+}
+
+std::string UnOpInit::getAsString() const {
+ std::string Result;
+ switch (Opc) {
+ case CAST: Result = "!cast<" + getType()->getAsString() + ">"; break;
+ case HEAD: Result = "!head"; break;
+ case TAIL: Result = "!tail"; break;
+ case EMPTY: Result = "!empty"; break;
+ }
+ return Result + "(" + LHS->getAsString() + ")";
+}
+
+BinOpInit *BinOpInit::get(BinaryOp opc, Init *lhs,
+ Init *rhs, RecTy *Type) {
+ typedef std::pair<
+ std::pair<std::pair<unsigned, Init *>, Init *>,
+ RecTy *
+ > Key;
+
+ typedef DenseMap<Key, BinOpInit *> Pool;
+ static Pool ThePool;
+
+ Key TheKey(std::make_pair(std::make_pair(std::make_pair(opc, lhs), rhs),
+ Type));
+
+ BinOpInit *&I = ThePool[TheKey];
+ if (!I) I = new BinOpInit(opc, lhs, rhs, Type);
+ return I;
+}
+
+Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+ switch (getOpcode()) {
+ default: assert(0 && "Unknown binop");
+ case CONCAT: {
+ DagInit *LHSs = dynamic_cast<DagInit*>(LHS);
+ DagInit *RHSs = dynamic_cast<DagInit*>(RHS);
+ if (LHSs && RHSs) {
+ DefInit *LOp = dynamic_cast<DefInit*>(LHSs->getOperator());
+ DefInit *ROp = dynamic_cast<DefInit*>(RHSs->getOperator());
+ if (LOp == 0 || ROp == 0 || LOp->getDef() != ROp->getDef())
+ throw "Concated Dag operators do not match!";
+ std::vector<Init*> Args;
+ std::vector<std::string> ArgNames;
+ for (unsigned i = 0, e = LHSs->getNumArgs(); i != e; ++i) {
+ Args.push_back(LHSs->getArg(i));
+ ArgNames.push_back(LHSs->getArgName(i));
+ }
+ for (unsigned i = 0, e = RHSs->getNumArgs(); i != e; ++i) {
+ Args.push_back(RHSs->getArg(i));
+ ArgNames.push_back(RHSs->getArgName(i));
+ }
+ return DagInit::get(LHSs->getOperator(), "", Args, ArgNames);
+ }
+ break;
+ }
+ case STRCONCAT: {
+ StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+ StringInit *RHSs = dynamic_cast<StringInit*>(RHS);
+ if (LHSs && RHSs)
+ return StringInit::get(LHSs->getValue() + RHSs->getValue());
+ break;
+ }
+ case EQ: {
+    // Try to fold an eq comparison of 'bit' or 'int' operands; otherwise
+    // fall back to comparing string objects.
+ IntInit* L =
+ dynamic_cast<IntInit*>(LHS->convertInitializerTo(IntRecTy::get()));
+ IntInit* R =
+ dynamic_cast<IntInit*>(RHS->convertInitializerTo(IntRecTy::get()));
+
+ if (L && R)
+ return IntInit::get(L->getValue() == R->getValue());
+
+ StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+ StringInit *RHSs = dynamic_cast<StringInit*>(RHS);
+
+    // Make sure both sides resolved to strings before comparing.
+ if (LHSs && RHSs)
+ return IntInit::get(LHSs->getValue() == RHSs->getValue());
+
+ break;
+ }
+ case SHL:
+ case SRA:
+ case SRL: {
+ IntInit *LHSi = dynamic_cast<IntInit*>(LHS);
+ IntInit *RHSi = dynamic_cast<IntInit*>(RHS);
+ if (LHSi && RHSi) {
+ int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue();
+ int64_t Result;
+ switch (getOpcode()) {
+ default: assert(0 && "Bad opcode!");
+ case SHL: Result = LHSv << RHSv; break;
+ case SRA: Result = LHSv >> RHSv; break;
+ case SRL: Result = (uint64_t)LHSv >> (uint64_t)RHSv; break;
+ }
+ return IntInit::get(Result);
+ }
+ break;
+ }
+ }
+ return const_cast<BinOpInit *>(this);
+}
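+
+// Editor's examples of the folds above, in TableGen syntax:
+//   !strconcat("foo", "bar")  -->  "foobar"
+//   !shl(1, 3)  -->  8,  !srl(16, 2)  -->  4
+//   !eq("x", "x")  -->  1, and !eq(0, 1)  -->  0
+//   !con((ops a, b), (ops c))  -->  (ops a, b, c) when the operators match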
+
+Init *BinOpInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ Init *lhs = LHS->resolveReferences(R, RV);
+ Init *rhs = RHS->resolveReferences(R, RV);
+
+ if (LHS != lhs || RHS != rhs)
+ return (BinOpInit::get(getOpcode(), lhs, rhs, getType()))->Fold(&R, 0);
+ return Fold(&R, 0);
+}
+
+std::string BinOpInit::getAsString() const {
+ std::string Result;
+ switch (Opc) {
+ case CONCAT: Result = "!con"; break;
+ case SHL: Result = "!shl"; break;
+ case SRA: Result = "!sra"; break;
+ case SRL: Result = "!srl"; break;
+ case EQ: Result = "!eq"; break;
+ case STRCONCAT: Result = "!strconcat"; break;
+ }
+ return Result + "(" + LHS->getAsString() + ", " + RHS->getAsString() + ")";
+}
+
+TernOpInit *TernOpInit::get(TernaryOp opc, Init *lhs,
+ Init *mhs, Init *rhs,
+ RecTy *Type) {
+ typedef std::pair<
+ std::pair<
+ std::pair<std::pair<unsigned, RecTy *>, Init *>,
+ Init *
+ >,
+ Init *
+ > Key;
+
+ typedef DenseMap<Key, TernOpInit *> Pool;
+ static Pool ThePool;
+
+ Key TheKey(std::make_pair(std::make_pair(std::make_pair(std::make_pair(opc,
+ Type),
+ lhs),
+ mhs),
+ rhs));
+
+ TernOpInit *&I = ThePool[TheKey];
+ if (!I) I = new TernOpInit(opc, lhs, mhs, rhs, Type);
+ return I;
+}
+
+static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
+ Record *CurRec, MultiClass *CurMultiClass);
+
+static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg,
+ RecTy *Type, Record *CurRec,
+ MultiClass *CurMultiClass) {
+ std::vector<Init *> NewOperands;
+
+ TypedInit *TArg = dynamic_cast<TypedInit*>(Arg);
+
+ // If this is a dag, recurse
+ if (TArg && TArg->getType()->getAsString() == "dag") {
+ Init *Result = ForeachHelper(LHS, Arg, RHSo, Type,
+ CurRec, CurMultiClass);
+ if (Result != 0) {
+ return Result;
+ } else {
+ return 0;
+ }
+ }
+
+ for (int i = 0; i < RHSo->getNumOperands(); ++i) {
+ OpInit *RHSoo = dynamic_cast<OpInit*>(RHSo->getOperand(i));
+
+ if (RHSoo) {
+ Init *Result = EvaluateOperation(RHSoo, LHS, Arg,
+ Type, CurRec, CurMultiClass);
+ if (Result != 0) {
+ NewOperands.push_back(Result);
+ } else {
+ NewOperands.push_back(Arg);
+ }
+ } else if (LHS->getAsString() == RHSo->getOperand(i)->getAsString()) {
+ NewOperands.push_back(Arg);
+ } else {
+ NewOperands.push_back(RHSo->getOperand(i));
+ }
+ }
+
+ // Now run the operator and use its result as the new leaf
+ const OpInit *NewOp = RHSo->clone(NewOperands);
+ Init *NewVal = NewOp->Fold(CurRec, CurMultiClass);
+ if (NewVal != NewOp)
+ return NewVal;
+
+ return 0;
+}
+
+static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
+ Record *CurRec, MultiClass *CurMultiClass) {
+ DagInit *MHSd = dynamic_cast<DagInit*>(MHS);
+ ListInit *MHSl = dynamic_cast<ListInit*>(MHS);
+
+ DagRecTy *DagType = dynamic_cast<DagRecTy*>(Type);
+ ListRecTy *ListType = dynamic_cast<ListRecTy*>(Type);
+
+ OpInit *RHSo = dynamic_cast<OpInit*>(RHS);
+
+ if (!RHSo) {
+ throw TGError(CurRec->getLoc(), "!foreach requires an operator\n");
+ }
+
+ TypedInit *LHSt = dynamic_cast<TypedInit*>(LHS);
+
+ if (!LHSt) {
+ throw TGError(CurRec->getLoc(), "!foreach requires typed variable\n");
+ }
+
+ if ((MHSd && DagType) || (MHSl && ListType)) {
+ if (MHSd) {
+ Init *Val = MHSd->getOperator();
+ Init *Result = EvaluateOperation(RHSo, LHS, Val,
+ Type, CurRec, CurMultiClass);
+ if (Result != 0) {
+ Val = Result;
+ }
+
+ std::vector<std::pair<Init *, std::string> > args;
+ for (unsigned int i = 0; i < MHSd->getNumArgs(); ++i) {
+ Init *Arg;
+ std::string ArgName;
+ Arg = MHSd->getArg(i);
+ ArgName = MHSd->getArgName(i);
+
+ // Process args
+ Init *Result = EvaluateOperation(RHSo, LHS, Arg, Type,
+ CurRec, CurMultiClass);
+ if (Result != 0) {
+ Arg = Result;
+ }
+
+ // TODO: Process arg names
+ args.push_back(std::make_pair(Arg, ArgName));
+ }
+
+ return DagInit::get(Val, "", args);
+ }
+ if (MHSl) {
+ std::vector<Init *> NewOperands;
+ std::vector<Init *> NewList(MHSl->begin(), MHSl->end());
+
+ for (std::vector<Init *>::iterator li = NewList.begin(),
+ liend = NewList.end();
+ li != liend;
+ ++li) {
+ Init *Item = *li;
+ NewOperands.clear();
+        for (int i = 0; i < RHSo->getNumOperands(); ++i) {
+ // First, replace the foreach variable with the list item
+ if (LHS->getAsString() == RHSo->getOperand(i)->getAsString()) {
+ NewOperands.push_back(Item);
+ } else {
+ NewOperands.push_back(RHSo->getOperand(i));
+ }
+ }
+
+ // Now run the operator and use its result as the new list item
+ const OpInit *NewOp = RHSo->clone(NewOperands);
+ Init *NewItem = NewOp->Fold(CurRec, CurMultiClass);
+ if (NewItem != NewOp)
+ *li = NewItem;
+ }
+ return ListInit::get(NewList, MHSl->getType());
+ }
+ }
+ return 0;
+}
+
+Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
+ switch (getOpcode()) {
+  default: assert(0 && "Unknown ternop");
+ case SUBST: {
+ DefInit *LHSd = dynamic_cast<DefInit*>(LHS);
+ VarInit *LHSv = dynamic_cast<VarInit*>(LHS);
+ StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+
+ DefInit *MHSd = dynamic_cast<DefInit*>(MHS);
+ VarInit *MHSv = dynamic_cast<VarInit*>(MHS);
+ StringInit *MHSs = dynamic_cast<StringInit*>(MHS);
+
+ DefInit *RHSd = dynamic_cast<DefInit*>(RHS);
+ VarInit *RHSv = dynamic_cast<VarInit*>(RHS);
+ StringInit *RHSs = dynamic_cast<StringInit*>(RHS);
+
+ if ((LHSd && MHSd && RHSd)
+ || (LHSv && MHSv && RHSv)
+ || (LHSs && MHSs && RHSs)) {
+ if (RHSd) {
+ Record *Val = RHSd->getDef();
+ if (LHSd->getAsString() == RHSd->getAsString()) {
+ Val = MHSd->getDef();
+ }
+ return DefInit::get(Val);
+ }
+ if (RHSv) {
+ std::string Val = RHSv->getName();
+ if (LHSv->getAsString() == RHSv->getAsString()) {
+ Val = MHSv->getName();
+ }
+ return VarInit::get(Val, getType());
+ }
+ if (RHSs) {
+ std::string Val = RHSs->getValue();
+
+ std::string::size_type found;
+ std::string::size_type idx = 0;
+ do {
+ found = Val.find(LHSs->getValue(), idx);
+ if (found != std::string::npos) {
+ Val.replace(found, LHSs->getValue().size(), MHSs->getValue());
+ }
+ idx = found + MHSs->getValue().size();
+ } while (found != std::string::npos);
+
+ return StringInit::get(Val);
+ }
+ }
+ break;
+ }
+
+ case FOREACH: {
+ Init *Result = ForeachHelper(LHS, MHS, RHS, getType(),
+ CurRec, CurMultiClass);
+ if (Result != 0) {
+ return Result;
+ }
+ break;
+ }
+
+ case IF: {
+ IntInit *LHSi = dynamic_cast<IntInit*>(LHS);
+ if (Init *I = LHS->convertInitializerTo(IntRecTy::get()))
+ LHSi = dynamic_cast<IntInit*>(I);
+ if (LHSi) {
+ if (LHSi->getValue()) {
+ return MHS;
+ } else {
+ return RHS;
+ }
+ }
+ break;
+ }
+ }
+
+ return const_cast<TernOpInit *>(this);
+}
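+
+// Editor's examples of the folds above, in TableGen syntax:
+//   !if(1, a, b)                     -->  a
+//   !subst("x", "y", "axa")          -->  "aya"
+//   !foreach(V, [1, 2], !shl(V, 1))  -->  [2, 4]   (V a typed variable)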
+
+Init *TernOpInit::resolveReferences(Record &R,
+ const RecordVal *RV) const {
+ Init *lhs = LHS->resolveReferences(R, RV);
+
+ if (Opc == IF && lhs != LHS) {
+ IntInit *Value = dynamic_cast<IntInit*>(lhs);
+ if (Init *I = lhs->convertInitializerTo(IntRecTy::get()))
+ Value = dynamic_cast<IntInit*>(I);
+ if (Value != 0) {
+ // Short-circuit
+ if (Value->getValue()) {
+ Init *mhs = MHS->resolveReferences(R, RV);
+ return (TernOpInit::get(getOpcode(), lhs, mhs,
+ RHS, getType()))->Fold(&R, 0);
+ } else {
+ Init *rhs = RHS->resolveReferences(R, RV);
+ return (TernOpInit::get(getOpcode(), lhs, MHS,
+ rhs, getType()))->Fold(&R, 0);
+ }
+ }
+ }
+
+ Init *mhs = MHS->resolveReferences(R, RV);
+ Init *rhs = RHS->resolveReferences(R, RV);
+
+ if (LHS != lhs || MHS != mhs || RHS != rhs)
+ return (TernOpInit::get(getOpcode(), lhs, mhs, rhs,
+ getType()))->Fold(&R, 0);
+ return Fold(&R, 0);
+}
+
+std::string TernOpInit::getAsString() const {
+ std::string Result;
+ switch (Opc) {
+ case SUBST: Result = "!subst"; break;
+ case FOREACH: Result = "!foreach"; break;
+ case IF: Result = "!if"; break;
+ }
+ return Result + "(" + LHS->getAsString() + ", " + MHS->getAsString() + ", "
+ + RHS->getAsString() + ")";
+}
+
+RecTy *TypedInit::getFieldType(const std::string &FieldName) const {
+ RecordRecTy *RecordType = dynamic_cast<RecordRecTy *>(getType());
+ if (RecordType) {
+ RecordVal *Field = RecordType->getRecord()->getValue(FieldName);
+ if (Field) {
+ return Field->getType();
+ }
+ }
+ return 0;
+}
+
+Init *
+TypedInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
+ BitsRecTy *T = dynamic_cast<BitsRecTy*>(getType());
+ if (T == 0) return 0; // Cannot subscript a non-bits variable.
+ unsigned NumBits = T->getNumBits();
+
+ SmallVector<Init *, 16> NewBits(Bits.size());
+ for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
+ if (Bits[i] >= NumBits)
+ return 0;
+
+ NewBits[i] = VarBitInit::get(const_cast<TypedInit *>(this), Bits[i]);
+ }
+ return BitsInit::get(NewBits);
+}
+
+Init *
+TypedInit::convertInitListSlice(const std::vector<unsigned> &Elements) const {
+ ListRecTy *T = dynamic_cast<ListRecTy*>(getType());
+ if (T == 0) return 0; // Cannot subscript a non-list variable.
+
+ if (Elements.size() == 1)
+ return VarListElementInit::get(const_cast<TypedInit *>(this), Elements[0]);
+
+ std::vector<Init*> ListInits;
+ ListInits.reserve(Elements.size());
+ for (unsigned i = 0, e = Elements.size(); i != e; ++i)
+ ListInits.push_back(VarListElementInit::get(const_cast<TypedInit *>(this),
+ Elements[i]));
+ return ListInit::get(ListInits, T);
+}
+
+
+VarInit *VarInit::get(const std::string &VN, RecTy *T) {
+ typedef std::pair<RecTy *, TableGenStringKey> Key;
+ typedef DenseMap<Key, VarInit *> Pool;
+ static Pool ThePool;
+
+ Key TheKey(std::make_pair(T, VN));
+
+ VarInit *&I = ThePool[TheKey];
+ if (!I) I = new VarInit(VN, T);
+ return I;
+}
+
+Init *VarInit::resolveBitReference(Record &R, const RecordVal *IRV,
+ unsigned Bit) const {
+ if (R.isTemplateArg(getName())) return 0;
+ if (IRV && IRV->getName() != getName()) return 0;
+
+ RecordVal *RV = R.getValue(getName());
+ assert(RV && "Reference to a non-existent variable?");
+ assert(dynamic_cast<BitsInit*>(RV->getValue()));
+ BitsInit *BI = (BitsInit*)RV->getValue();
+
+ assert(Bit < BI->getNumBits() && "Bit reference out of range!");
+ Init *B = BI->getBit(Bit);
+
+ // If the bit is set to some value, or if we are resolving a reference to a
+ // specific variable and that variable is explicitly unset, then replace the
+ // VarBitInit with it.
+ if (IRV || !dynamic_cast<UnsetInit*>(B))
+ return B;
+ return 0;
+}
+
+Init *VarInit::resolveListElementReference(Record &R,
+ const RecordVal *IRV,
+ unsigned Elt) const {
+ if (R.isTemplateArg(getName())) return 0;
+ if (IRV && IRV->getName() != getName()) return 0;
+
+ RecordVal *RV = R.getValue(getName());
+ assert(RV && "Reference to a non-existent variable?");
+ ListInit *LI = dynamic_cast<ListInit*>(RV->getValue());
+ if (!LI) {
+ TypedInit *VI = dynamic_cast<TypedInit*>(RV->getValue());
+ assert(VI && "Invalid list element!");
+ return VarListElementInit::get(VI, Elt);
+ }
+
+ if (Elt >= LI->getSize())
+ return 0; // Out of range reference.
+ Init *E = LI->getElement(Elt);
+ // If the element is set to some value, or if we are resolving a reference
+ // to a specific variable and that variable is explicitly unset, then
+ // replace the VarListElementInit with it.
+ if (IRV || !dynamic_cast<UnsetInit*>(E))
+ return E;
+ return 0;
+}
+
+
+RecTy *VarInit::getFieldType(const std::string &FieldName) const {
+ if (RecordRecTy *RTy = dynamic_cast<RecordRecTy*>(getType()))
+ if (const RecordVal *RV = RTy->getRecord()->getValue(FieldName))
+ return RV->getType();
+ return 0;
+}
+
+Init *VarInit::getFieldInit(Record &R, const RecordVal *RV,
+ const std::string &FieldName) const {
+ if (dynamic_cast<RecordRecTy*>(getType()))
+ if (const RecordVal *Val = R.getValue(VarName)) {
+ if (RV != Val && (RV || dynamic_cast<UnsetInit*>(Val->getValue())))
+ return 0;
+ Init *TheInit = Val->getValue();
+ assert(TheInit != this && "Infinite loop detected!");
+ if (Init *I = TheInit->getFieldInit(R, RV, FieldName))
+ return I;
+ else
+ return 0;
+ }
+ return 0;
+}
+
+/// resolveReferences - This method is used by classes that refer to other
+/// variables which may not be defined at the time the expression is formed.
+/// If a value is set for the variable later, this method will be called on
+/// users of the value to allow the value to propagate out.
+///
+Init *VarInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ if (RecordVal *Val = R.getValue(VarName))
+ if (RV == Val || (RV == 0 && !dynamic_cast<UnsetInit*>(Val->getValue())))
+ return Val->getValue();
+ return const_cast<VarInit *>(this);
+}
+
+VarBitInit *VarBitInit::get(TypedInit *T, unsigned B) {
+ typedef std::pair<TypedInit *, unsigned> Key;
+ typedef DenseMap<Key, VarBitInit *> Pool;
+
+ static Pool ThePool;
+
+ Key TheKey(std::make_pair(T, B));
+
+ VarBitInit *&I = ThePool[TheKey];
+ if (!I) I = new VarBitInit(T, B);
+ return I;
+}
+
+std::string VarBitInit::getAsString() const {
+ return TI->getAsString() + "{" + utostr(Bit) + "}";
+}
+
+Init *VarBitInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ if (Init *I = getVariable()->resolveBitReference(R, RV, getBitNum()))
+ return I;
+ return const_cast<VarBitInit *>(this);
+}
+
+VarListElementInit *VarListElementInit::get(TypedInit *T,
+ unsigned E) {
+ typedef std::pair<TypedInit *, unsigned> Key;
+ typedef DenseMap<Key, VarListElementInit *> Pool;
+
+ static Pool ThePool;
+
+ Key TheKey(std::make_pair(T, E));
+
+ VarListElementInit *&I = ThePool[TheKey];
+ if (!I) I = new VarListElementInit(T, E);
+ return I;
+}
+
+std::string VarListElementInit::getAsString() const {
+ return TI->getAsString() + "[" + utostr(Element) + "]";
+}
+
+Init *
+VarListElementInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ if (Init *I = getVariable()->resolveListElementReference(R, RV,
+ getElementNum()))
+ return I;
+ return const_cast<VarListElementInit *>(this);
+}
+
+Init *VarListElementInit::resolveBitReference(Record &R, const RecordVal *RV,
+ unsigned Bit) const {
+ // FIXME: This should be implemented, to support references like:
+ // bit B = AA[0]{1};
+ return 0;
+}
+
+Init *VarListElementInit::resolveListElementReference(Record &R,
+ const RecordVal *RV,
+ unsigned Elt) const {
+ Init *Result = TI->resolveListElementReference(R, RV, Element);
+
+ if (Result) {
+ TypedInit *TInit = dynamic_cast<TypedInit *>(Result);
+ if (TInit) {
+ Init *Result2 = TInit->resolveListElementReference(R, RV, Elt);
+ if (Result2) return Result2;
+      return VarListElementInit::get(TInit, Elt);
+ }
+ return Result;
+ }
+
+ return 0;
+}
+
+DefInit *DefInit::get(Record *R) {
+ return R->getDefInit();
+}
+
+RecTy *DefInit::getFieldType(const std::string &FieldName) const {
+ if (const RecordVal *RV = Def->getValue(FieldName))
+ return RV->getType();
+ return 0;
+}
+
+Init *DefInit::getFieldInit(Record &R, const RecordVal *RV,
+ const std::string &FieldName) const {
+ return Def->getValue(FieldName)->getValue();
+}
+
+
+std::string DefInit::getAsString() const {
+ return Def->getName();
+}
+
+FieldInit *FieldInit::get(Init *R, const std::string &FN) {
+ typedef std::pair<Init *, TableGenStringKey> Key;
+ typedef DenseMap<Key, FieldInit *> Pool;
+ static Pool ThePool;
+
+ Key TheKey(std::make_pair(R, FN));
+
+ FieldInit *&I = ThePool[TheKey];
+ if (!I) I = new FieldInit(R, FN);
+ return I;
+}
+
+Init *FieldInit::resolveBitReference(Record &R, const RecordVal *RV,
+ unsigned Bit) const {
+ if (Init *BitsVal = Rec->getFieldInit(R, RV, FieldName))
+ if (BitsInit *BI = dynamic_cast<BitsInit*>(BitsVal)) {
+ assert(Bit < BI->getNumBits() && "Bit reference out of range!");
+ Init *B = BI->getBit(Bit);
+
+ if (dynamic_cast<BitInit*>(B)) // If the bit is set.
+ return B; // Replace the VarBitInit with it.
+ }
+ return 0;
+}
+
+Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV,
+ unsigned Elt) const {
+ if (Init *ListVal = Rec->getFieldInit(R, RV, FieldName))
+ if (ListInit *LI = dynamic_cast<ListInit*>(ListVal)) {
+ if (Elt >= LI->getSize()) return 0;
+ Init *E = LI->getElement(Elt);
+
+ // If the element is set to some value, or if we are resolving a
+ // reference to a specific variable and that variable is explicitly
+ // unset, then replace the VarListElementInit with it.
+ if (RV || !dynamic_cast<UnsetInit*>(E))
+ return E;
+ }
+ return 0;
+}
+
+Init *FieldInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ Init *NewRec = RV ? Rec->resolveReferences(R, RV) : Rec;
+
+ Init *BitsVal = NewRec->getFieldInit(R, RV, FieldName);
+ if (BitsVal) {
+ Init *BVR = BitsVal->resolveReferences(R, RV);
+ return BVR->isComplete() ? BVR : const_cast<FieldInit *>(this);
+ }
+
+ if (NewRec != Rec) {
+ return FieldInit::get(NewRec, FieldName);
+ }
+ return const_cast<FieldInit *>(this);
+}
+
+void ProfileDagInit(FoldingSetNodeID &ID,
+ Init *V,
+ const std::string &VN,
+ ArrayRef<Init *> ArgRange,
+ ArrayRef<std::string> NameRange) {
+ ID.AddPointer(V);
+ ID.AddString(VN);
+
+ ArrayRef<Init *>::iterator Arg = ArgRange.begin();
+ ArrayRef<std::string>::iterator Name = NameRange.begin();
+ while (Arg != ArgRange.end()) {
+ assert(Name != NameRange.end() && "Arg name underflow!");
+ ID.AddPointer(*Arg++);
+ ID.AddString(*Name++);
+ }
+ assert(Name == NameRange.end() && "Arg name overflow!");
+}
+
+DagInit *
+DagInit::get(Init *V, const std::string &VN,
+ ArrayRef<Init *> ArgRange,
+ ArrayRef<std::string> NameRange) {
+ typedef FoldingSet<DagInit> Pool;
+ static Pool ThePool;
+
+ FoldingSetNodeID ID;
+ ProfileDagInit(ID, V, VN, ArgRange, NameRange);
+
+ void *IP = 0;
+ if (DagInit *I = ThePool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ DagInit *I = new DagInit(V, VN, ArgRange, NameRange);
+ ThePool.InsertNode(I, IP);
+
+ return I;
+}
+
+DagInit *
+DagInit::get(Init *V, const std::string &VN,
+ const std::vector<std::pair<Init*, std::string> > &args) {
+ typedef std::pair<Init*, std::string> PairType;
+
+ std::vector<Init *> Args;
+ std::vector<std::string> Names;
+
+ for (std::vector<PairType>::const_iterator i = args.begin(),
+ iend = args.end();
+ i != iend;
+ ++i) {
+ Args.push_back(i->first);
+ Names.push_back(i->second);
+ }
+
+ return DagInit::get(V, VN, Args, Names);
+}
+
+void DagInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileDagInit(ID, Val, ValName, Args, ArgNames);
+}
+
+Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const {
+ std::vector<Init*> NewArgs;
+ for (unsigned i = 0, e = Args.size(); i != e; ++i)
+ NewArgs.push_back(Args[i]->resolveReferences(R, RV));
+
+ Init *Op = Val->resolveReferences(R, RV);
+
+ if (Args != NewArgs || Op != Val)
+ return DagInit::get(Op, ValName, NewArgs, ArgNames);
+
+ return const_cast<DagInit *>(this);
+}
+
+
+std::string DagInit::getAsString() const {
+ std::string Result = "(" + Val->getAsString();
+ if (!ValName.empty())
+ Result += ":" + ValName;
+ if (Args.size()) {
+ Result += " " + Args[0]->getAsString();
+ if (!ArgNames[0].empty()) Result += ":$" + ArgNames[0];
+ for (unsigned i = 1, e = Args.size(); i != e; ++i) {
+ Result += ", " + Args[i]->getAsString();
+ if (!ArgNames[i].empty()) Result += ":$" + ArgNames[i];
+ }
+ }
+ return Result + ")";
+}
+
+
+//===----------------------------------------------------------------------===//
+// Other implementations
+//===----------------------------------------------------------------------===//
+
+RecordVal::RecordVal(Init *N, RecTy *T, unsigned P)
+ : Name(N), Ty(T), Prefix(P) {
+ Value = Ty->convertValue(UnsetInit::get());
+ assert(Value && "Cannot create unset value for current type!");
+}
+
+RecordVal::RecordVal(const std::string &N, RecTy *T, unsigned P)
+ : Name(StringInit::get(N)), Ty(T), Prefix(P) {
+ Value = Ty->convertValue(UnsetInit::get());
+ assert(Value && "Cannot create unset value for current type!");
+}
+
+const std::string &RecordVal::getName() const {
+ StringInit *NameString = dynamic_cast<StringInit *>(Name);
+ assert(NameString && "RecordVal name is not a string!");
+ return NameString->getValue();
+}
+
+void RecordVal::dump() const { errs() << *this; }
+
+void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
+ if (getPrefix()) OS << "field ";
+ OS << *getType() << " " << getName();
+
+ if (getValue())
+ OS << " = " << *getValue();
+
+ if (PrintSem) OS << ";\n";
+}
+
+unsigned Record::LastID = 0;
+
+void Record::checkName() {
+ // Ensure the record name has string type.
+ const TypedInit *TypedName = dynamic_cast<const TypedInit *>(Name);
+ assert(TypedName && "Record name is not typed!");
+ RecTy *Type = TypedName->getType();
+ if (dynamic_cast<StringRecTy *>(Type) == 0) {
+ llvm_unreachable("Record name is not a string!");
+ }
+}
+
+DefInit *Record::getDefInit() {
+ if (!TheInit)
+ TheInit = new DefInit(this, new RecordRecTy(this));
+ return TheInit;
+}
+
+const std::string &Record::getName() const {
+ const StringInit *NameString =
+ dynamic_cast<const StringInit *>(Name);
+ assert(NameString && "Record name is not a string!");
+ return NameString->getValue();
+}
+
+void Record::setName(Init *NewName) {
+ if (TrackedRecords.getDef(Name->getAsUnquotedString()) == this) {
+ TrackedRecords.removeDef(Name->getAsUnquotedString());
+ Name = NewName;
+ TrackedRecords.addDef(this);
+ } else {
+ TrackedRecords.removeClass(Name->getAsUnquotedString());
+ Name = NewName;
+ TrackedRecords.addClass(this);
+ }
+ checkName();
+ // Since the Init for the name was changed, see if we can resolve
+ // any of it using members of the Record.
+ Init *ComputedName = Name->resolveReferences(*this, 0);
+ if (ComputedName != Name) {
+ setName(ComputedName);
+ }
+ // DO NOT resolve record values to the name at this point because
+ // there might be default values for arguments of this def. Those
+ // arguments might not have been resolved yet so we don't want to
+ // prematurely assume values for those arguments were not passed to
+ // this def.
+ //
+ // Nonetheless, it may be that some of this Record's values
+ // reference the record name. Indeed, the reason for having the
+ // record name be an Init is to provide this flexibility. The extra
+// resolve steps after completely instantiating defs take care of
+ // this. See TGParser::ParseDef and TGParser::ParseDefm.
+}
+
+void Record::setName(const std::string &Name) {
+ setName(StringInit::get(Name));
+}
+
+/// resolveReferencesTo - If anything in this record refers to RV, replace the
+/// reference to RV with the RHS of RV. If RV is null, we resolve all possible
+/// references.
+void Record::resolveReferencesTo(const RecordVal *RV) {
+ for (unsigned i = 0, e = Values.size(); i != e; ++i) {
+ if (Init *V = Values[i].getValue())
+ Values[i].setValue(V->resolveReferences(*this, RV));
+ }
+}
+
+void Record::dump() const { errs() << *this; }
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
+ OS << R.getName();
+
+ const std::vector<std::string> &TArgs = R.getTemplateArgs();
+ if (!TArgs.empty()) {
+ OS << "<";
+ for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+ if (i) OS << ", ";
+ const RecordVal *RV = R.getValue(TArgs[i]);
+ assert(RV && "Template argument record not found??");
+ RV->print(OS, false);
+ }
+ OS << ">";
+ }
+
+ OS << " {";
+ const std::vector<Record*> &SC = R.getSuperClasses();
+ if (!SC.empty()) {
+ OS << "\t//";
+ for (unsigned i = 0, e = SC.size(); i != e; ++i)
+ OS << " " << SC[i]->getName();
+ }
+ OS << "\n";
+
+ const std::vector<RecordVal> &Vals = R.getValues();
+ for (unsigned i = 0, e = Vals.size(); i != e; ++i)
+ if (Vals[i].getPrefix() && !R.isTemplateArg(Vals[i].getName()))
+ OS << Vals[i];
+ for (unsigned i = 0, e = Vals.size(); i != e; ++i)
+ if (!Vals[i].getPrefix() && !R.isTemplateArg(Vals[i].getName()))
+ OS << Vals[i];
+
+ return OS << "}\n";
+}
+
+/// getValueInit - Return the initializer for a value with the specified name,
+/// or throw an exception if the field does not exist.
+///
+Init *Record::getValueInit(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+ return R->getValue();
+}
+
+
+/// getValueAsString - This method looks up the specified field and returns its
+/// value as a string, throwing an exception if the field does not exist or if
+/// the value is not a string.
+///
+std::string Record::getValueAsString(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (StringInit *SI = dynamic_cast<StringInit*>(R->getValue()))
+ return SI->getValue();
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a string initializer!";
+}
+
+/// getValueAsBitsInit - This method looks up the specified field and returns
+/// its value as a BitsInit, throwing an exception if the field does not exist
+/// or if the value is not the right type.
+///
+BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (BitsInit *BI = dynamic_cast<BitsInit*>(R->getValue()))
+ return BI;
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a BitsInit initializer!";
+}
+
+/// getValueAsListInit - This method looks up the specified field and returns
+/// its value as a ListInit, throwing an exception if the field does not exist
+/// or if the value is not the right type.
+///
+ListInit *Record::getValueAsListInit(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (ListInit *LI = dynamic_cast<ListInit*>(R->getValue()))
+ return LI;
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a list initializer!";
+}
+
+/// getValueAsListOfDefs - This method looks up the specified field and returns
+/// its value as a vector of records, throwing an exception if the field does
+/// not exist or if the value is not the right type.
+///
+std::vector<Record*>
+Record::getValueAsListOfDefs(StringRef FieldName) const {
+ ListInit *List = getValueAsListInit(FieldName);
+ std::vector<Record*> Defs;
+ for (unsigned i = 0; i < List->getSize(); i++) {
+ if (DefInit *DI = dynamic_cast<DefInit*>(List->getElement(i))) {
+ Defs.push_back(DI->getDef());
+ } else {
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' list is not entirely DefInit!";
+ }
+ }
+ return Defs;
+}
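+
+// Editor's example: for a hypothetical "def X { list<Reg> Uses = [R0, R1]; }",
+// X->getValueAsListOfDefs("Uses") returns the R0 and R1 records.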
+
+/// getValueAsInt - This method looks up the specified field and returns its
+/// value as an int64_t, throwing an exception if the field does not exist or if
+/// the value is not the right type.
+///
+int64_t Record::getValueAsInt(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (IntInit *II = dynamic_cast<IntInit*>(R->getValue()))
+ return II->getValue();
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have an int initializer!";
+}
+
+/// getValueAsListOfInts - This method looks up the specified field and returns
+/// its value as a vector of integers, throwing an exception if the field does
+/// not exist or if the value is not the right type.
+///
+std::vector<int64_t>
+Record::getValueAsListOfInts(StringRef FieldName) const {
+ ListInit *List = getValueAsListInit(FieldName);
+ std::vector<int64_t> Ints;
+ for (unsigned i = 0; i < List->getSize(); i++) {
+ if (IntInit *II = dynamic_cast<IntInit*>(List->getElement(i))) {
+ Ints.push_back(II->getValue());
+ } else {
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a list of ints initializer!";
+ }
+ }
+ return Ints;
+}
+
+/// getValueAsListOfStrings - This method looks up the specified field and
+/// returns its value as a vector of strings, throwing an exception if the
+/// field does not exist or if the value is not the right type.
+///
+std::vector<std::string>
+Record::getValueAsListOfStrings(StringRef FieldName) const {
+ ListInit *List = getValueAsListInit(FieldName);
+ std::vector<std::string> Strings;
+ for (unsigned i = 0; i < List->getSize(); i++) {
+ if (StringInit *II = dynamic_cast<StringInit*>(List->getElement(i))) {
+ Strings.push_back(II->getValue());
+ } else {
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a list of strings initializer!";
+ }
+ }
+ return Strings;
+}
+
+/// getValueAsDef - This method looks up the specified field and returns its
+/// value as a Record, throwing an exception if the field does not exist or if
+/// the value is not the right type.
+///
+Record *Record::getValueAsDef(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (DefInit *DI = dynamic_cast<DefInit*>(R->getValue()))
+ return DI->getDef();
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a def initializer!";
+}
+
+/// getValueAsBit - This method looks up the specified field and returns its
+/// value as a bit, throwing an exception if the field does not exist or if
+/// the value is not the right type.
+///
+bool Record::getValueAsBit(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (BitInit *BI = dynamic_cast<BitInit*>(R->getValue()))
+ return BI->getValue();
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a bit initializer!";
+}
+
+/// getValueAsDag - This method looks up the specified field and returns its
+/// value as a Dag, throwing an exception if the field does not exist or if
+/// the value is not the right type.
+///
+DagInit *Record::getValueAsDag(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (DagInit *DI = dynamic_cast<DagInit*>(R->getValue()))
+ return DI;
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a dag initializer!";
+}
+
+std::string Record::getValueAsCode(StringRef FieldName) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (CodeInit *CI = dynamic_cast<CodeInit*>(R->getValue()))
+ return CI->getValue();
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a code initializer!";
+}
+
+
+void MultiClass::dump() const {
+ errs() << "Record:\n";
+ Rec.dump();
+
+ errs() << "Defs:\n";
+ for (RecordVector::const_iterator r = DefPrototypes.begin(),
+ rend = DefPrototypes.end();
+ r != rend;
+ ++r) {
+ (*r)->dump();
+ }
+}
+
+
+void RecordKeeper::dump() const { errs() << *this; }
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const RecordKeeper &RK) {
+ OS << "------------- Classes -----------------\n";
+ const std::map<std::string, Record*> &Classes = RK.getClasses();
+ for (std::map<std::string, Record*>::const_iterator I = Classes.begin(),
+ E = Classes.end(); I != E; ++I)
+ OS << "class " << *I->second;
+
+ OS << "------------- Defs -----------------\n";
+ const std::map<std::string, Record*> &Defs = RK.getDefs();
+ for (std::map<std::string, Record*>::const_iterator I = Defs.begin(),
+ E = Defs.end(); I != E; ++I)
+ OS << "def " << *I->second;
+ return OS;
+}
+
+
+/// getAllDerivedDefinitions - This method returns all concrete definitions
+/// that derive from the specified class name. If a class with the specified
+/// name does not exist, an exception is thrown.
+std::vector<Record*>
+RecordKeeper::getAllDerivedDefinitions(const std::string &ClassName) const {
+ Record *Class = getClass(ClassName);
+ if (!Class)
+ throw "ERROR: Couldn't find the `" + ClassName + "' class!\n";
+
+ std::vector<Record*> Defs;
+ for (std::map<std::string, Record*>::const_iterator I = getDefs().begin(),
+ E = getDefs().end(); I != E; ++I)
+ if (I->second->isSubClassOf(Class))
+ Defs.push_back(I->second);
+
+ return Defs;
+}
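+
+// Illustrative usage (a sketch, not part of this file): a TableGen backend
+// typically combines getAllDerivedDefinitions() with the typed accessors
+// above. The class name "Instruction" and field "AsmString" here are only
+// placeholders:
+//
+//   std::vector<Record*> Insts =
+//       Records.getAllDerivedDefinitions("Instruction");
+//   for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
+//     std::string Asm = Insts[i]->getValueAsString("AsmString");
+//     (void)Asm; // ... backend-specific emission would go here ...
+//   }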
+
diff --git a/lib/TableGen/TGLexer.cpp b/lib/TableGen/TGLexer.cpp
new file mode 100644
index 0000000..8c1b429
--- /dev/null
+++ b/lib/TableGen/TGLexer.cpp
@@ -0,0 +1,435 @@
+//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the Lexer for TableGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TGLexer.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Config/config.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cerrno>
+using namespace llvm;
+
+TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
+ CurBuffer = 0;
+ CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
+ CurPtr = CurBuf->getBufferStart();
+ TokStart = 0;
+}
+
+SMLoc TGLexer::getLoc() const {
+ return SMLoc::getFromPointer(TokStart);
+}
+
+/// ReturnError - Set the error to the specified string at the specified
+/// location. This is defined to always return tgtok::Error.
+tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
+ PrintError(Loc, Msg);
+ return tgtok::Error;
+}
+
+int TGLexer::getNextChar() {
+ char CurChar = *CurPtr++;
+ switch (CurChar) {
+ default:
+ return (unsigned char)CurChar;
+ case 0: {
+ // A nul character in the stream is either the end of the current buffer or
+ // a random nul in the file. Disambiguate that here.
+ if (CurPtr-1 != CurBuf->getBufferEnd())
+ return 0; // Just whitespace.
+
+ // If this is the end of an included file, pop the parent file off the
+ // include stack.
+ SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
+ if (ParentIncludeLoc != SMLoc()) {
+ CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
+ CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
+ CurPtr = ParentIncludeLoc.getPointer();
+ return getNextChar();
+ }
+
+ // Otherwise, return end of file.
+ --CurPtr; // Another call to lex will return EOF again.
+ return EOF;
+ }
+ case '\n':
+ case '\r':
+ // Handle newlines by collapsing them to a single '\n'. Be careful about
+ // 'dos style' files with \r\n (or \n\r) in them: treat each such
+ // two-character sequence as one newline.
+ if ((*CurPtr == '\n' || *CurPtr == '\r') &&
+ *CurPtr != CurChar)
+ ++CurPtr; // Eat the two char newline sequence.
+ return '\n';
+ }
+}
+
+tgtok::TokKind TGLexer::LexToken() {
+ TokStart = CurPtr;
+ // This always consumes at least one character.
+ int CurChar = getNextChar();
+
+ switch (CurChar) {
+ default:
+ // Handle letters: [a-zA-Z_#]
+ if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
+ return LexIdentifier();
+
+ // Unknown character, emit an error.
+ return ReturnError(TokStart, "Unexpected character");
+ case EOF: return tgtok::Eof;
+ case ':': return tgtok::colon;
+ case ';': return tgtok::semi;
+ case '.': return tgtok::period;
+ case ',': return tgtok::comma;
+ case '<': return tgtok::less;
+ case '>': return tgtok::greater;
+ case ']': return tgtok::r_square;
+ case '{': return tgtok::l_brace;
+ case '}': return tgtok::r_brace;
+ case '(': return tgtok::l_paren;
+ case ')': return tgtok::r_paren;
+ case '=': return tgtok::equal;
+ case '?': return tgtok::question;
+
+ case 0:
+ case ' ':
+ case '\t':
+ case '\n':
+ case '\r':
+ // Ignore whitespace.
+ return LexToken();
+ case '/':
+ // If this is the start of a // comment, skip until the end of the line or
+ // the end of the buffer.
+ if (*CurPtr == '/')
+ SkipBCPLComment();
+ else if (*CurPtr == '*') {
+ if (SkipCComment())
+ return tgtok::Error;
+ } else // Otherwise, this is an error.
+ return ReturnError(TokStart, "Unexpected character");
+ return LexToken();
+ case '-': case '+':
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6':
+ case '7': case '8': case '9':
+ return LexNumber();
+ case '"': return LexString();
+ case '$': return LexVarName();
+ case '[': return LexBracket();
+ case '!': return LexExclaim();
+ }
+}
+
+/// LexString - Lex "[^"]*"
+tgtok::TokKind TGLexer::LexString() {
+ const char *StrStart = CurPtr;
+
+ CurStrVal = "";
+
+ while (*CurPtr != '"') {
+ // If we hit the end of the buffer, report an error.
+ if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
+ return ReturnError(StrStart, "End of file in string literal");
+
+ if (*CurPtr == '\n' || *CurPtr == '\r')
+ return ReturnError(StrStart, "End of line in string literal");
+
+ if (*CurPtr != '\\') {
+ CurStrVal += *CurPtr++;
+ continue;
+ }
+
+ ++CurPtr;
+
+ switch (*CurPtr) {
+ case '\\': case '\'': case '"':
+ // These turn into their literal character.
+ CurStrVal += *CurPtr++;
+ break;
+ case 't':
+ CurStrVal += '\t';
+ ++CurPtr;
+ break;
+ case 'n':
+ CurStrVal += '\n';
+ ++CurPtr;
+ break;
+
+ case '\n':
+ case '\r':
+ return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
+
+ // If we hit the end of the buffer, report an error.
+ case '\0':
+ if (CurPtr == CurBuf->getBufferEnd())
+ return ReturnError(StrStart, "End of file in string literal");
+ // FALL THROUGH
+ default:
+ return ReturnError(CurPtr, "invalid escape in string literal");
+ }
+ }
+
+ ++CurPtr;
+ return tgtok::StrVal;
+}
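+
+// For example (illustrative): the input "a\tb\\c" lexes to tgtok::StrVal
+// with CurStrVal holding 'a', a tab, 'b', a backslash, 'c'; escaped
+// newlines and unrecognized escapes are rejected by the loop above.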
+
+tgtok::TokKind TGLexer::LexVarName() {
+ if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
+ return ReturnError(TokStart, "Invalid variable name");
+
+ // Otherwise, we're ok, consume the rest of the characters.
+ const char *VarNameStart = CurPtr++;
+
+ while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
+ ++CurPtr;
+
+ CurStrVal.assign(VarNameStart, CurPtr);
+ return tgtok::VarName;
+}
+
+
+tgtok::TokKind TGLexer::LexIdentifier() {
+ // The first letter is [a-zA-Z_#].
+ const char *IdentStart = TokStart;
+
+ // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
+ while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' ||
+ *CurPtr == '#')
+ ++CurPtr;
+
+ // Check to see if this identifier is a keyword.
+ StringRef Str(IdentStart, CurPtr-IdentStart);
+
+ if (Str == "include") {
+ if (LexInclude()) return tgtok::Error;
+ return Lex();
+ }
+
+ tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
+ .Case("int", tgtok::Int)
+ .Case("bit", tgtok::Bit)
+ .Case("bits", tgtok::Bits)
+ .Case("string", tgtok::String)
+ .Case("list", tgtok::List)
+ .Case("code", tgtok::Code)
+ .Case("dag", tgtok::Dag)
+ .Case("class", tgtok::Class)
+ .Case("def", tgtok::Def)
+ .Case("defm", tgtok::Defm)
+ .Case("multiclass", tgtok::MultiClass)
+ .Case("field", tgtok::Field)
+ .Case("let", tgtok::Let)
+ .Case("in", tgtok::In)
+ .Default(tgtok::Id);
+
+ if (Kind == tgtok::Id)
+ CurStrVal.assign(Str.begin(), Str.end());
+ return Kind;
+}
+
+/// LexInclude - We just read the "include" token. Get the string token that
+/// comes next and enter the include.
+bool TGLexer::LexInclude() {
+ // The token after the include must be a string.
+ tgtok::TokKind Tok = LexToken();
+ if (Tok == tgtok::Error) return true;
+ if (Tok != tgtok::StrVal) {
+ PrintError(getLoc(), "Expected filename after include");
+ return true;
+ }
+
+ // Get the string.
+ std::string Filename = CurStrVal;
+ std::string IncludedFile;
+
+ CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
+ IncludedFile);
+ if (CurBuffer == -1) {
+ PrintError(getLoc(), "Could not find include file '" + Filename + "'");
+ return true;
+ }
+
+ Dependencies.push_back(IncludedFile);
+ // Switch the lexer over to the included buffer.
+ CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
+ CurPtr = CurBuf->getBufferStart();
+ return false;
+}
+
+void TGLexer::SkipBCPLComment() {
+ ++CurPtr; // skip the second slash.
+ while (1) {
+ switch (*CurPtr) {
+ case '\n':
+ case '\r':
+ return; // Newline is end of comment.
+ case 0:
+ // If this is the end of the buffer, end the comment.
+ if (CurPtr == CurBuf->getBufferEnd())
+ return;
+ break;
+ }
+ // Otherwise, skip the character.
+ ++CurPtr;
+ }
+}
+
+/// SkipCComment - This skips C-style /**/ comments. The only difference from C
+/// is that we allow nesting.
+bool TGLexer::SkipCComment() {
+ ++CurPtr; // skip the star.
+ unsigned CommentDepth = 1;
+
+ while (1) {
+ int CurChar = getNextChar();
+ switch (CurChar) {
+ case EOF:
+ PrintError(TokStart, "Unterminated comment!");
+ return true;
+ case '*':
+ // End of the comment?
+ if (CurPtr[0] != '/') break;
+
+ ++CurPtr; // End the */.
+ if (--CommentDepth == 0)
+ return false;
+ break;
+ case '/':
+ // Start of a nested comment?
+ if (CurPtr[0] != '*') break;
+ ++CurPtr;
+ ++CommentDepth;
+ break;
+ }
+ }
+}
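+
+// For example (illustrative): unlike C, the input
+//   /* outer /* inner */ still one comment */
+// is skipped in its entirety, because CommentDepth tracks the nesting.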
+
+/// LexNumber - Lex:
+/// [-+]?[0-9]+
+/// 0x[0-9a-fA-F]+
+/// 0b[01]+
+tgtok::TokKind TGLexer::LexNumber() {
+ if (CurPtr[-1] == '0') {
+ if (CurPtr[0] == 'x') {
+ ++CurPtr;
+ const char *NumStart = CurPtr;
+ while (isxdigit(CurPtr[0]))
+ ++CurPtr;
+
+ // Requires at least one hex digit.
+ if (CurPtr == NumStart)
+ return ReturnError(TokStart, "Invalid hexadecimal number");
+
+ errno = 0;
+ CurIntVal = strtoll(NumStart, 0, 16);
+ if (errno == EINVAL)
+ return ReturnError(TokStart, "Invalid hexadecimal number");
+ if (errno == ERANGE) {
+ errno = 0;
+ CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
+ if (errno == EINVAL)
+ return ReturnError(TokStart, "Invalid hexadecimal number");
+ if (errno == ERANGE)
+ return ReturnError(TokStart, "Hexadecimal number out of range");
+ }
+ return tgtok::IntVal;
+ } else if (CurPtr[0] == 'b') {
+ ++CurPtr;
+ const char *NumStart = CurPtr;
+ while (CurPtr[0] == '0' || CurPtr[0] == '1')
+ ++CurPtr;
+
+ // Requires at least one binary digit.
+ if (CurPtr == NumStart)
+ return ReturnError(CurPtr-2, "Invalid binary number");
+ CurIntVal = strtoll(NumStart, 0, 2);
+ return tgtok::IntVal;
+ }
+ }
+
+ // Check for a sign without a digit.
+ if (!isdigit(CurPtr[0])) {
+ if (CurPtr[-1] == '-')
+ return tgtok::minus;
+ else if (CurPtr[-1] == '+')
+ return tgtok::plus;
+ }
+
+ while (isdigit(CurPtr[0]))
+ ++CurPtr;
+ CurIntVal = strtoll(TokStart, 0, 10);
+ return tgtok::IntVal;
+}
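+
+// For example (illustrative): "42", "-42", "0x2A" and "0b101010" all lex to
+// tgtok::IntVal with CurIntVal == 42, -42, 42 and 42 respectively, while a
+// bare '-' or '+' not followed by a digit lexes to tgtok::minus or
+// tgtok::plus.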
+
+/// LexBracket - We just read '['. If this is a code block, return it,
+/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
+tgtok::TokKind TGLexer::LexBracket() {
+ if (CurPtr[0] != '{')
+ return tgtok::l_square;
+ ++CurPtr;
+ const char *CodeStart = CurPtr;
+ while (1) {
+ int Char = getNextChar();
+ if (Char == EOF) break;
+
+ if (Char != '}') continue;
+
+ Char = getNextChar();
+ if (Char == EOF) break;
+ if (Char == ']') {
+ CurStrVal.assign(CodeStart, CurPtr-2);
+ return tgtok::CodeFragment;
+ }
+ }
+
+ return ReturnError(CodeStart-2, "Unterminated Code Block");
+}
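+
+// For example (illustrative): "[1, 2]" yields tgtok::l_square (the list
+// contents are parsed later), while "[{ return X; }]" yields a single
+// tgtok::CodeFragment whose CurStrVal is " return X; ".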
+
+/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
+tgtok::TokKind TGLexer::LexExclaim() {
+ if (!isalpha(*CurPtr))
+ return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
+
+ const char *Start = CurPtr++;
+ while (isalpha(*CurPtr))
+ ++CurPtr;
+
+ // Check to see which operator this is.
+ tgtok::TokKind Kind =
+ StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
+ .Case("eq", tgtok::XEq)
+ .Case("if", tgtok::XIf)
+ .Case("head", tgtok::XHead)
+ .Case("tail", tgtok::XTail)
+ .Case("con", tgtok::XConcat)
+ .Case("shl", tgtok::XSHL)
+ .Case("sra", tgtok::XSRA)
+ .Case("srl", tgtok::XSRL)
+ .Case("cast", tgtok::XCast)
+ .Case("empty", tgtok::XEmpty)
+ .Case("subst", tgtok::XSubst)
+ .Case("foreach", tgtok::XForEach)
+ .Case("strconcat", tgtok::XStrConcat)
+ .Default(tgtok::Error);
+
+ return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
+}
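+
+// For example (illustrative): "!strconcat" lexes to tgtok::XStrConcat and
+// "!eq" to tgtok::XEq, while an unrecognized name such as "!bogus" reports
+// "Unknown operator".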
+
diff --git a/lib/TableGen/TGLexer.h b/lib/TableGen/TGLexer.h
new file mode 100644
index 0000000..84d328b
--- /dev/null
+++ b/lib/TableGen/TGLexer.h
@@ -0,0 +1,125 @@
+//===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents the Lexer for tablegen files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TGLEXER_H
+#define TGLEXER_H
+
+#include "llvm/Support/DataTypes.h"
+#include <string>
+#include <vector>
+#include <cassert>
+
+namespace llvm {
+class MemoryBuffer;
+class SourceMgr;
+class SMLoc;
+class Twine;
+
+namespace tgtok {
+ enum TokKind {
+ // Markers
+ Eof, Error,
+
+ // Tokens with no info.
+ minus, plus, // - +
+ l_square, r_square, // [ ]
+ l_brace, r_brace, // { }
+ l_paren, r_paren, // ( )
+ less, greater, // < >
+ colon, semi, // : ;
+ comma, period, // , .
+ equal, question, // = ?
+
+ // Keywords.
+ Bit, Bits, Class, Code, Dag, Def, Defm, Field, In, Int, Let, List,
+ MultiClass, String,
+
+ // !keywords.
+ XConcat, XSRA, XSRL, XSHL, XStrConcat, XCast, XSubst,
+ XForEach, XHead, XTail, XEmpty, XIf, XEq,
+
+ // Integer value.
+ IntVal,
+
+ // String valued tokens.
+ Id, StrVal, VarName, CodeFragment
+ };
+}
+
+/// TGLexer - TableGen Lexer class.
+class TGLexer {
+ SourceMgr &SrcMgr;
+
+ const char *CurPtr;
+ const MemoryBuffer *CurBuf;
+
+ // Information about the current token.
+ const char *TokStart;
+ tgtok::TokKind CurCode;
+ std::string CurStrVal; // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT
+ int64_t CurIntVal; // This is valid for INTVAL.
+
+ /// CurBuffer - This is the current buffer index we're lexing from as managed
+ /// by the SourceMgr object.
+ int CurBuffer;
+ /// Dependencies - This is the list of all included files.
+ std::vector<std::string> Dependencies;
+
+public:
+ TGLexer(SourceMgr &SrcMgr);
+ ~TGLexer() {}
+
+ tgtok::TokKind Lex() {
+ return CurCode = LexToken();
+ }
+
+ const std::vector<std::string> &getDependencies() const {
+ return Dependencies;
+ }
+
+ tgtok::TokKind getCode() const { return CurCode; }
+
+ const std::string &getCurStrVal() const {
+ assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal ||
+ CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) &&
+ "This token doesn't have a string value");
+ return CurStrVal;
+ }
+ int64_t getCurIntVal() const {
+ assert(CurCode == tgtok::IntVal && "This token isn't an integer");
+ return CurIntVal;
+ }
+
+ SMLoc getLoc() const;
+
+private:
+ /// LexToken - Read the next token and return its code.
+ tgtok::TokKind LexToken();
+
+ tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
+
+ int getNextChar();
+ void SkipBCPLComment();
+ bool SkipCComment();
+ tgtok::TokKind LexIdentifier();
+ bool LexInclude();
+ tgtok::TokKind LexString();
+ tgtok::TokKind LexVarName();
+ tgtok::TokKind LexNumber();
+ tgtok::TokKind LexBracket();
+ tgtok::TokKind LexExclaim();
+};
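+
+// Illustrative driver (a sketch, not part of this header), assuming a
+// SourceMgr `SM` that already owns the main buffer:
+//
+//   TGLexer Lexer(SM);
+//   while (Lexer.Lex() != tgtok::Eof) {
+//     if (Lexer.getCode() == tgtok::Id)
+//       (void)Lexer.getCurStrVal(); // identifier text
+//   }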
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
new file mode 100644
index 0000000..e7f00ba
--- /dev/null
+++ b/lib/TableGen/TGParser.cpp
@@ -0,0 +1,2194 @@
+//===- TGParser.cpp - Parser for TableGen Files ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implement the Parser for TableGen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "TGParser.h"
+#include "llvm/TableGen/Record.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <sstream>
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Support Code for the Semantic Actions.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+struct SubClassReference {
+ SMLoc RefLoc;
+ Record *Rec;
+ std::vector<Init*> TemplateArgs;
+ SubClassReference() : Rec(0) {}
+
+ bool isInvalid() const { return Rec == 0; }
+};
+
+struct SubMultiClassReference {
+ SMLoc RefLoc;
+ MultiClass *MC;
+ std::vector<Init*> TemplateArgs;
+ SubMultiClassReference() : MC(0) {}
+
+ bool isInvalid() const { return MC == 0; }
+ void dump() const;
+};
+
+void SubMultiClassReference::dump() const {
+ errs() << "Multiclass:\n";
+
+ MC->dump();
+
+ errs() << "Template args:\n";
+ for (std::vector<Init *>::const_iterator i = TemplateArgs.begin(),
+ iend = TemplateArgs.end();
+ i != iend;
+ ++i) {
+ (*i)->dump();
+ }
+}
+
+} // end namespace llvm
+
+bool TGParser::AddValue(Record *CurRec, SMLoc Loc, const RecordVal &RV) {
+ if (CurRec == 0)
+ CurRec = &CurMultiClass->Rec;
+
+ if (RecordVal *ERV = CurRec->getValue(RV.getName())) {
+ // The value already exists in the class, treat this as a set.
+ if (ERV->setValue(RV.getValue()))
+ return Error(Loc, "New definition of '" + RV.getName() + "' of type '" +
+ RV.getType()->getAsString() + "' is incompatible with " +
+ "previous definition of type '" +
+ ERV->getType()->getAsString() + "'");
+ } else {
+ CurRec->addValue(RV);
+ }
+ return false;
+}
+
+/// SetValue - Set the value named ValName in CurRec, optionally restricted
+/// to the bits in BitList. Return true on error, false on success.
+bool TGParser::SetValue(Record *CurRec, SMLoc Loc, const std::string &ValName,
+ const std::vector<unsigned> &BitList, Init *V) {
+ if (!V) return false;
+
+ if (CurRec == 0) CurRec = &CurMultiClass->Rec;
+
+ RecordVal *RV = CurRec->getValue(ValName);
+ if (RV == 0)
+ return Error(Loc, "Value '" + ValName + "' unknown!");
+
+ // Do not allow assignments like 'X = X'. This will just cause infinite loops
+ // in the resolution machinery.
+ if (BitList.empty())
+ if (VarInit *VI = dynamic_cast<VarInit*>(V))
+ if (VI->getName() == ValName)
+ return false;
+
+ // If we are assigning to a subset of the bits in the value... then we must be
+ // assigning to a field of BitsRecTy, which must have a BitsInit
+ // initializer.
+ //
+ if (!BitList.empty()) {
+ BitsInit *CurVal = dynamic_cast<BitsInit*>(RV->getValue());
+ if (CurVal == 0)
+ return Error(Loc, "Value '" + ValName + "' is not a bits type");
+
+ // Convert the incoming value to a bits type of the appropriate size...
+ Init *BI = V->convertInitializerTo(BitsRecTy::get(BitList.size()));
+ if (BI == 0) {
+ V->convertInitializerTo(BitsRecTy::get(BitList.size()));
+ return Error(Loc, "Initializer is not compatible with bit range");
+ }
+
+ // We should have a BitsInit type now.
+ BitsInit *BInit = dynamic_cast<BitsInit*>(BI);
+ assert(BInit != 0);
+
+ SmallVector<Init *, 16> NewBits(CurVal->getNumBits());
+
+ // Loop over bits, assigning values as appropriate.
+ for (unsigned i = 0, e = BitList.size(); i != e; ++i) {
+ unsigned Bit = BitList[i];
+ if (NewBits[Bit])
+ return Error(Loc, "Cannot set bit #" + utostr(Bit) + " of value '" +
+ ValName + "' more than once");
+ NewBits[Bit] = BInit->getBit(i);
+ }
+
+ for (unsigned i = 0, e = CurVal->getNumBits(); i != e; ++i)
+ if (NewBits[i] == 0)
+ NewBits[i] = CurVal->getBit(i);
+
+ V = BitsInit::get(NewBits);
+ }
+
+ if (RV->setValue(V))
+ return Error(Loc, "Value '" + ValName + "' of type '" +
+ RV->getType()->getAsString() +
+ "' is incompatible with initializer '" + V->getAsString() +"'");
+ return false;
+}
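+
+// Illustrative TableGen input (a sketch) for the bit-subset path above:
+//
+//   class C { bits<8> Val; }
+//   def D : C { let Val{7-4} = 0xA; }
+//
+// Only bits 7..4 of Val are overwritten; the remaining bits keep whatever
+// value they already had.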
+
+/// AddSubClass - Add SubClass as a subclass to CurRec, resolving its template
+/// args as SubClass's template arguments.
+bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) {
+ Record *SC = SubClass.Rec;
+ // Add all of the values in the subclass into the current class.
+ const std::vector<RecordVal> &Vals = SC->getValues();
+ for (unsigned i = 0, e = Vals.size(); i != e; ++i)
+ if (AddValue(CurRec, SubClass.RefLoc, Vals[i]))
+ return true;
+
+ const std::vector<std::string> &TArgs = SC->getTemplateArgs();
+
+ // Ensure that an appropriate number of template arguments are specified.
+ if (TArgs.size() < SubClass.TemplateArgs.size())
+ return Error(SubClass.RefLoc, "More template args specified than expected");
+
+ // Loop over all of the template arguments, setting them to the specified
+ // value or leaving them as the default if necessary.
+ for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+ if (i < SubClass.TemplateArgs.size()) {
+ // If a value is specified for this template arg, set it now.
+ if (SetValue(CurRec, SubClass.RefLoc, TArgs[i], std::vector<unsigned>(),
+ SubClass.TemplateArgs[i]))
+ return true;
+
+ // Resolve it next.
+ CurRec->resolveReferencesTo(CurRec->getValue(TArgs[i]));
+
+ // Now remove it.
+ CurRec->removeValue(TArgs[i]);
+
+ } else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) {
+ return Error(SubClass.RefLoc,"Value not specified for template argument #"
+ + utostr(i) + " (" + TArgs[i] + ") of subclass '" +
+ SC->getName() + "'!");
+ }
+ }
+
+ // Since everything went well, we can now set the "superclass" list for the
+ // current record.
+ const std::vector<Record*> &SCs = SC->getSuperClasses();
+ for (unsigned i = 0, e = SCs.size(); i != e; ++i) {
+ if (CurRec->isSubClassOf(SCs[i]))
+ return Error(SubClass.RefLoc,
+ "Already subclass of '" + SCs[i]->getName() + "'!\n");
+ CurRec->addSuperClass(SCs[i]);
+ }
+
+ if (CurRec->isSubClassOf(SC))
+ return Error(SubClass.RefLoc,
+ "Already subclass of '" + SC->getName() + "'!\n");
+ CurRec->addSuperClass(SC);
+ return false;
+}
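+
+// Illustrative TableGen input (a sketch) showing what AddSubClass resolves:
+//
+//   class Reg<int n> { int Num = n; }
+//   def R0 : Reg<0>;
+//
+// The template arg `n` is set to 0, references to it are resolved, and the
+// arg itself is removed, leaving R0 with the concrete field Num = 0.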
+
+/// AddSubMultiClass - Add SubMultiClass as a subclass to
+/// CurMC, resolving its template args as SubMultiClass's
+/// template arguments.
+bool TGParser::AddSubMultiClass(MultiClass *CurMC,
+ SubMultiClassReference &SubMultiClass) {
+ MultiClass *SMC = SubMultiClass.MC;
+ Record *CurRec = &CurMC->Rec;
+
+ const std::vector<RecordVal> &MCVals = CurRec->getValues();
+
+ // Add all of the values in the subclass into the current class.
+ const std::vector<RecordVal> &SMCVals = SMC->Rec.getValues();
+ for (unsigned i = 0, e = SMCVals.size(); i != e; ++i)
+ if (AddValue(CurRec, SubMultiClass.RefLoc, SMCVals[i]))
+ return true;
+
+ int newDefStart = CurMC->DefPrototypes.size();
+
+ // Add all of the defs in the subclass into the current multiclass.
+ for (MultiClass::RecordVector::const_iterator i = SMC->DefPrototypes.begin(),
+ iend = SMC->DefPrototypes.end();
+ i != iend;
+ ++i) {
+ // Clone the def and add it to the current multiclass
+ Record *NewDef = new Record(**i);
+
+ // Add all of the values in the superclass into the current def.
+ for (unsigned i = 0, e = MCVals.size(); i != e; ++i)
+ if (AddValue(NewDef, SubMultiClass.RefLoc, MCVals[i]))
+ return true;
+
+ CurMC->DefPrototypes.push_back(NewDef);
+ }
+
+ const std::vector<std::string> &SMCTArgs = SMC->Rec.getTemplateArgs();
+
+ // Ensure that an appropriate number of template arguments are
+ // specified.
+ if (SMCTArgs.size() < SubMultiClass.TemplateArgs.size())
+ return Error(SubMultiClass.RefLoc,
+ "More template args specified than expected");
+
+ // Loop over all of the template arguments, setting them to the specified
+ // value or leaving them as the default if necessary.
+ for (unsigned i = 0, e = SMCTArgs.size(); i != e; ++i) {
+ if (i < SubMultiClass.TemplateArgs.size()) {
+ // If a value is specified for this template arg, set it in the
+ // superclass now.
+ if (SetValue(CurRec, SubMultiClass.RefLoc, SMCTArgs[i],
+ std::vector<unsigned>(),
+ SubMultiClass.TemplateArgs[i]))
+ return true;
+
+ // Resolve it next.
+ CurRec->resolveReferencesTo(CurRec->getValue(SMCTArgs[i]));
+
+ // Now remove it.
+ CurRec->removeValue(SMCTArgs[i]);
+
+ // If a value is specified for this template arg, set it in the
+ // new defs now.
+ for (MultiClass::RecordVector::iterator j =
+ CurMC->DefPrototypes.begin() + newDefStart,
+ jend = CurMC->DefPrototypes.end();
+ j != jend;
+ ++j) {
+ Record *Def = *j;
+
+ if (SetValue(Def, SubMultiClass.RefLoc, SMCTArgs[i],
+ std::vector<unsigned>(),
+ SubMultiClass.TemplateArgs[i]))
+ return true;
+
+ // Resolve it next.
+ Def->resolveReferencesTo(Def->getValue(SMCTArgs[i]));
+
+ // Now remove it
+ Def->removeValue(SMCTArgs[i]);
+ }
+ } else if (!CurRec->getValue(SMCTArgs[i])->getValue()->isComplete()) {
+ return Error(SubMultiClass.RefLoc,
+ "Value not specified for template argument #"
+ + utostr(i) + " (" + SMCTArgs[i] + ") of subclass '" +
+ SMC->Rec.getName() + "'!");
+ }
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Parser Code
+//===----------------------------------------------------------------------===//
+
+/// isObjectStart - Return true if this is a valid first token for an Object.
+static bool isObjectStart(tgtok::TokKind K) {
+ return K == tgtok::Class || K == tgtok::Def ||
+ K == tgtok::Defm || K == tgtok::Let || K == tgtok::MultiClass;
+}
+
+static std::string GetNewAnonymousName() {
+ static unsigned AnonCounter = 0;
+ return "anonymous."+utostr(AnonCounter++);
+}
+
+/// ParseObjectName - If an object name is specified, return it. Otherwise,
+/// return an anonymous name.
+/// ObjectName ::= ID
+/// ObjectName ::= /*empty*/
+///
+std::string TGParser::ParseObjectName() {
+ if (Lex.getCode() != tgtok::Id)
+ return GetNewAnonymousName();
+
+ std::string Ret = Lex.getCurStrVal();
+ Lex.Lex();
+ return Ret;
+}
+
+
+/// ParseClassID - Parse and resolve a reference to a class name. This returns
+/// null on error.
+///
+/// ClassID ::= ID
+///
+Record *TGParser::ParseClassID() {
+ if (Lex.getCode() != tgtok::Id) {
+ TokError("expected name for ClassID");
+ return 0;
+ }
+
+ Record *Result = Records.getClass(Lex.getCurStrVal());
+ if (Result == 0)
+ TokError("Couldn't find class '" + Lex.getCurStrVal() + "'");
+
+ Lex.Lex();
+ return Result;
+}
+
+/// ParseMultiClassID - Parse and resolve a reference to a multiclass name.
+/// This returns null on error.
+///
+/// MultiClassID ::= ID
+///
+MultiClass *TGParser::ParseMultiClassID() {
+ if (Lex.getCode() != tgtok::Id) {
+ TokError("expected name for ClassID");
+ return 0;
+ }
+
+ MultiClass *Result = MultiClasses[Lex.getCurStrVal()];
+ if (Result == 0)
+ TokError("Couldn't find class '" + Lex.getCurStrVal() + "'");
+
+ Lex.Lex();
+ return Result;
+}
+
+Record *TGParser::ParseDefmID() {
+ if (Lex.getCode() != tgtok::Id) {
+ TokError("expected multiclass name");
+ return 0;
+ }
+
+ MultiClass *MC = MultiClasses[Lex.getCurStrVal()];
+ if (MC == 0) {
+ TokError("Couldn't find multiclass '" + Lex.getCurStrVal() + "'");
+ return 0;
+ }
+
+ Lex.Lex();
+ return &MC->Rec;
+}
+
+
+/// ParseSubClassReference - Parse a reference to a subclass or to a templated
+/// subclass. This returns a SubClassRefTy with a null Record* on error.
+///
+/// SubClassRef ::= ClassID
+/// SubClassRef ::= ClassID '<' ValueList '>'
+///
+SubClassReference TGParser::
+ParseSubClassReference(Record *CurRec, bool isDefm) {
+ SubClassReference Result;
+ Result.RefLoc = Lex.getLoc();
+
+ if (isDefm)
+ Result.Rec = ParseDefmID();
+ else
+ Result.Rec = ParseClassID();
+ if (Result.Rec == 0) return Result;
+
+ // If there is no template arg list, we're done.
+ if (Lex.getCode() != tgtok::less)
+ return Result;
+ Lex.Lex(); // Eat the '<'
+
+ if (Lex.getCode() == tgtok::greater) {
+ TokError("subclass reference requires a non-empty list of template values");
+ Result.Rec = 0;
+ return Result;
+ }
+
+ Result.TemplateArgs = ParseValueList(CurRec, Result.Rec);
+ if (Result.TemplateArgs.empty()) {
+ Result.Rec = 0; // Error parsing value list.
+ return Result;
+ }
+
+ if (Lex.getCode() != tgtok::greater) {
+ TokError("expected '>' in template value list");
+ Result.Rec = 0;
+ return Result;
+ }
+ Lex.Lex();
+
+ return Result;
+}
+
+/// ParseSubMultiClassReference - Parse a reference to a subclass or to a
+/// templated submulticlass. This returns a SubMultiClassRefTy with a null
+/// Record* on error.
+///
+/// SubMultiClassRef ::= MultiClassID
+/// SubMultiClassRef ::= MultiClassID '<' ValueList '>'
+///
+SubMultiClassReference TGParser::
+ParseSubMultiClassReference(MultiClass *CurMC) {
+ SubMultiClassReference Result;
+ Result.RefLoc = Lex.getLoc();
+
+ Result.MC = ParseMultiClassID();
+ if (Result.MC == 0) return Result;
+
+ // If there is no template arg list, we're done.
+ if (Lex.getCode() != tgtok::less)
+ return Result;
+ Lex.Lex(); // Eat the '<'
+
+ if (Lex.getCode() == tgtok::greater) {
+ TokError("subclass reference requires a non-empty list of template values");
+ Result.MC = 0;
+ return Result;
+ }
+
+ Result.TemplateArgs = ParseValueList(&CurMC->Rec, &Result.MC->Rec);
+ if (Result.TemplateArgs.empty()) {
+ Result.MC = 0; // Error parsing value list.
+ return Result;
+ }
+
+ if (Lex.getCode() != tgtok::greater) {
+ TokError("expected '>' in template value list");
+ Result.MC = 0;
+ return Result;
+ }
+ Lex.Lex();
+
+ return Result;
+}
+
+/// ParseRangePiece - Parse a bit/value range.
+/// RangePiece ::= INTVAL
+/// RangePiece ::= INTVAL '-' INTVAL
+/// RangePiece ::= INTVAL INTVAL
+bool TGParser::ParseRangePiece(std::vector<unsigned> &Ranges) {
+ if (Lex.getCode() != tgtok::IntVal) {
+ TokError("expected integer or bitrange");
+ return true;
+ }
+ int64_t Start = Lex.getCurIntVal();
+ int64_t End;
+
+ if (Start < 0)
+ return TokError("invalid range, cannot be negative");
+
+ switch (Lex.Lex()) { // eat the first INTVAL.
+ default:
+ Ranges.push_back(Start);
+ return false;
+ case tgtok::minus:
+ if (Lex.Lex() != tgtok::IntVal) {
+ TokError("expected integer value as end of range");
+ return true;
+ }
+ End = Lex.getCurIntVal();
+ break;
+ case tgtok::IntVal:
+ End = -Lex.getCurIntVal();
+ break;
+ }
+ if (End < 0)
+ return TokError("invalid range, cannot be negative");
+ Lex.Lex();
+
+ // Add to the range.
+ if (Start < End) {
+ for (; Start <= End; ++Start)
+ Ranges.push_back(Start);
+ } else {
+ for (; Start >= End; --Start)
+ Ranges.push_back(Start);
+ }
+ return false;
+}
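+
+// For example (illustrative): the piece "4-7" appends 4, 5, 6, 7 to Ranges,
+// "7-4" appends 7, 6, 5, 4, and a lone "3" appends just 3.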
+
+/// ParseRangeList - Parse a list of scalars and ranges into scalar values.
+///
+/// RangeList ::= RangePiece (',' RangePiece)*
+///
+std::vector<unsigned> TGParser::ParseRangeList() {
+ std::vector<unsigned> Result;
+
+ // Parse the first piece.
+ if (ParseRangePiece(Result))
+ return std::vector<unsigned>();
+ while (Lex.getCode() == tgtok::comma) {
+ Lex.Lex(); // Eat the comma.
+
+ // Parse the next range piece.
+ if (ParseRangePiece(Result))
+ return std::vector<unsigned>();
+ }
+ return Result;
+}
+
+/// ParseOptionalRangeList - Parse either a range list in <>'s or nothing.
+/// OptionalRangeList ::= '<' RangeList '>'
+/// OptionalRangeList ::= /*empty*/
+bool TGParser::ParseOptionalRangeList(std::vector<unsigned> &Ranges) {
+ if (Lex.getCode() != tgtok::less)
+ return false;
+
+ SMLoc StartLoc = Lex.getLoc();
+ Lex.Lex(); // eat the '<'
+
+ // Parse the range list.
+ Ranges = ParseRangeList();
+ if (Ranges.empty()) return true;
+
+ if (Lex.getCode() != tgtok::greater) {
+ TokError("expected '>' at end of range list");
+ return Error(StartLoc, "to match this '<'");
+ }
+ Lex.Lex(); // eat the '>'.
+ return false;
+}
+
+/// ParseOptionalBitList - Parse either a bit list in {}'s or nothing.
+/// OptionalBitList ::= '{' RangeList '}'
+/// OptionalBitList ::= /*empty*/
+bool TGParser::ParseOptionalBitList(std::vector<unsigned> &Ranges) {
+ if (Lex.getCode() != tgtok::l_brace)
+ return false;
+
+ SMLoc StartLoc = Lex.getLoc();
+ Lex.Lex(); // eat the '{'
+
+ // Parse the range list.
+ Ranges = ParseRangeList();
+ if (Ranges.empty()) return true;
+
+ if (Lex.getCode() != tgtok::r_brace) {
+ TokError("expected '}' at end of bit list");
+ return Error(StartLoc, "to match this '{'");
+ }
+ Lex.Lex(); // eat the '}'.
+ return false;
+}
+
+
+/// ParseType - Parse and return a tblgen type. This returns null on error.
+///
+/// Type ::= STRING // string type
+/// Type ::= BIT // bit type
+/// Type ::= BITS '<' INTVAL '>' // bits<x> type
+/// Type ::= INT // int type
+/// Type ::= LIST '<' Type '>' // list<x> type
+/// Type ::= CODE // code type
+/// Type ::= DAG // dag type
+/// Type ::= ClassID // Record Type
+///
+RecTy *TGParser::ParseType() {
+ switch (Lex.getCode()) {
+ default: TokError("Unknown token when expecting a type"); return 0;
+ case tgtok::String: Lex.Lex(); return StringRecTy::get();
+ case tgtok::Bit: Lex.Lex(); return BitRecTy::get();
+ case tgtok::Int: Lex.Lex(); return IntRecTy::get();
+ case tgtok::Code: Lex.Lex(); return CodeRecTy::get();
+ case tgtok::Dag: Lex.Lex(); return DagRecTy::get();
+ case tgtok::Id:
+ if (Record *R = ParseClassID()) return RecordRecTy::get(R);
+ return 0;
+ case tgtok::Bits: {
+ if (Lex.Lex() != tgtok::less) { // Eat 'bits'
+ TokError("expected '<' after bits type");
+ return 0;
+ }
+ if (Lex.Lex() != tgtok::IntVal) { // Eat '<'
+ TokError("expected integer in bits<n> type");
+ return 0;
+ }
+ uint64_t Val = Lex.getCurIntVal();
+ if (Lex.Lex() != tgtok::greater) { // Eat count.
+ TokError("expected '>' at end of bits<n> type");
+ return 0;
+ }
+ Lex.Lex(); // Eat '>'
+ return BitsRecTy::get(Val);
+ }
+ case tgtok::List: {
+ if (Lex.Lex() != tgtok::less) { // Eat 'list'
+ TokError("expected '<' after list type");
+ return 0;
+ }
+ Lex.Lex(); // Eat '<'
+ RecTy *SubType = ParseType();
+ if (SubType == 0) return 0;
+
+ if (Lex.getCode() != tgtok::greater) {
+ TokError("expected '>' at end of list<ty> type");
+ return 0;
+ }
+ Lex.Lex(); // Eat '>'
+ return ListRecTy::get(SubType);
+ }
+ }
+}
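+
+// For example (illustrative): "string", "bits<5>", "list<int>" and a class
+// name such as "Reg" (for a previously defined class) all parse through the
+// cases above.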
+
+/// ParseIDValue - Parse an ID as a value and decode what it means.
+///
+/// IDValue ::= ID [def local value]
+/// IDValue ::= ID [def template arg]
+/// IDValue ::= ID [multiclass local value]
+/// IDValue ::= ID [multiclass template argument]
+/// IDValue ::= ID [def name]
+///
+Init *TGParser::ParseIDValue(Record *CurRec) {
+ assert(Lex.getCode() == tgtok::Id && "Expected ID in ParseIDValue");
+ std::string Name = Lex.getCurStrVal();
+ SMLoc Loc = Lex.getLoc();
+ Lex.Lex();
+ return ParseIDValue(CurRec, Name, Loc);
+}
+
+/// ParseIDValue - This is just like ParseIDValue above, but it assumes the ID
+/// has already been read.
+Init *TGParser::ParseIDValue(Record *CurRec,
+ const std::string &Name, SMLoc NameLoc) {
+ if (CurRec) {
+ if (const RecordVal *RV = CurRec->getValue(Name))
+ return VarInit::get(Name, RV->getType());
+
+ std::string TemplateArgName = CurRec->getName()+":"+Name;
+ if (CurMultiClass)
+ TemplateArgName = CurMultiClass->Rec.getName()+"::"+TemplateArgName;
+
+ if (CurRec->isTemplateArg(TemplateArgName)) {
+ const RecordVal *RV = CurRec->getValue(TemplateArgName);
+ assert(RV && "Template arg doesn't exist??");
+ return VarInit::get(TemplateArgName, RV->getType());
+ }
+ }
+
+ if (CurMultiClass) {
+ std::string MCName = CurMultiClass->Rec.getName()+"::"+Name;
+ if (CurMultiClass->Rec.isTemplateArg(MCName)) {
+ const RecordVal *RV = CurMultiClass->Rec.getValue(MCName);
+ assert(RV && "Template arg doesn't exist??");
+ return VarInit::get(MCName, RV->getType());
+ }
+ }
+
+ if (Record *D = Records.getDef(Name))
+ return DefInit::get(D);
+
+ Error(NameLoc, "Variable not defined: '" + Name + "'");
+ return 0;
+}
+
+/// ParseOperation - Parse an operator. This returns null on error.
+///
+/// Operation ::= XOperator ['<' Type '>'] '(' Args ')'
+///
+Init *TGParser::ParseOperation(Record *CurRec) {
+ switch (Lex.getCode()) {
+ default:
+ TokError("unknown operation");
+ return 0;
+ break;
+ case tgtok::XHead:
+ case tgtok::XTail:
+ case tgtok::XEmpty:
+ case tgtok::XCast: { // Value ::= !unop '(' Value ')'
+ UnOpInit::UnaryOp Code;
+ RecTy *Type = 0;
+
+ switch (Lex.getCode()) {
+ default: assert(0 && "Unhandled code!");
+ case tgtok::XCast:
+ Lex.Lex(); // eat the operation
+ Code = UnOpInit::CAST;
+
+ Type = ParseOperatorType();
+
+ if (Type == 0) {
+ TokError("did not get type for unary operator");
+ return 0;
+ }
+
+ break;
+ case tgtok::XHead:
+ Lex.Lex(); // eat the operation
+ Code = UnOpInit::HEAD;
+ break;
+ case tgtok::XTail:
+ Lex.Lex(); // eat the operation
+ Code = UnOpInit::TAIL;
+ break;
+ case tgtok::XEmpty:
+ Lex.Lex(); // eat the operation
+ Code = UnOpInit::EMPTY;
+ Type = IntRecTy::get();
+ break;
+ }
+ if (Lex.getCode() != tgtok::l_paren) {
+ TokError("expected '(' after unary operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the '('
+
+ Init *LHS = ParseValue(CurRec);
+ if (LHS == 0) return 0;
+
+ if (Code == UnOpInit::HEAD
+ || Code == UnOpInit::TAIL
+ || Code == UnOpInit::EMPTY) {
+ ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
+ StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+ TypedInit *LHSt = dynamic_cast<TypedInit*>(LHS);
+ if (LHSl == 0 && LHSs == 0 && LHSt == 0) {
+ TokError("expected list or string type argument in unary operator");
+ return 0;
+ }
+ if (LHSt) {
+ ListRecTy *LType = dynamic_cast<ListRecTy*>(LHSt->getType());
+ StringRecTy *SType = dynamic_cast<StringRecTy*>(LHSt->getType());
+ if (LType == 0 && SType == 0) {
+ TokError("expected list or string type argumnet in unary operator");
+ return 0;
+ }
+ }
+
+ if (Code == UnOpInit::HEAD
+ || Code == UnOpInit::TAIL) {
+ if (LHSl == 0 && LHSt == 0) {
+ TokError("expected list type argumnet in unary operator");
+ return 0;
+ }
+
+ if (LHSl && LHSl->getSize() == 0) {
+ TokError("empty list argument in unary operator");
+ return 0;
+ }
+ if (LHSl) {
+ Init *Item = LHSl->getElement(0);
+ TypedInit *Itemt = dynamic_cast<TypedInit*>(Item);
+ if (Itemt == 0) {
+ TokError("untyped list element in unary operator");
+ return 0;
+ }
+ if (Code == UnOpInit::HEAD) {
+ Type = Itemt->getType();
+ } else {
+ Type = ListRecTy::get(Itemt->getType());
+ }
+ } else {
+ assert(LHSt && "expected list type argument in unary operator");
+ ListRecTy *LType = dynamic_cast<ListRecTy*>(LHSt->getType());
+ if (LType == 0) {
+ TokError("expected list type argumnet in unary operator");
+ return 0;
+ }
+ if (Code == UnOpInit::HEAD) {
+ Type = LType->getElementType();
+ } else {
+ Type = LType;
+ }
+ }
+ }
+ }
+
+ if (Lex.getCode() != tgtok::r_paren) {
+ TokError("expected ')' in unary operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the ')'
+ return (UnOpInit::get(Code, LHS, Type))->Fold(CurRec, CurMultiClass);
+ }
+
+ case tgtok::XConcat:
+ case tgtok::XSRA:
+ case tgtok::XSRL:
+ case tgtok::XSHL:
+ case tgtok::XEq:
+ case tgtok::XStrConcat: { // Value ::= !binop '(' Value ',' Value ')'
+ tgtok::TokKind OpTok = Lex.getCode();
+ SMLoc OpLoc = Lex.getLoc();
+ Lex.Lex(); // eat the operation
+
+ BinOpInit::BinaryOp Code;
+ RecTy *Type = 0;
+
+ switch (OpTok) {
+ default: assert(0 && "Unhandled code!");
+ case tgtok::XConcat: Code = BinOpInit::CONCAT;Type = DagRecTy::get(); break;
+ case tgtok::XSRA: Code = BinOpInit::SRA; Type = IntRecTy::get(); break;
+ case tgtok::XSRL: Code = BinOpInit::SRL; Type = IntRecTy::get(); break;
+ case tgtok::XSHL: Code = BinOpInit::SHL; Type = IntRecTy::get(); break;
+ case tgtok::XEq: Code = BinOpInit::EQ; Type = BitRecTy::get(); break;
+ case tgtok::XStrConcat:
+ Code = BinOpInit::STRCONCAT;
+ Type = StringRecTy::get();
+ break;
+ }
+
+ if (Lex.getCode() != tgtok::l_paren) {
+ TokError("expected '(' after binary operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the '('
+
+ SmallVector<Init*, 2> InitList;
+
+ InitList.push_back(ParseValue(CurRec));
+ if (InitList.back() == 0) return 0;
+
+ while (Lex.getCode() == tgtok::comma) {
+ Lex.Lex(); // eat the ','
+
+ InitList.push_back(ParseValue(CurRec));
+ if (InitList.back() == 0) return 0;
+ }
+
+ if (Lex.getCode() != tgtok::r_paren) {
+ TokError("expected ')' in operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the ')'
+
+ // We allow multiple operands to associative operators like !strconcat as
+ // shorthand for nesting them.
+ if (Code == BinOpInit::STRCONCAT) {
+ while (InitList.size() > 2) {
+ Init *RHS = InitList.pop_back_val();
+ RHS = (BinOpInit::get(Code, InitList.back(), RHS, Type))
+ ->Fold(CurRec, CurMultiClass);
+ InitList.back() = RHS;
+ }
+ }
+
+ if (InitList.size() == 2)
+ return (BinOpInit::get(Code, InitList[0], InitList[1], Type))
+ ->Fold(CurRec, CurMultiClass);
+
+ Error(OpLoc, "expected two operands to operator");
+ return 0;
+ }
+
+ case tgtok::XIf:
+ case tgtok::XForEach:
+ case tgtok::XSubst: { // Value ::= !ternop '(' Value ',' Value ',' Value ')'
+ TernOpInit::TernaryOp Code;
+ RecTy *Type = 0;
+
+ tgtok::TokKind LexCode = Lex.getCode();
+ Lex.Lex(); // eat the operation
+ switch (LexCode) {
+ default: assert(0 && "Unhandled code!");
+ case tgtok::XIf:
+ Code = TernOpInit::IF;
+ break;
+ case tgtok::XForEach:
+ Code = TernOpInit::FOREACH;
+ break;
+ case tgtok::XSubst:
+ Code = TernOpInit::SUBST;
+ break;
+ }
+ if (Lex.getCode() != tgtok::l_paren) {
+ TokError("expected '(' after ternary operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the '('
+
+ Init *LHS = ParseValue(CurRec);
+ if (LHS == 0) return 0;
+
+ if (Lex.getCode() != tgtok::comma) {
+ TokError("expected ',' in ternary operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the ','
+
+ Init *MHS = ParseValue(CurRec);
+ if (MHS == 0) return 0;
+
+ if (Lex.getCode() != tgtok::comma) {
+ TokError("expected ',' in ternary operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the ','
+
+ Init *RHS = ParseValue(CurRec);
+ if (RHS == 0) return 0;
+
+ if (Lex.getCode() != tgtok::r_paren) {
+ TokError("expected ')' in binary operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the ')'
+
+ switch (LexCode) {
+ default: assert(0 && "Unhandled code!");
+ case tgtok::XIf: {
+ // FIXME: The `!if' operator doesn't handle non-TypedInit well at
+ // all. This can be made much more robust.
+ TypedInit *MHSt = dynamic_cast<TypedInit*>(MHS);
+ TypedInit *RHSt = dynamic_cast<TypedInit*>(RHS);
+
+ RecTy *MHSTy = 0;
+ RecTy *RHSTy = 0;
+
+ if (MHSt == 0 && RHSt == 0) {
+ BitsInit *MHSbits = dynamic_cast<BitsInit*>(MHS);
+ BitsInit *RHSbits = dynamic_cast<BitsInit*>(RHS);
+
+ if (MHSbits && RHSbits &&
+ MHSbits->getNumBits() == RHSbits->getNumBits()) {
+ Type = BitRecTy::get();
+ break;
+ } else {
+ BitInit *MHSbit = dynamic_cast<BitInit*>(MHS);
+ BitInit *RHSbit = dynamic_cast<BitInit*>(RHS);
+
+ if (MHSbit && RHSbit) {
+ Type = BitRecTy::get();
+ break;
+ }
+ }
+ } else if (MHSt != 0 && RHSt != 0) {
+ MHSTy = MHSt->getType();
+ RHSTy = RHSt->getType();
+ }
+
+ if (!MHSTy || !RHSTy) {
+ TokError("could not get type for !if");
+ return 0;
+ }
+
+ if (MHSTy->typeIsConvertibleTo(RHSTy)) {
+ Type = RHSTy;
+ } else if (RHSTy->typeIsConvertibleTo(MHSTy)) {
+ Type = MHSTy;
+ } else {
+ TokError("inconsistent types for !if");
+ return 0;
+ }
+ break;
+ }
+ case tgtok::XForEach: {
+ TypedInit *MHSt = dynamic_cast<TypedInit *>(MHS);
+ if (MHSt == 0) {
+ TokError("could not get type for !foreach");
+ return 0;
+ }
+ Type = MHSt->getType();
+ break;
+ }
+ case tgtok::XSubst: {
+ TypedInit *RHSt = dynamic_cast<TypedInit *>(RHS);
+ if (RHSt == 0) {
+ TokError("could not get type for !subst");
+ return 0;
+ }
+ Type = RHSt->getType();
+ break;
+ }
+ }
+ return (TernOpInit::get(Code, LHS, MHS, RHS, Type))->Fold(CurRec,
+ CurMultiClass);
+ }
+ }
+ TokError("could not parse operation");
+ return 0;
+}
+
+/// ParseOperatorType - Parse a type for an operator. This returns
+/// null on error.
+///
+/// OperatorType ::= '<' Type '>'
+///
+RecTy *TGParser::ParseOperatorType() {
+ RecTy *Type = 0;
+
+ if (Lex.getCode() != tgtok::less) {
+ TokError("expected type name for operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the <
+
+ Type = ParseType();
+
+ if (Type == 0) {
+ TokError("expected type name for operator");
+ return 0;
+ }
+
+ if (Lex.getCode() != tgtok::greater) {
+ TokError("expected type name for operator");
+ return 0;
+ }
+ Lex.Lex(); // eat the >
+
+ return Type;
+}
+
+
+/// ParseSimpleValue - Parse a tblgen value. This returns null on error.
+///
+/// SimpleValue ::= IDValue
+/// SimpleValue ::= INTVAL
+/// SimpleValue ::= STRVAL+
+/// SimpleValue ::= CODEFRAGMENT
+/// SimpleValue ::= '?'
+/// SimpleValue ::= '{' ValueList '}'
+/// SimpleValue ::= ID '<' ValueListNE '>'
+/// SimpleValue ::= '[' ValueList ']'
+/// SimpleValue ::= '(' IDValue DagArgList ')'
+/// SimpleValue ::= CONCATTOK '(' Value ',' Value ')'
+/// SimpleValue ::= SHLTOK '(' Value ',' Value ')'
+/// SimpleValue ::= SRATOK '(' Value ',' Value ')'
+/// SimpleValue ::= SRLTOK '(' Value ',' Value ')'
+/// SimpleValue ::= STRCONCATTOK '(' Value ',' Value ')'
+///
+Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType) {
+ Init *R = 0;
+ switch (Lex.getCode()) {
+ default: TokError("Unknown token when parsing a value"); break;
+ case tgtok::IntVal: R = IntInit::get(Lex.getCurIntVal()); Lex.Lex(); break;
+ case tgtok::StrVal: {
+ std::string Val = Lex.getCurStrVal();
+ Lex.Lex();
+
+ // Handle multiple consecutive concatenated strings.
+ while (Lex.getCode() == tgtok::StrVal) {
+ Val += Lex.getCurStrVal();
+ Lex.Lex();
+ }
+
+ R = StringInit::get(Val);
+ break;
+ }
+ case tgtok::CodeFragment:
+ R = CodeInit::get(Lex.getCurStrVal());
+ Lex.Lex();
+ break;
+ case tgtok::question:
+ R = UnsetInit::get();
+ Lex.Lex();
+ break;
+ case tgtok::Id: {
+ SMLoc NameLoc = Lex.getLoc();
+ std::string Name = Lex.getCurStrVal();
+ if (Lex.Lex() != tgtok::less) // consume the Id.
+ return ParseIDValue(CurRec, Name, NameLoc); // Value ::= IDValue
+
+ // Value ::= ID '<' ValueListNE '>'
+ if (Lex.Lex() == tgtok::greater) {
+ TokError("expected non-empty value list");
+ return 0;
+ }
+
+ // This is a CLASS<initvalslist> expression. This is supposed to synthesize
+ // a new anonymous definition, deriving from CLASS<initvalslist> with no
+ // body.
+ Record *Class = Records.getClass(Name);
+ if (!Class) {
+ Error(NameLoc, "Expected a class name, got '" + Name + "'");
+ return 0;
+ }
+
+ std::vector<Init*> ValueList = ParseValueList(CurRec, Class);
+ if (ValueList.empty()) return 0;
+
+ if (Lex.getCode() != tgtok::greater) {
+ TokError("expected '>' at end of value list");
+ return 0;
+ }
+ Lex.Lex(); // eat the '>'
+
+ // Create the new record, set it as CurRec temporarily.
+ static unsigned AnonCounter = 0;
+ Record *NewRec = new Record("anonymous.val."+utostr(AnonCounter++),
+ NameLoc,
+ Records);
+ SubClassReference SCRef;
+ SCRef.RefLoc = NameLoc;
+ SCRef.Rec = Class;
+ SCRef.TemplateArgs = ValueList;
+ // Add info about the subclass to NewRec.
+ if (AddSubClass(NewRec, SCRef))
+ return 0;
+ NewRec->resolveReferences();
+ Records.addDef(NewRec);
+
+ // The result of the expression is a reference to the new record.
+ return DefInit::get(NewRec);
+ }
+ case tgtok::l_brace: { // Value ::= '{' ValueList '}'
+ SMLoc BraceLoc = Lex.getLoc();
+ Lex.Lex(); // eat the '{'
+ std::vector<Init*> Vals;
+
+ if (Lex.getCode() != tgtok::r_brace) {
+ Vals = ParseValueList(CurRec);
+ if (Vals.empty()) return 0;
+ }
+ if (Lex.getCode() != tgtok::r_brace) {
+ TokError("expected '}' at end of bit list value");
+ return 0;
+ }
+ Lex.Lex(); // eat the '}'
+
+ SmallVector<Init *, 16> NewBits(Vals.size());
+
+ for (unsigned i = 0, e = Vals.size(); i != e; ++i) {
+ Init *Bit = Vals[i]->convertInitializerTo(BitRecTy::get());
+ if (Bit == 0) {
+ Error(BraceLoc, "Element #" + utostr(i) + " (" + Vals[i]->getAsString()+
+ ") is not convertable to a bit");
+ return 0;
+ }
+ NewBits[Vals.size()-i-1] = Bit;
+ }
+ return BitsInit::get(NewBits);
+ }
+ case tgtok::l_square: { // Value ::= '[' ValueList ']'
+ Lex.Lex(); // eat the '['
+ std::vector<Init*> Vals;
+
+ RecTy *DeducedEltTy = 0;
+ ListRecTy *GivenListTy = 0;
+
+ if (ItemType != 0) {
+ ListRecTy *ListType = dynamic_cast<ListRecTy*>(ItemType);
+ if (ListType == 0) {
+ std::stringstream s;
+ s << "Type mismatch for list, expected list type, got "
+ << ItemType->getAsString();
+ TokError(s.str());
+ return 0;
+ }
+ GivenListTy = ListType;
+ }
+
+ if (Lex.getCode() != tgtok::r_square) {
+ Vals = ParseValueList(CurRec, 0,
+ GivenListTy ? GivenListTy->getElementType() : 0);
+ if (Vals.empty()) return 0;
+ }
+ if (Lex.getCode() != tgtok::r_square) {
+ TokError("expected ']' at end of list value");
+ return 0;
+ }
+ Lex.Lex(); // eat the ']'
+
+ RecTy *GivenEltTy = 0;
+ if (Lex.getCode() == tgtok::less) {
+ // Optional list element type
+ Lex.Lex(); // eat the '<'
+
+ GivenEltTy = ParseType();
+ if (GivenEltTy == 0) {
+ // Couldn't parse element type
+ return 0;
+ }
+
+ if (Lex.getCode() != tgtok::greater) {
+ TokError("expected '>' at end of list element type");
+ return 0;
+ }
+ Lex.Lex(); // eat the '>'
+ }
+
+ // Check elements
+ RecTy *EltTy = 0;
+ for (std::vector<Init *>::iterator i = Vals.begin(), ie = Vals.end();
+ i != ie;
+ ++i) {
+ TypedInit *TArg = dynamic_cast<TypedInit*>(*i);
+ if (TArg == 0) {
+ TokError("Untyped list element");
+ return 0;
+ }
+ if (EltTy != 0) {
+ EltTy = resolveTypes(EltTy, TArg->getType());
+ if (EltTy == 0) {
+ TokError("Incompatible types in list elements");
+ return 0;
+ }
+ } else {
+ EltTy = TArg->getType();
+ }
+ }
+
+ if (GivenEltTy != 0) {
+ if (EltTy != 0) {
+ // Verify consistency
+ if (!EltTy->typeIsConvertibleTo(GivenEltTy)) {
+ TokError("Incompatible types in list elements");
+ return 0;
+ }
+ }
+ EltTy = GivenEltTy;
+ }
+
+ if (EltTy == 0) {
+ if (ItemType == 0) {
+ TokError("No type for list");
+ return 0;
+ }
+ DeducedEltTy = GivenListTy->getElementType();
+ } else {
+ // Make sure the deduced type is compatible with the given type
+ if (GivenListTy) {
+ if (!EltTy->typeIsConvertibleTo(GivenListTy->getElementType())) {
+ TokError("Element type mismatch for list");
+ return 0;
+ }
+ }
+ DeducedEltTy = EltTy;
+ }
+
+ return ListInit::get(Vals, DeducedEltTy);
+ }
+ case tgtok::l_paren: { // Value ::= '(' IDValue DagArgList ')'
+ Lex.Lex(); // eat the '('
+ if (Lex.getCode() != tgtok::Id && Lex.getCode() != tgtok::XCast) {
+ TokError("expected identifier in dag init");
+ return 0;
+ }
+
+ Init *Operator = ParseValue(CurRec);
+ if (Operator == 0) return 0;
+
+ // If the operator name is present, parse it.
+ std::string OperatorName;
+ if (Lex.getCode() == tgtok::colon) {
+ if (Lex.Lex() != tgtok::VarName) { // eat the ':'
+ TokError("expected variable name in dag operator");
+ return 0;
+ }
+ OperatorName = Lex.getCurStrVal();
+ Lex.Lex(); // eat the VarName.
+ }
+
+ std::vector<std::pair<llvm::Init*, std::string> > DagArgs;
+ if (Lex.getCode() != tgtok::r_paren) {
+ DagArgs = ParseDagArgList(CurRec);
+ if (DagArgs.empty()) return 0;
+ }
+
+ if (Lex.getCode() != tgtok::r_paren) {
+ TokError("expected ')' in dag init");
+ return 0;
+ }
+ Lex.Lex(); // eat the ')'
+
+ return DagInit::get(Operator, OperatorName, DagArgs);
+ }
+
+ case tgtok::XHead:
+ case tgtok::XTail:
+ case tgtok::XEmpty:
+ case tgtok::XCast: // Value ::= !unop '(' Value ')'
+ case tgtok::XConcat:
+ case tgtok::XSRA:
+ case tgtok::XSRL:
+ case tgtok::XSHL:
+ case tgtok::XEq:
+ case tgtok::XStrConcat: // Value ::= !binop '(' Value ',' Value ')'
+ case tgtok::XIf:
+ case tgtok::XForEach:
+ case tgtok::XSubst: { // Value ::= !ternop '(' Value ',' Value ',' Value ')'
+ return ParseOperation(CurRec);
+ }
+ }
+
+ return R;
+}
+
+/// ParseValue - Parse a tblgen value. This returns null on error.
+///
+/// Value ::= SimpleValue ValueSuffix*
+/// ValueSuffix ::= '{' BitList '}'
+/// ValueSuffix ::= '[' BitList ']'
+/// ValueSuffix ::= '.' ID
+///
+Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType) {
+ Init *Result = ParseSimpleValue(CurRec, ItemType);
+ if (Result == 0) return 0;
+
+ // Parse the suffixes now if present.
+ while (1) {
+ switch (Lex.getCode()) {
+ default: return Result;
+ case tgtok::l_brace: {
+ SMLoc CurlyLoc = Lex.getLoc();
+ Lex.Lex(); // eat the '{'
+ std::vector<unsigned> Ranges = ParseRangeList();
+ if (Ranges.empty()) return 0;
+
+ // Reverse the bitlist.
+ std::reverse(Ranges.begin(), Ranges.end());
+ Result = Result->convertInitializerBitRange(Ranges);
+ if (Result == 0) {
+ Error(CurlyLoc, "Invalid bit range for value");
+ return 0;
+ }
+
+ // Eat the '}'.
+ if (Lex.getCode() != tgtok::r_brace) {
+ TokError("expected '}' at end of bit range list");
+ return 0;
+ }
+ Lex.Lex();
+ break;
+ }
+ case tgtok::l_square: {
+ SMLoc SquareLoc = Lex.getLoc();
+ Lex.Lex(); // eat the '['
+ std::vector<unsigned> Ranges = ParseRangeList();
+ if (Ranges.empty()) return 0;
+
+ Result = Result->convertInitListSlice(Ranges);
+ if (Result == 0) {
+ Error(SquareLoc, "Invalid range for list slice");
+ return 0;
+ }
+
+ // Eat the ']'.
+ if (Lex.getCode() != tgtok::r_square) {
+ TokError("expected ']' at end of list slice");
+ return 0;
+ }
+ Lex.Lex();
+ break;
+ }
+ case tgtok::period:
+ if (Lex.Lex() != tgtok::Id) { // eat the .
+ TokError("expected field identifier after '.'");
+ return 0;
+ }
+ if (!Result->getFieldType(Lex.getCurStrVal())) {
+ TokError("Cannot access field '" + Lex.getCurStrVal() + "' of value '" +
+ Result->getAsString() + "'");
+ return 0;
+ }
+ Result = FieldInit::get(Result, Lex.getCurStrVal());
+ Lex.Lex(); // eat field name
+ break;
+ }
+ }
+}
+
+/// ParseDagArgList - Parse the argument list for a dag literal expression.
+///
+/// ParseDagArgList ::= Value (':' VARNAME)?
+/// ParseDagArgList ::= ParseDagArgList ',' Value (':' VARNAME)?
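+///
+/// For illustration only (operator and names invented), the arguments in a
+/// dag value such as "(someop GPR:$dst, GPR:$src)" parse as two values,
+/// each carrying an optional ':' VARNAME.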
+std::vector<std::pair<llvm::Init*, std::string> >
+TGParser::ParseDagArgList(Record *CurRec) {
+ std::vector<std::pair<llvm::Init*, std::string> > Result;
+
+ while (1) {
+ Init *Val = ParseValue(CurRec);
+ if (Val == 0) return std::vector<std::pair<llvm::Init*, std::string> >();
+
+ // If the variable name is present, add it.
+ std::string VarName;
+ if (Lex.getCode() == tgtok::colon) {
+ if (Lex.Lex() != tgtok::VarName) { // eat the ':'
+ TokError("expected variable name in dag literal");
+ return std::vector<std::pair<llvm::Init*, std::string> >();
+ }
+ VarName = Lex.getCurStrVal();
+ Lex.Lex(); // eat the VarName.
+ }
+
+ Result.push_back(std::make_pair(Val, VarName));
+
+ if (Lex.getCode() != tgtok::comma) break;
+ Lex.Lex(); // eat the ','
+ }
+
+ return Result;
+}
+
+
+/// ParseValueList - Parse a comma-separated list of values, returning them
+/// as a vector. Note that this always expects to be able to parse at least
+/// one value. It returns an empty list if this is not possible.
+///
+/// ValueList ::= Value (',' Value)*
+///
+std::vector<Init*> TGParser::ParseValueList(Record *CurRec, Record *ArgsRec,
+ RecTy *EltTy) {
+ std::vector<Init*> Result;
+ RecTy *ItemType = EltTy;
+ unsigned int ArgN = 0;
+ if (ArgsRec != 0 && EltTy == 0) {
+ const std::vector<std::string> &TArgs = ArgsRec->getTemplateArgs();
+ const RecordVal *RV = ArgsRec->getValue(TArgs[ArgN]);
+ if (!RV) {
+ errs() << "Cannot find template arg " << ArgN << " (" << TArgs[ArgN]
+ << ")\n";
+ }
+ assert(RV && "Template argument record not found??");
+ ItemType = RV->getType();
+ ++ArgN;
+ }
+ Result.push_back(ParseValue(CurRec, ItemType));
+ if (Result.back() == 0) return std::vector<Init*>();
+
+ while (Lex.getCode() == tgtok::comma) {
+ Lex.Lex(); // Eat the comma
+
+ if (ArgsRec != 0 && EltTy == 0) {
+ const std::vector<std::string> &TArgs = ArgsRec->getTemplateArgs();
+ if (ArgN >= TArgs.size()) {
+ TokError("too many template arguments");
+ return std::vector<Init*>();
+ }
+ const RecordVal *RV = ArgsRec->getValue(TArgs[ArgN]);
+ assert(RV && "Template argument record not found??");
+ ItemType = RV->getType();
+ ++ArgN;
+ }
+ Result.push_back(ParseValue(CurRec, ItemType));
+ if (Result.back() == 0) return std::vector<Init*>();
+ }
+
+ return Result;
+}
+
+
+/// ParseDeclaration - Read a declaration, returning the name of the declared
+/// field or ID, or an empty string on error. This can happen in a number of
+/// different contexts, including within a def or in the template args for a
+/// def (in which case CurRec will be non-null) and within the template args
+/// for a multiclass (in which case CurRec will be null, but CurMultiClass
+/// will be set). This can also happen within a def that is within a
+/// multiclass, which will set both CurRec and CurMultiClass.
+///
+/// Declaration ::= FIELD? Type ID ('=' Value)?
+///
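+/// A hypothetical example matching this production: "field bits<32> Inst = 0"
+/// uses the optional FIELD prefix, a type, an identifier and an initializer;
+/// the terminating ';' is consumed by the caller.
+///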
+std::string TGParser::ParseDeclaration(Record *CurRec,
+ bool ParsingTemplateArgs) {
+ // Read the field prefix if present.
+ bool HasField = Lex.getCode() == tgtok::Field;
+ if (HasField) Lex.Lex();
+
+ RecTy *Type = ParseType();
+ if (Type == 0) return "";
+
+ if (Lex.getCode() != tgtok::Id) {
+ TokError("Expected identifier in declaration");
+ return "";
+ }
+
+ SMLoc IdLoc = Lex.getLoc();
+ std::string DeclName = Lex.getCurStrVal();
+ Lex.Lex();
+
+ if (ParsingTemplateArgs) {
+ if (CurRec) {
+ DeclName = CurRec->getName() + ":" + DeclName;
+ } else {
+ assert(CurMultiClass);
+ }
+ if (CurMultiClass)
+ DeclName = CurMultiClass->Rec.getName() + "::" + DeclName;
+ }
+
+ // Add the value.
+ if (AddValue(CurRec, IdLoc, RecordVal(DeclName, Type, HasField)))
+ return "";
+
+ // If a value is present, parse it.
+ if (Lex.getCode() == tgtok::equal) {
+ Lex.Lex();
+ SMLoc ValLoc = Lex.getLoc();
+ Init *Val = ParseValue(CurRec, Type);
+ if (Val == 0 ||
+ SetValue(CurRec, ValLoc, DeclName, std::vector<unsigned>(), Val))
+ return "";
+ }
+
+ return DeclName;
+}
+
+/// ParseTemplateArgList - Read a template argument list, which is a non-empty
+/// sequence of template-declarations in <>'s. If CurRec is non-null, these are
+/// template args for a def, which may or may not be in a multiclass. If null,
+/// these are the template args for a multiclass.
+///
+/// TemplateArgList ::= '<' Declaration (',' Declaration)* '>'
+///
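+/// A made-up example: "<int n, string s = "x">" declares two template
+/// arguments, the second with a default value.
+///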
+bool TGParser::ParseTemplateArgList(Record *CurRec) {
+ assert(Lex.getCode() == tgtok::less && "Not a template arg list!");
+ Lex.Lex(); // eat the '<'
+
+ Record *TheRecToAddTo = CurRec ? CurRec : &CurMultiClass->Rec;
+
+ // Read the first declaration.
+ std::string TemplArg = ParseDeclaration(CurRec, true/*templateargs*/);
+ if (TemplArg.empty())
+ return true;
+
+ TheRecToAddTo->addTemplateArg(TemplArg);
+
+ while (Lex.getCode() == tgtok::comma) {
+ Lex.Lex(); // eat the ','
+
+ // Read the following declarations.
+ TemplArg = ParseDeclaration(CurRec, true/*templateargs*/);
+ if (TemplArg.empty())
+ return true;
+ TheRecToAddTo->addTemplateArg(TemplArg);
+ }
+
+ if (Lex.getCode() != tgtok::greater)
+ return TokError("expected '>' at end of template argument list");
+ Lex.Lex(); // eat the '>'.
+ return false;
+}
+
+
+/// ParseBodyItem - Parse a single item within the body of a def or class.
+///
+/// BodyItem ::= Declaration ';'
+/// BodyItem ::= LET ID OptionalBitList '=' Value ';'
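+///
+/// e.g. (invented field name): "let Inst{3-0} = 0b1111;" overrides just
+/// bits 3-0 of an inherited field via the OptionalBitList.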
+bool TGParser::ParseBodyItem(Record *CurRec) {
+ if (Lex.getCode() != tgtok::Let) {
+ if (ParseDeclaration(CurRec, false).empty())
+ return true;
+
+ if (Lex.getCode() != tgtok::semi)
+ return TokError("expected ';' after declaration");
+ Lex.Lex();
+ return false;
+ }
+
+ // LET ID OptionalRangeList '=' Value ';'
+ if (Lex.Lex() != tgtok::Id)
+ return TokError("expected field identifier after let");
+
+ SMLoc IdLoc = Lex.getLoc();
+ std::string FieldName = Lex.getCurStrVal();
+ Lex.Lex(); // eat the field name.
+
+ std::vector<unsigned> BitList;
+ if (ParseOptionalBitList(BitList))
+ return true;
+ std::reverse(BitList.begin(), BitList.end());
+
+ if (Lex.getCode() != tgtok::equal)
+ return TokError("expected '=' in let expression");
+ Lex.Lex(); // eat the '='.
+
+ RecordVal *Field = CurRec->getValue(FieldName);
+ if (Field == 0)
+ return TokError("Value '" + FieldName + "' unknown!");
+
+ RecTy *Type = Field->getType();
+
+ Init *Val = ParseValue(CurRec, Type);
+ if (Val == 0) return true;
+
+ if (Lex.getCode() != tgtok::semi)
+ return TokError("expected ';' after let expression");
+ Lex.Lex();
+
+ return SetValue(CurRec, IdLoc, FieldName, BitList, Val);
+}
+
+/// ParseBody - Read the body of a class or def. Return true on error, false on
+/// success.
+///
+/// Body ::= ';'
+/// Body ::= '{' BodyList '}'
+/// BodyList ::= BodyItem*
+///
+bool TGParser::ParseBody(Record *CurRec) {
+ // If this is a null definition, just eat the semi and return.
+ if (Lex.getCode() == tgtok::semi) {
+ Lex.Lex();
+ return false;
+ }
+
+ if (Lex.getCode() != tgtok::l_brace)
+ return TokError("Expected ';' or '{' to start body");
+ // Eat the '{'.
+ Lex.Lex();
+
+ while (Lex.getCode() != tgtok::r_brace)
+ if (ParseBodyItem(CurRec))
+ return true;
+
+ // Eat the '}'.
+ Lex.Lex();
+ return false;
+}
+
+/// ParseObjectBody - Parse the body of a def or class. This consists of an
+/// optional ClassList followed by a Body. CurRec is the current def or class
+/// that is being parsed.
+///
+/// ObjectBody ::= BaseClassList Body
+/// BaseClassList ::= /*empty*/
+/// BaseClassList ::= ':' BaseClassListNE
+/// BaseClassListNE ::= SubClassRef (',' SubClassRef)*
+///
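+/// A sketch of what this accepts, with invented names:
+///   def Foo : Base1, Base2<5> { ... }
+/// where ':' introduces the base class list and the braces hold the Body.
+///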
+bool TGParser::ParseObjectBody(Record *CurRec) {
+ // If there is a baseclass list, read it.
+ if (Lex.getCode() == tgtok::colon) {
+ Lex.Lex();
+
+ // Read all of the subclasses.
+ SubClassReference SubClass = ParseSubClassReference(CurRec, false);
+ while (1) {
+ // Check for error.
+ if (SubClass.Rec == 0) return true;
+
+ // Add it.
+ if (AddSubClass(CurRec, SubClass))
+ return true;
+
+ if (Lex.getCode() != tgtok::comma) break;
+ Lex.Lex(); // eat ','.
+ SubClass = ParseSubClassReference(CurRec, false);
+ }
+ }
+
+ // Process any variables on the let stack.
+ for (unsigned i = 0, e = LetStack.size(); i != e; ++i)
+ for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j)
+ if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name,
+ LetStack[i][j].Bits, LetStack[i][j].Value))
+ return true;
+
+ return ParseBody(CurRec);
+}
+
+/// ParseDef - Parse a top level or multiclass def, adding the record it
+/// defines to the appropriate table. This returns true on error.
+///
+/// DefInst ::= DEF ObjectName ObjectBody
+///
+bool TGParser::ParseDef(MultiClass *CurMultiClass) {
+ SMLoc DefLoc = Lex.getLoc();
+ assert(Lex.getCode() == tgtok::Def && "Unknown tok");
+ Lex.Lex(); // Eat the 'def' token.
+
+ // Parse ObjectName and make a record for it.
+ Record *CurRec = new Record(ParseObjectName(), DefLoc, Records);
+
+ if (!CurMultiClass) {
+ // Top-level def definition.
+
+ // Ensure redefinition doesn't happen.
+ if (Records.getDef(CurRec->getName())) {
+ Error(DefLoc, "def '" + CurRec->getName() + "' already defined");
+ return true;
+ }
+ Records.addDef(CurRec);
+ } else {
+ // Otherwise, a def inside a multiclass, add it to the multiclass.
+ for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size(); i != e; ++i)
+ if (CurMultiClass->DefPrototypes[i]->getName() == CurRec->getName()) {
+ Error(DefLoc, "def '" + CurRec->getName() +
+ "' already defined in this multiclass!");
+ return true;
+ }
+ CurMultiClass->DefPrototypes.push_back(CurRec);
+ }
+
+ if (ParseObjectBody(CurRec))
+ return true;
+
+ if (CurMultiClass == 0) // Defs in multiclasses aren't really defs.
+ // See Record::setName(). This resolve step will see any new name
+ // for the def that might have been created when resolving
+ // inheritance, values and arguments above.
+ CurRec->resolveReferences();
+
+ // If ObjectBody has template arguments, it's an error.
+ assert(CurRec->getTemplateArgs().empty() && "How'd this get template args?");
+
+ if (CurMultiClass) {
+ // Copy the template arguments for the multiclass into the def.
+ const std::vector<std::string> &TArgs =
+ CurMultiClass->Rec.getTemplateArgs();
+
+ for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+ const RecordVal *RV = CurMultiClass->Rec.getValue(TArgs[i]);
+ assert(RV && "Template arg doesn't exist?");
+ CurRec->addValue(*RV);
+ }
+ }
+
+ return false;
+}
+
+/// ParseClass - Parse a tblgen class definition.
+///
+/// ClassInst ::= CLASS ID TemplateArgList? ObjectBody
+///
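+/// An invented example: "class Enc<int opc> { bits<4> Opcode = opc; }"
+/// declares a class with one template argument and one body item.
+///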
+bool TGParser::ParseClass() {
+ assert(Lex.getCode() == tgtok::Class && "Unexpected token!");
+ Lex.Lex();
+
+ if (Lex.getCode() != tgtok::Id)
+ return TokError("expected class name after 'class' keyword");
+
+ Record *CurRec = Records.getClass(Lex.getCurStrVal());
+ if (CurRec) {
+ // If the body was previously defined, this is an error.
+ if (!CurRec->getValues().empty() ||
+ !CurRec->getSuperClasses().empty() ||
+ !CurRec->getTemplateArgs().empty())
+ return TokError("Class '" + CurRec->getName() + "' already defined");
+ } else {
+ // If this is the first reference to this class, create and add it.
+ CurRec = new Record(Lex.getCurStrVal(), Lex.getLoc(), Records);
+ Records.addClass(CurRec);
+ }
+ Lex.Lex(); // eat the name.
+
+ // If there are template args, parse them.
+ if (Lex.getCode() == tgtok::less)
+ if (ParseTemplateArgList(CurRec))
+ return true;
+
+ // Finally, parse the object body.
+ return ParseObjectBody(CurRec);
+}
+
+/// ParseLetList - Parse a non-empty list of assignment expressions into a list
+/// of LetRecords.
+///
+/// LetList ::= LetItem (',' LetItem)*
+/// LetItem ::= ID OptionalRangeList '=' Value
+///
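+/// e.g. (invented names): "isCall = 1, AsmString = "nop"" would parse as
+/// two LetRecords, each binding a name to a value.
+///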
+std::vector<LetRecord> TGParser::ParseLetList() {
+ std::vector<LetRecord> Result;
+
+ while (1) {
+ if (Lex.getCode() != tgtok::Id) {
+ TokError("expected identifier in let definition");
+ return std::vector<LetRecord>();
+ }
+ std::string Name = Lex.getCurStrVal();
+ SMLoc NameLoc = Lex.getLoc();
+ Lex.Lex(); // Eat the identifier.
+
+ // Check for an optional RangeList.
+ std::vector<unsigned> Bits;
+ if (ParseOptionalRangeList(Bits))
+ return std::vector<LetRecord>();
+ std::reverse(Bits.begin(), Bits.end());
+
+ if (Lex.getCode() != tgtok::equal) {
+ TokError("expected '=' in let expression");
+ return std::vector<LetRecord>();
+ }
+ Lex.Lex(); // eat the '='.
+
+ Init *Val = ParseValue(0);
+ if (Val == 0) return std::vector<LetRecord>();
+
+ // Now that we have everything, add the record.
+ Result.push_back(LetRecord(Name, Bits, Val, NameLoc));
+
+ if (Lex.getCode() != tgtok::comma)
+ return Result;
+ Lex.Lex(); // eat the comma.
+ }
+}
+
+/// ParseTopLevelLet - Parse a 'let' at top level. This can be one of a
+/// couple of related productions, and it works inside multiclasses too.
+///
+/// Object ::= LET LetList IN '{' ObjectList '}'
+/// Object ::= LET LetList IN Object
+///
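+/// e.g. (invented): "let isBranch = 1 in def B : Instruction;" applies the
+/// binding to one object; a '{' ... '}' group applies it to a whole list.
+///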
+bool TGParser::ParseTopLevelLet(MultiClass *CurMultiClass) {
+ assert(Lex.getCode() == tgtok::Let && "Unexpected token");
+ Lex.Lex();
+
+ // Add this entry to the let stack.
+ std::vector<LetRecord> LetInfo = ParseLetList();
+ if (LetInfo.empty()) return true;
+ LetStack.push_back(LetInfo);
+
+ if (Lex.getCode() != tgtok::In)
+ return TokError("expected 'in' at end of top-level 'let'");
+ Lex.Lex();
+
+ // If this is a scalar let, just handle it now
+ if (Lex.getCode() != tgtok::l_brace) {
+ // LET LetList IN Object
+ if (ParseObject(CurMultiClass))
+ return true;
+ } else { // Object ::= LETCommand '{' ObjectList '}'
+ SMLoc BraceLoc = Lex.getLoc();
+ // Otherwise, this is a group let.
+ Lex.Lex(); // eat the '{'.
+
+ // Parse the object list.
+ if (ParseObjectList(CurMultiClass))
+ return true;
+
+ if (Lex.getCode() != tgtok::r_brace) {
+ TokError("expected '}' at end of top level let command");
+ return Error(BraceLoc, "to match this '{'");
+ }
+ Lex.Lex();
+ }
+
+ // Outside this let scope, this let block is not active.
+ LetStack.pop_back();
+ return false;
+}
+
+/// ParseMultiClass - Parse a multiclass definition.
+///
+/// MultiClassInst ::= MULTICLASS ID TemplateArgList?
+/// ':' BaseMultiClassList '{' MultiClassDef+ '}'
+///
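+/// A minimal invented sketch:
+///   multiclass Pair<int n> { def A; def B; }
+/// Each def inside becomes a prototype that a later 'defm' instantiates.
+///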
+bool TGParser::ParseMultiClass() {
+ assert(Lex.getCode() == tgtok::MultiClass && "Unexpected token");
+ Lex.Lex(); // Eat the multiclass token.
+
+ if (Lex.getCode() != tgtok::Id)
+ return TokError("expected identifier after multiclass for name");
+ std::string Name = Lex.getCurStrVal();
+
+ if (MultiClasses.count(Name))
+ return TokError("multiclass '" + Name + "' already defined");
+
+ CurMultiClass = MultiClasses[Name] = new MultiClass(Name,
+ Lex.getLoc(), Records);
+ Lex.Lex(); // Eat the identifier.
+
+ // If there are template args, parse them.
+ if (Lex.getCode() == tgtok::less)
+ if (ParseTemplateArgList(0))
+ return true;
+
+ bool inherits = false;
+
+ // If there are submulticlasses, parse them.
+ if (Lex.getCode() == tgtok::colon) {
+ inherits = true;
+
+ Lex.Lex();
+
+ // Read all of the submulticlasses.
+ SubMultiClassReference SubMultiClass =
+ ParseSubMultiClassReference(CurMultiClass);
+ while (1) {
+ // Check for error.
+ if (SubMultiClass.MC == 0) return true;
+
+ // Add it.
+ if (AddSubMultiClass(CurMultiClass, SubMultiClass))
+ return true;
+
+ if (Lex.getCode() != tgtok::comma) break;
+ Lex.Lex(); // eat ','.
+ SubMultiClass = ParseSubMultiClassReference(CurMultiClass);
+ }
+ }
+
+ if (Lex.getCode() != tgtok::l_brace) {
+ if (!inherits)
+ return TokError("expected '{' in multiclass definition");
+ else if (Lex.getCode() != tgtok::semi)
+ return TokError("expected ';' in multiclass definition");
+ else
+ Lex.Lex(); // eat the ';'.
+ } else {
+ if (Lex.Lex() == tgtok::r_brace) // eat the '{'.
+ return TokError("multiclass must contain at least one def");
+
+ while (Lex.getCode() != tgtok::r_brace) {
+ switch (Lex.getCode()) {
+ default:
+ return TokError("expected 'let', 'def' or 'defm' in multiclass body");
+ case tgtok::Let:
+ case tgtok::Def:
+ case tgtok::Defm:
+ if (ParseObject(CurMultiClass))
+ return true;
+ break;
+ }
+ }
+ Lex.Lex(); // eat the '}'.
+ }
+
+ CurMultiClass = 0;
+ return false;
+}
+
+Record *TGParser::
+InstantiateMulticlassDef(MultiClass &MC,
+ Record *DefProto,
+ const std::string &DefmPrefix,
+ SMLoc DefmPrefixLoc) {
+ // Add in the defm name. If the defm prefix is empty, give each
+ // instantiated def a unique name. Otherwise, if "#NAME#" exists in the
+ // name, substitute the prefix for #NAME#. Otherwise, use the defm name
+ // as a prefix.
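+ // For instance (hypothetical names): with prefix "ADD", a prototype named
+ // "rr" becomes "ADDrr", while "X#NAME#Y" becomes "XADDY".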
+ std::string DefName = DefProto->getName();
+ if (DefmPrefix.empty()) {
+ DefName = GetNewAnonymousName();
+ } else {
+ std::string::size_type idx = DefName.find("#NAME#");
+ if (idx != std::string::npos) {
+ DefName.replace(idx, 6, DefmPrefix);
+ } else {
+ // Add the suffix to the defm name to get the new name.
+ DefName = DefmPrefix + DefName;
+ }
+ }
+
+ Record *CurRec = new Record(DefName, DefmPrefixLoc, Records);
+
+ SubClassReference Ref;
+ Ref.RefLoc = DefmPrefixLoc;
+ Ref.Rec = DefProto;
+ AddSubClass(CurRec, Ref);
+
+ return CurRec;
+}
+
+bool TGParser::ResolveMulticlassDefArgs(MultiClass &MC,
+ Record *CurRec,
+ SMLoc DefmPrefixLoc,
+ SMLoc SubClassLoc,
+ const std::vector<std::string> &TArgs,
+ std::vector<Init *> &TemplateVals,
+ bool DeleteArgs) {
+ // Loop over all of the template arguments, setting them to the specified
+ // value or leaving them as the default if necessary.
+ for (unsigned i = 0, e = TArgs.size(); i != e; ++i) {
+ // Check if a value is specified for this temp-arg.
+ if (i < TemplateVals.size()) {
+ // Set it now.
+ if (SetValue(CurRec, DefmPrefixLoc, TArgs[i], std::vector<unsigned>(),
+ TemplateVals[i]))
+ return true;
+
+ // Resolve it next.
+ CurRec->resolveReferencesTo(CurRec->getValue(TArgs[i]));
+
+ if (DeleteArgs)
+ // Now remove it.
+ CurRec->removeValue(TArgs[i]);
+
+ } else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) {
+ return Error(SubClassLoc, "value not specified for template argument #"+
+ utostr(i) + " (" + TArgs[i] + ") of multiclass '" +
+ MC.Rec.getName() + "'");
+ }
+ }
+ return false;
+}
+
+bool TGParser::ResolveMulticlassDef(MultiClass &MC,
+ Record *CurRec,
+ Record *DefProto,
+ SMLoc DefmPrefixLoc) {
+ // If the multiclass def is inside a 'let' expression, add to each def.
+ for (unsigned i = 0, e = LetStack.size(); i != e; ++i)
+ for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j)
+ if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name,
+ LetStack[i][j].Bits, LetStack[i][j].Value))
+ return Error(DefmPrefixLoc, "when instantiating this defm");
+
+ // Ensure redefinition doesn't happen.
+ if (Records.getDef(CurRec->getName()))
+ return Error(DefmPrefixLoc, "def '" + CurRec->getName() +
+ "' already defined, instantiating defm with subdef '" +
+ DefProto->getName() + "'");
+
+ // Don't create a top level definition for a defm inside multiclasses;
+ // instead, only update the prototypes and bind the template args
+ // to the newly created definition.
+ if (CurMultiClass) {
+ for (unsigned i = 0, e = CurMultiClass->DefPrototypes.size();
+ i != e; ++i)
+ if (CurMultiClass->DefPrototypes[i]->getName() == CurRec->getName())
+ return Error(DefmPrefixLoc, "defm '" + CurRec->getName() +
+ "' already defined in this multiclass!");
+ CurMultiClass->DefPrototypes.push_back(CurRec);
+
+ // Copy the template arguments for the multiclass into the new def.
+ const std::vector<std::string> &TA =
+ CurMultiClass->Rec.getTemplateArgs();
+
+ for (unsigned i = 0, e = TA.size(); i != e; ++i) {
+ const RecordVal *RV = CurMultiClass->Rec.getValue(TA[i]);
+ assert(RV && "Template arg doesn't exist?");
+ CurRec->addValue(*RV);
+ }
+ } else {
+ Records.addDef(CurRec);
+ }
+
+ return false;
+}
+
+/// ParseDefm - Parse the instantiation of a multiclass.
+///
+/// DefMInst ::= DEFM ID ':' DefmSubClassRef ';'
+///
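+/// A made-up example: "defm ADD : Pair<1>;" instantiates every def
+/// prototype in multiclass Pair with template argument 1, prefixing each
+/// resulting name with "ADD".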
+bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
+ assert(Lex.getCode() == tgtok::Defm && "Unexpected token!");
+
+ std::string DefmPrefix;
+ if (Lex.Lex() == tgtok::Id) { // eat the defm.
+ DefmPrefix = Lex.getCurStrVal();
+ Lex.Lex(); // Eat the defm prefix.
+ }
+
+ SMLoc DefmPrefixLoc = Lex.getLoc();
+ if (Lex.getCode() != tgtok::colon)
+ return TokError("expected ':' after defm identifier");
+
+ // Keep track of the new generated record definitions.
+ std::vector<Record*> NewRecDefs;
+
+ // This record also inherits from a regular class (non-multiclass)?
+ bool InheritFromClass = false;
+
+ // eat the colon.
+ Lex.Lex();
+
+ SMLoc SubClassLoc = Lex.getLoc();
+ SubClassReference Ref = ParseSubClassReference(0, true);
+
+ while (1) {
+ if (Ref.Rec == 0) return true;
+
+ // To instantiate a multiclass, we need to first get the multiclass, then
+ // instantiate each def contained in the multiclass with the SubClassRef
+ // template parameters.
+ MultiClass *MC = MultiClasses[Ref.Rec->getName()];
+ assert(MC && "Didn't lookup multiclass correctly?");
+ std::vector<Init*> &TemplateVals = Ref.TemplateArgs;
+
+ // Verify that the correct number of template arguments were specified.
+ const std::vector<std::string> &TArgs = MC->Rec.getTemplateArgs();
+ if (TArgs.size() < TemplateVals.size())
+ return Error(SubClassLoc,
+ "more template args specified than multiclass expects");
+
+ // Loop over all the defs in the multiclass, instantiating each one.
+ for (unsigned i = 0, e = MC->DefPrototypes.size(); i != e; ++i) {
+ Record *DefProto = MC->DefPrototypes[i];
+
+ Record *CurRec = InstantiateMulticlassDef(*MC, DefProto, DefmPrefix,
+ DefmPrefixLoc);
+
+ if (ResolveMulticlassDefArgs(*MC, CurRec, DefmPrefixLoc, SubClassLoc,
+ TArgs, TemplateVals, true/*Delete args*/))
+ return Error(SubClassLoc, "could not instantiate def");
+
+ if (ResolveMulticlassDef(*MC, CurRec, DefProto, DefmPrefixLoc))
+ return Error(SubClassLoc, "could not instantiate def");
+
+ NewRecDefs.push_back(CurRec);
+ }
+
+
+ if (Lex.getCode() != tgtok::comma) break;
+ Lex.Lex(); // eat ','.
+
+ SubClassLoc = Lex.getLoc();
+
+ // A defm can inherit from regular classes (non-multiclass) as
+ // long as they come at the end of the inheritance list.
+ InheritFromClass = (Records.getClass(Lex.getCurStrVal()) != 0);
+
+ if (InheritFromClass)
+ break;
+
+ Ref = ParseSubClassReference(0, true);
+ }
+
+ if (InheritFromClass) {
+ // Process all the classes to inherit as if they were part of a
+ // regular 'def' and inherit all record values.
+ SubClassReference SubClass = ParseSubClassReference(0, false);
+ while (1) {
+ // Check for error.
+ if (SubClass.Rec == 0) return true;
+
+ // Get the expanded definition prototypes and teach them about
+ // the record values provided by the class being inherited from.
+ for (unsigned i = 0, e = NewRecDefs.size(); i != e; ++i) {
+ Record *CurRec = NewRecDefs[i];
+
+ // Add it.
+ if (AddSubClass(CurRec, SubClass))
+ return true;
+
+ // Process any variables on the let stack.
+ for (unsigned i = 0, e = LetStack.size(); i != e; ++i)
+ for (unsigned j = 0, e = LetStack[i].size(); j != e; ++j)
+ if (SetValue(CurRec, LetStack[i][j].Loc, LetStack[i][j].Name,
+ LetStack[i][j].Bits, LetStack[i][j].Value))
+ return true;
+ }
+
+ if (Lex.getCode() != tgtok::comma) break;
+ Lex.Lex(); // eat ','.
+ SubClass = ParseSubClassReference(0, false);
+ }
+ }
+
+ if (!CurMultiClass)
+ for (unsigned i = 0, e = NewRecDefs.size(); i != e; ++i)
+ // See Record::setName(). This resolve step will see any new
+ // name for the def that might have been created when resolving
+ // inheritance, values and arguments above.
+ NewRecDefs[i]->resolveReferences();
+
+ if (Lex.getCode() != tgtok::semi)
+ return TokError("expected ';' at end of defm");
+ Lex.Lex();
+
+ return false;
+}
+
+/// ParseObject
+/// Object ::= ClassInst
+/// Object ::= DefInst
+/// Object ::= MultiClassInst
+/// Object ::= DefMInst
+/// Object ::= LETCommand '{' ObjectList '}'
+/// Object ::= LETCommand Object
+bool TGParser::ParseObject(MultiClass *MC) {
+ switch (Lex.getCode()) {
+ default:
+ return TokError("Expected class, def, defm, multiclass or let definition");
+ case tgtok::Let: return ParseTopLevelLet(MC);
+ case tgtok::Def: return ParseDef(MC);
+ case tgtok::Defm: return ParseDefm(MC);
+ case tgtok::Class: return ParseClass();
+ case tgtok::MultiClass: return ParseMultiClass();
+ }
+}
+
+/// ParseObjectList
+/// ObjectList ::= Object*
+bool TGParser::ParseObjectList(MultiClass *MC) {
+ while (isObjectStart(Lex.getCode())) {
+ if (ParseObject(MC))
+ return true;
+ }
+ return false;
+}
+
+bool TGParser::ParseFile() {
+ Lex.Lex(); // Prime the lexer.
+ if (ParseObjectList()) return true;
+
+ // If we have unread input at the end of the file, report it.
+ if (Lex.getCode() == tgtok::Eof)
+ return false;
+
+ return TokError("Unexpected input at top level");
+}
+
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
new file mode 100644
index 0000000..db8a620
--- /dev/null
+++ b/lib/TableGen/TGParser.h
@@ -0,0 +1,137 @@
+//===- TGParser.h - Parser for TableGen Files -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class represents the Parser for tablegen files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TGPARSER_H
+#define TGPARSER_H
+
+#include "TGLexer.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/SourceMgr.h"
+#include <map>
+
+namespace llvm {
+ class Record;
+ class RecordVal;
+ class RecordKeeper;
+ class RecTy;
+ class Init;
+ struct MultiClass;
+ struct SubClassReference;
+ struct SubMultiClassReference;
+
+ struct LetRecord {
+ std::string Name;
+ std::vector<unsigned> Bits;
+ Init *Value;
+ SMLoc Loc;
+ LetRecord(const std::string &N, const std::vector<unsigned> &B, Init *V,
+ SMLoc L)
+ : Name(N), Bits(B), Value(V), Loc(L) {
+ }
+ };
+
+class TGParser {
+ TGLexer Lex;
+ std::vector<std::vector<LetRecord> > LetStack;
+ std::map<std::string, MultiClass*> MultiClasses;
+
+ /// CurMultiClass - If we are parsing a 'multiclass' definition, this is
+ /// the multiclass currently being parsed; it is null otherwise.
+ MultiClass *CurMultiClass;
+
+ // Record tracker
+ RecordKeeper &Records;
+public:
+ TGParser(SourceMgr &SrcMgr, RecordKeeper &records) :
+ Lex(SrcMgr), CurMultiClass(0), Records(records) {}
+
+ /// ParseFile - Main entrypoint for parsing a tblgen file. These parser
+ /// routines return true on error, or false on success.
+ bool ParseFile();
+
+ bool Error(SMLoc L, const Twine &Msg) const {
+ PrintError(L, Msg);
+ return true;
+ }
+ bool TokError(const Twine &Msg) const {
+ return Error(Lex.getLoc(), Msg);
+ }
+ const std::vector<std::string> &getDependencies() const {
+ return Lex.getDependencies();
+ }
+private: // Semantic analysis methods.
+ bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV);
+ bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName,
+ const std::vector<unsigned> &BitList, Init *V);
+ bool AddSubClass(Record *Rec, SubClassReference &SubClass);
+ bool AddSubMultiClass(MultiClass *CurMC,
+ SubMultiClassReference &SubMultiClass);
+
+private: // Parser methods.
+ bool ParseObjectList(MultiClass *MC = 0);
+ bool ParseObject(MultiClass *MC);
+ bool ParseClass();
+ bool ParseMultiClass();
+ Record *InstantiateMulticlassDef(MultiClass &MC,
+ Record *DefProto,
+ const std::string &DefmPrefix,
+ SMLoc DefmPrefixLoc);
+ bool ResolveMulticlassDefArgs(MultiClass &MC,
+ Record *CurRec,
+ SMLoc DefmPrefixLoc,
+ SMLoc SubClassLoc,
+ const std::vector<std::string> &TArgs,
+ std::vector<Init *> &TemplateVals,
+ bool DeleteArgs);
+ bool ResolveMulticlassDef(MultiClass &MC,
+ Record *CurRec,
+ Record *DefProto,
+ SMLoc DefmPrefixLoc);
+ bool ParseDefm(MultiClass *CurMultiClass);
+ bool ParseDef(MultiClass *CurMultiClass);
+ bool ParseTopLevelLet(MultiClass *CurMultiClass);
+ std::vector<LetRecord> ParseLetList();
+
+ bool ParseObjectBody(Record *CurRec);
+ bool ParseBody(Record *CurRec);
+ bool ParseBodyItem(Record *CurRec);
+
+ bool ParseTemplateArgList(Record *CurRec);
+ std::string ParseDeclaration(Record *CurRec, bool ParsingTemplateArgs);
+
+ SubClassReference ParseSubClassReference(Record *CurRec, bool isDefm);
+ SubMultiClassReference ParseSubMultiClassReference(MultiClass *CurMC);
+
+ Init *ParseIDValue(Record *CurRec);
+ Init *ParseIDValue(Record *CurRec, const std::string &Name, SMLoc NameLoc);
+ Init *ParseSimpleValue(Record *CurRec, RecTy *ItemType = 0);
+ Init *ParseValue(Record *CurRec, RecTy *ItemType = 0);
+ std::vector<Init*> ParseValueList(Record *CurRec, Record *ArgsRec = 0,
+ RecTy *EltTy = 0);
+ std::vector<std::pair<llvm::Init*, std::string> > ParseDagArgList(Record *);
+ bool ParseOptionalRangeList(std::vector<unsigned> &Ranges);
+ bool ParseOptionalBitList(std::vector<unsigned> &Ranges);
+ std::vector<unsigned> ParseRangeList();
+ bool ParseRangePiece(std::vector<unsigned> &Ranges);
+ RecTy *ParseType();
+ Init *ParseOperation(Record *CurRec);
+ RecTy *ParseOperatorType();
+ std::string ParseObjectName();
+ Record *ParseClassID();
+ MultiClass *ParseMultiClassID();
+ Record *ParseDefmID();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/TableGen/TableGenBackend.cpp b/lib/TableGen/TableGenBackend.cpp
new file mode 100644
index 0000000..29588db
--- /dev/null
+++ b/lib/TableGen/TableGenBackend.cpp
@@ -0,0 +1,25 @@
+//===- TableGenBackend.cpp - Base class for TableGen Backends ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides useful services for TableGen backends...
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TableGen/TableGenBackend.h"
+#include "llvm/TableGen/Record.h"
+using namespace llvm;
+
+void TableGenBackend::EmitSourceFileHeader(const std::string &Desc,
+ raw_ostream &OS) const {
+ OS << "//===- TableGen'erated file -------------------------------------*-"
+ " C++ -*-===//\n//\n// " << Desc << "\n//\n// Automatically generate"
+ "d file, do not edit!\n//\n//===------------------------------------"
+ "----------------------------------===//\n\n";
+}
+
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 08dc340..16d0da3 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -15,7 +15,7 @@
#ifndef TARGET_ARM_H
#define TARGET_ARM_H
-#include "ARMBaseInfo.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
@@ -29,19 +29,7 @@ class ARMBaseTargetMachine;
class FunctionPass;
class JITCodeEmitter;
class MachineInstr;
-class MCCodeEmitter;
class MCInst;
-class MCInstrInfo;
-class MCObjectWriter;
-class MCSubtargetInfo;
-class TargetAsmBackend;
-class formatted_raw_ostream;
-
-MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
-
-TargetAsmBackend *createARMAsmBackend(const Target &, const std::string &);
FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
CodeGenOpt::Level OptLevel);
@@ -53,7 +41,6 @@ FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMExpandPseudoPass();
FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
FunctionPass *createARMConstantIslandPass();
-FunctionPass *createNEONMoveFixPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
FunctionPass *createThumb2SizeReductionPass();
@@ -61,12 +48,6 @@ FunctionPass *createThumb2SizeReductionPass();
void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
ARMAsmPrinter &AP);
-/// createARMMachObjectWriter - Construct an ARM Mach-O object writer.
-MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
-
} // end namespace llvm;
#endif
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index cf333cc..5c727ad 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -23,6 +23,9 @@ include "llvm/Target/Target.td"
def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true",
"Thumb mode">;
+def ModeNaCl : SubtargetFeature<"nacl-mode", "InNaClMode", "true",
+ "Native client mode">;
+
//===----------------------------------------------------------------------===//
// ARM Subtarget features.
//
@@ -85,12 +88,16 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr",
/// Some M architectures don't have the DSP extension (v7E-M vs. v7M)
def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
- "Supports v7 DSP instructions in Thumb2.">;
+ "Supports v7 DSP instructions in Thumb2">;
// Multiprocessing extension.
def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
"Supports Multiprocessing extension">;
+// M-series ISA?
+def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true",
+ "Is microcontroller profile ('M' series)">;
+
// ARM ISAs.
def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true",
"Support ARM v4T instructions">;
@@ -105,7 +112,7 @@ def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true",
[HasV5TEOps]>;
def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
"Support ARM v6t2 instructions",
- [HasV6Ops, FeatureThumb2, FeatureDSPThumb2]>;
+ [HasV6Ops, FeatureThumb2]>;
def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
"Support ARM v7 instructions",
[HasV6T2Ops]>;
@@ -182,12 +189,14 @@ def : Processor<"mpcore", ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
// V6M Processors.
def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6Ops, FeatureNoARM,
- FeatureDB]>;
+ FeatureDB, FeatureMClass]>;
// V6T2 Processors.
-def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops]>;
+def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops,
+ FeatureDSPThumb2]>;
def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
- FeatureHasSlowFPVMLx]>;
+ FeatureHasSlowFPVMLx,
+ FeatureDSPThumb2]>;
// V7a Processors.
def : Processor<"cortex-a8", CortexA8Itineraries,
@@ -203,14 +212,14 @@ def : Processor<"cortex-a9-mp", CortexA9Itineraries,
// V7M Processors.
def : ProcNoItin<"cortex-m3", [HasV7Ops,
FeatureThumb2, FeatureNoARM, FeatureDB,
- FeatureHWDiv]>;
+ FeatureHWDiv, FeatureMClass]>;
// V7EM Processors.
def : ProcNoItin<"cortex-m4", [HasV7Ops,
FeatureThumb2, FeatureNoARM, FeatureDB,
FeatureHWDiv, FeatureDSPThumb2,
FeatureT2XtPk, FeatureVFP2,
- FeatureVFPOnlySP]>;
+ FeatureVFPOnlySP, FeatureMClass]>;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index dbc3ee4..ea3319f 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -15,15 +15,15 @@
#define DEBUG_TYPE "asm-printer"
#include "ARM.h"
#include "ARMAsmPrinter.h"
-#include "ARMAddressingModes.h"
#include "ARMBuildAttrs.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
-#include "ARMMCExpr.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
#include "InstPrinter/ARMInstPrinter.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Constants.h"
#include "llvm/Module.h"
@@ -45,13 +45,13 @@
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
using namespace llvm;
@@ -92,7 +92,7 @@ namespace {
case ARMBuildAttrs::Advanced_SIMD_arch:
case ARMBuildAttrs::VFP_arch:
Streamer.EmitRawText(StringRef("\t.fpu ") + LowercaseString(String));
- break;
+ break;
default: assert(0 && "Unsupported Text attribute in ASM Mode"); break;
}
}
@@ -100,13 +100,41 @@ namespace {
};
class ObjectAttributeEmitter : public AttributeEmitter {
+ // This structure holds all attributes, accounting for
+ // their string/numeric value, so we can later emit them
+ // in declaration order, keeping all in the same vector.
+ struct AttributeItemType {
+ enum {
+ HiddenAttribute = 0,
+ NumericAttribute,
+ TextAttribute
+ } Type;
+ unsigned Tag;
+ unsigned IntValue;
+ StringRef StringValue;
+ } AttributeItem;
+
MCObjectStreamer &Streamer;
StringRef CurrentVendor;
- SmallString<64> Contents;
+ SmallVector<AttributeItemType, 64> Contents;
+
+ // Account for the ULEB/String size of each item,
+ // not just the number of items
+ size_t ContentsSize;
+ // FIXME: this should be in a more generic place, but
+ // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
+ size_t getULEBSize(int Value) {
+ size_t Size = 0;
+ do {
+ Value >>= 7;
+ Size += sizeof(int8_t); // Is this really necessary?
+ } while (Value);
+ return Size;
+ }
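+ // For example, getULEBSize(127) == 1 (127 fits in one 7-bit group) while
+ // getULEBSize(128) == 2 (128 >> 7 == 1 forces a second group); negative
+ // values are not expected here.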
public:
ObjectAttributeEmitter(MCObjectStreamer &Streamer_) :
- Streamer(Streamer_), CurrentVendor("") { }
+ Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { }
void MaybeSwitchVendor(StringRef Vendor) {
assert(!Vendor.empty() && "Vendor cannot be empty.");
@@ -124,20 +152,32 @@ namespace {
}
void EmitAttribute(unsigned Attribute, unsigned Value) {
- // FIXME: should be ULEB
- Contents += Attribute;
- Contents += Value;
+ AttributeItemType attr = {
+ AttributeItemType::NumericAttribute,
+ Attribute,
+ Value,
+ StringRef("")
+ };
+ ContentsSize += getULEBSize(Attribute);
+ ContentsSize += getULEBSize(Value);
+ Contents.push_back(attr);
}
void EmitTextAttribute(unsigned Attribute, StringRef String) {
- Contents += Attribute;
- Contents += UppercaseString(String);
- Contents += 0;
+ AttributeItemType attr = {
+ AttributeItemType::TextAttribute,
+ Attribute,
+ 0,
+ String
+ };
+ ContentsSize += getULEBSize(Attribute);
+ // String + \0
+ ContentsSize += String.size()+1;
+
+ Contents.push_back(attr);
}
void Finish() {
- const size_t ContentsSize = Contents.size();
-
// Vendor size + Vendor name + '\0'
const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
@@ -151,7 +191,23 @@ namespace {
Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
- Streamer.EmitBytes(Contents, 0);
+ // Size should have been accounted for already; now
+ // emit each field as its type (ULEB or String).
+ for (unsigned int i=0; i<Contents.size(); ++i) {
+ AttributeItemType item = Contents[i];
+ Streamer.EmitULEB128IntValue(item.Tag, 0);
+ switch (item.Type) {
+ case AttributeItemType::NumericAttribute:
+ Streamer.EmitULEB128IntValue(item.IntValue, 0);
+ break;
+ case AttributeItemType::TextAttribute:
+ Streamer.EmitBytes(UppercaseString(item.StringValue), 0);
+ Streamer.EmitIntValue(0, 1); // '\0'
+ break;
+ default:
+ assert(0 && "Invalid attribute type");
+ }
+ }
Contents.clear();
}
@@ -184,7 +240,7 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
// S registers are described as bit-pieces of a register
// S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
// S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
-
+
unsigned SReg = Reg - ARM::S0;
bool odd = SReg & 0x1;
unsigned Rx = 256 + (SReg >> 1);
@@ -209,12 +265,13 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
} else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
// Q registers Q0-Q15 are described by composing two D registers together.
- // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1) DW_OP_piece(8)
+ // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
+ // DW_OP_piece(8)
unsigned QReg = Reg - ARM::Q0;
unsigned D1 = 256 + 2 * QReg;
unsigned D2 = D1 + 1;
-
+
OutStreamer.AddComment("DW_OP_regx for Q register: D1");
EmitInt8(dwarf::DW_OP_regx);
EmitULEB128(D1);
@@ -233,6 +290,8 @@ void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
}
void ARMAsmPrinter::EmitFunctionEntryLabel() {
+ OutStreamer.ForceCodeRegion();
+
if (AFI->isThumbFunction()) {
OutStreamer.EmitAssemblerFlag(MCAF_Code16);
OutStreamer.EmitThumbFunc(CurrentFnSym);
@@ -395,16 +454,16 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
// This takes advantage of the 2 operand-ness of ldm/stm and that we've
// already got the operands in registers that are operands to the
// inline asm statement.
-
+
O << "{" << ARMInstPrinter::getRegisterName(RegBegin);
-
+
// FIXME: The register allocator not only may not have given us the
// registers in sequence, but may not be in ascending registers. This
// will require changes in the register allocator that'll need to be
// propagated down here if the operands change.
unsigned RegOps = OpNum + 1;
while (MI->getOperand(RegOps).isReg()) {
- O << ", "
+ O << ", "
<< ARMInstPrinter::getRegisterName(MI->getOperand(RegOps).getReg());
RegOps++;
}
@@ -413,14 +472,34 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
return false;
}
+ case 'R': // The most significant register of a pair.
+ case 'Q': { // The least significant register of a pair.
+ if (OpNum == 0)
+ return true;
+ const MachineOperand &FlagsOP = MI->getOperand(OpNum - 1);
+ if (!FlagsOP.isImm())
+ return true;
+ unsigned Flags = FlagsOP.getImm();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ if (NumVals != 2)
+ return true;
+ unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
+ if (RegOp >= MI->getNumOperands())
+ return true;
+ const MachineOperand &MO = MI->getOperand(RegOp);
+ if (!MO.isReg())
+ return true;
+ unsigned Reg = MO.getReg();
+ O << ARMInstPrinter::getRegisterName(Reg);
+ return false;
+ }
+
// These modifiers are not yet supported.
case 'p': // The high single-precision register of a VFP double-precision
// register.
case 'e': // The low doubleword register of a NEON quad register.
case 'f': // The high doubleword register of a NEON quad register.
case 'h': // A range of VFP/NEON registers suitable for VLD1/VST1.
- case 'Q': // The least significant register of a pair.
- case 'R': // The most significant register of a pair.
case 'H': // The highest-numbered register of a pair.
return true;
}
@@ -437,7 +516,7 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
-
+
switch (ExtraCode[0]) {
case 'A': // A memory operand for a VLD1/VST1 instruction.
default: return true; // Unknown modifier.
@@ -448,7 +527,7 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
}
-
+
const MachineOperand &MO = MI->getOperand(OpNum);
assert(MO.isReg() && "unexpected inline asm memory operand");
O << "[" << ARMInstPrinter::getRegisterName(MO.getReg()) << "]";
@@ -772,13 +851,19 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
OS << MAI->getPrivateGlobalPrefix() << "_LSDA_" << getFunctionNumber();
MCSym = OutContext.GetOrCreateSymbol(OS.str());
} else if (ACPV->isBlockAddress()) {
- MCSym = GetBlockAddressSymbol(ACPV->getBlockAddress());
+ const BlockAddress *BA =
+ cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress();
+ MCSym = GetBlockAddressSymbol(BA);
} else if (ACPV->isGlobalValue()) {
- const GlobalValue *GV = ACPV->getGV();
+ const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
MCSym = GetARMGVSymbol(GV);
+ } else if (ACPV->isMachineBasicBlock()) {
+ const MachineBasicBlock *MBB = cast<ARMConstantPoolMBB>(ACPV)->getMBB();
+ MCSym = MBB->getSymbol();
} else {
assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
- MCSym = GetExternalSymbolSymbol(ACPV->getSymbol());
+ const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+ MCSym = GetExternalSymbolSymbol(Sym);
}
// Create an MCSymbol for the reference.
@@ -822,6 +907,9 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
unsigned JTI = MO1.getIndex();
+ // Tag the jump table appropriately for precise disassembly.
+ OutStreamer.EmitJumpTable32Region();
+
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
OutStreamer.EmitLabel(JTISymbol);
@@ -847,6 +935,11 @@ void ARMAsmPrinter::EmitJumpTable(const MachineInstr *MI) {
Expr = MCBinaryExpr::CreateSub(Expr, MCSymbolRefExpr::Create(JTISymbol,
OutContext),
OutContext);
+ // If we're generating a table of Thumb addresses in the static
+ // relocation model, we need to add one to keep interworking correct.
+ else if (AFI->isThumbFunction())
+ Expr = MCBinaryExpr::CreateAdd(Expr, MCConstantExpr::Create(1,OutContext),
+ OutContext);
OutStreamer.EmitValue(Expr, 4);
}
}
@@ -859,6 +952,14 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
unsigned JTI = MO1.getIndex();
// Emit a label for the jump table.
+ if (MI->getOpcode() == ARM::t2TBB_JT) {
+ OutStreamer.EmitJumpTable8Region();
+ } else if (MI->getOpcode() == ARM::t2TBH_JT) {
+ OutStreamer.EmitJumpTable16Region();
+ } else {
+ OutStreamer.EmitJumpTable32Region();
+ }
+
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
OutStreamer.EmitLabel(JTISymbol);
@@ -881,6 +982,8 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
MCInst BrInst;
BrInst.setOpcode(ARM::t2B);
BrInst.addOperand(MCOperand::CreateExpr(MBBSymbolExpr));
+ BrInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+ BrInst.addOperand(MCOperand::CreateReg(0));
OutStreamer.EmitInstruction(BrInst);
continue;
}
@@ -994,7 +1097,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
i != NumOps; ++i)
RegList.push_back(MI->getOperand(i).getReg());
break;
- case ARM::STR_PRE:
+ case ARM::STR_PRE_IMM:
+ case ARM::STR_PRE_REG:
assert(MI->getOperand(2).getReg() == ARM::SP &&
"Only stack pointer as a source reg is supported");
RegList.push_back(SrcReg);
@@ -1074,10 +1178,20 @@ extern cl::opt<bool> EnableARMEHABI;
#include "ARMGenMCPseudoLowering.inc"
void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ if (MI->getOpcode() != ARM::CONSTPOOL_ENTRY)
+ OutStreamer.EmitCodeRegion();
+
+ // Emit unwinding stuff for frame-related instructions
+ if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup))
+ EmitUnwindingInstruction(MI);
+
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(OutStreamer, MI))
return;
+ assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+ "Pseudo flag setting opcode should be expanded early");
+
// Check for manual lowerings.
unsigned Opc = MI->getOpcode();
switch (Opc) {
@@ -1372,6 +1486,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
EmitAlignment(2);
+
+ // Mark the constant pool entry as data if we're not already in a data
+ // region.
+ OutStreamer.EmitDataRegion();
OutStreamer.EmitLabel(GetCPISymbol(LabelId));
const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
@@ -1379,7 +1497,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
else
EmitGlobalConstant(MCPE.Val.ConstVal);
-
return;
}
case ARM::t2BR_JT: {
@@ -1590,6 +1707,8 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
TmpInst.setOpcode(ARM::tB);
TmpInst.addOperand(MCOperand::CreateExpr(SymbolExpr));
+ TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
+ TmpInst.addOperand(MCOperand::CreateReg(0));
OutStreamer.EmitInstruction(TmpInst);
}
{
@@ -1804,10 +1923,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
- // Emit unwinding stuff for frame-related instructions
- if (EnableARMEHABI && MI->getFlag(MachineInstr::FrameSetup))
- EmitUnwindingInstruction(MI);
-
OutStreamer.EmitInstruction(TmpInst);
}
@@ -1815,20 +1930,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// Target Registry Stuff
//===----------------------------------------------------------------------===//
-static MCInstPrinter *createARMMCInstPrinter(const Target &T,
- unsigned SyntaxVariant,
- const MCAsmInfo &MAI) {
- if (SyntaxVariant == 0)
- return new ARMInstPrinter(MAI);
- return 0;
-}
-
// Force static initialization.
extern "C" void LLVMInitializeARMAsmPrinter() {
RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
-
- TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 649bd7d..408edfc 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -13,11 +13,11 @@
#include "ARMBaseInstrInfo.h"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMConstantPoolValue.h"
#include "ARMHazardRecognizer.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/GlobalValue.h"
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
@@ -45,6 +46,10 @@ static cl::opt<bool>
EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
cl::desc("Enable ARM 2-addr to 3-addr conv"));
+static cl::opt<bool>
+WidenVMOVS("widen-vmovs", cl::Hidden,
+ cl::desc("Widen ARM vmovs to vmovd when possible"));
+
/// ARM_MLxEntry - Record information about MLA / MLS instructions.
struct ARM_MLxEntry {
unsigned MLxOpc; // MLA / MLS opcode
@@ -171,7 +176,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
UpdateMI = BuildMI(MF, MI->getDebugLoc(),
- get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
+ get(isSub ? ARM::SUBrsi : ARM::ADDrsi), WBReg)
.addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
.addImm(Pred).addReg(0).addReg(0);
} else
@@ -399,6 +404,7 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
int BccOpc = !AFI->isThumbFunction()
? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+ bool isThumb = AFI->isThumbFunction() || AFI->isThumb2Function();
// Shouldn't be a fall through.
assert(TBB && "InsertBranch must not be told to insert a fallthrough");
@@ -406,9 +412,12 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
"ARM branch conditions have two components!");
if (FBB == 0) {
- if (Cond.empty()) // Unconditional branch?
- BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
- else
+ if (Cond.empty()) { // Unconditional branch?
+ if (isThumb)
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
+ else
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
+ } else
BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
.addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
return 1;
@@ -417,7 +426,10 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
// Two-way conditional branch.
BuildMI(&MBB, DL, get(BccOpc)).addMBB(TBB)
.addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
- BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
+ if (isThumb)
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).addImm(ARMCC::AL).addReg(0);
+ else
+ BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB);
return 2;
}
@@ -627,7 +639,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool SPRDest = ARM::SPRRegClass.contains(DestReg);
bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
- unsigned Opc;
+ unsigned Opc = 0;
if (SPRDest && SPRSrc)
Opc = ARM::VMOVS;
else if (GPRDest && SPRSrc)
@@ -638,19 +650,40 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = ARM::VMOVD;
else if (ARM::QPRRegClass.contains(DestReg, SrcReg))
Opc = ARM::VORRq;
- else if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVQQ;
- else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg))
- Opc = ARM::VMOVQQQQ;
- else
- llvm_unreachable("Impossible reg-to-reg copy");
- MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
- MIB.addReg(SrcReg, getKillRegState(KillSrc));
- if (Opc == ARM::VORRq)
+ if (Opc) {
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg);
MIB.addReg(SrcReg, getKillRegState(KillSrc));
- if (Opc != ARM::VMOVQQ && Opc != ARM::VMOVQQQQ)
+ if (Opc == ARM::VORRq)
+ MIB.addReg(SrcReg, getKillRegState(KillSrc));
AddDefaultPred(MIB);
+ return;
+ }
+
+ // Generate instructions for VMOVQQ and VMOVQQQQ pseudos in place.
+ if (ARM::QQPRRegClass.contains(DestReg, SrcReg) ||
+ ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ assert(ARM::qsub_0 + 3 == ARM::qsub_3 && "Expected contiguous enum.");
+ unsigned EndSubReg = ARM::QQPRRegClass.contains(DestReg, SrcReg) ?
+ ARM::qsub_1 : ARM::qsub_3;
+ for (unsigned i = ARM::qsub_0, e = EndSubReg + 1; i != e; ++i) {
+ unsigned Dst = TRI->getSubReg(DestReg, i);
+ unsigned Src = TRI->getSubReg(SrcReg, i);
+ MachineInstrBuilder Mov =
+ AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VORRq))
+ .addReg(Dst, RegState::Define)
+ .addReg(Src, getKillRegState(KillSrc))
+ .addReg(Src, getKillRegState(KillSrc)));
+ if (i == EndSubReg) {
+ Mov->addRegisterDefined(DestReg, TRI);
+ if (KillSrc)
+ Mov->addRegisterKilled(SrcReg, TRI);
+ }
+ }
+ return;
+ }
+ llvm_unreachable("Impossible reg-to-reg copy");
}
static const
@@ -683,82 +716,84 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectSize(FI),
Align);
- // tGPR is used sometimes in ARM instructions that need to avoid using
- // certain registers. Just treat it as GPR here. Likewise, rGPR.
- if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass
- || RC == ARM::rGPRRegisterClass)
- RC = ARM::GPRRegisterClass;
-
- switch (RC->getID()) {
- case ARM::GPRRegClassID:
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
+ switch (RC->getSize()) {
+ case 4:
+ if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STRi12))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
- break;
- case ARM::SPRRegClassID:
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
+ } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
- break;
- case ARM::DPRRegClassID:
- case ARM::DPR_VFP2RegClassID:
- case ARM::DPR_8RegClassID:
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 8:
+ if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
- break;
- case ARM::QPRRegClassID:
- case ARM::QPR_VFP2RegClassID:
- case ARM::QPR_8RegClassID:
- if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64Pseudo))
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 16:
+ if (ARM::QPRRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64Pseudo))
.addFrameIndex(FI).addImm(16)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO));
- } else {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
+ } else {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMQIA))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
.addMemOperand(MMO));
- }
- break;
- case ARM::QQPRRegClassID:
- case ARM::QQPR_VFP2RegClassID:
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- // FIXME: It's possible to only store part of the QQ register if the
- // spilled def has a sub-register index.
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 32:
+ if (ARM::QQPRRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ // FIXME: It's possible to only store part of the QQ register if the
+ // spilled def has a sub-register index.
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1d64QPseudo))
.addFrameIndex(FI).addImm(16)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO));
- } else {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ } else {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
.addFrameIndex(FI))
- .addMemOperand(MMO);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
- AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
- }
- break;
- case ARM::QQQQPRRegClassID: {
- MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
- .addFrameIndex(FI))
- .addMemOperand(MMO);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
- MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
- AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
- break;
- }
- default:
- llvm_unreachable("Unknown regclass!");
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+ AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ case 64:
+ if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTMDIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_0, getKillRegState(isKill), TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_1, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_2, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_3, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_4, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_5, 0, TRI);
+ MIB = AddDReg(MIB, SrcReg, ARM::dsub_6, 0, TRI);
+ AddDReg(MIB, SrcReg, ARM::dsub_7, 0, TRI);
+ } else
+ llvm_unreachable("Unknown reg class!");
+ break;
+ default:
+ llvm_unreachable("Unknown reg class!");
}
}
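
[editor's note] The rewritten spill code dispatches on the register class's spill size in bytes instead of enumerating class IDs, then narrows within each size bucket via hasSubClassEq. A standalone sketch of the outer size dispatch only; the opcode strings are placeholders, and the real code further distinguishes e.g. GPR vs. SPR inside the 4-byte case:

#include <cstdio>
#include <cstdlib>

// Pick a store opcode from a spill size in bytes, the way the rewritten
// storeRegToStackSlot() does. Names stand in for the ARM::* opcodes.
const char *storeOpcodeForSize(unsigned Bytes) {
  switch (Bytes) {
  case 4:  return "STRi12";   // GPR (the real code also handles SPR here)
  case 8:  return "VSTRD";    // DPR
  case 16: return "VSTMQIA";  // QPR (or VST1q64Pseudo when realigned)
  case 32: return "VSTMDIA";  // QQPR, spilled as four D sub-registers
  case 64: return "VSTMDIA";  // QQQQPR, spilled as eight D sub-registers
  default:
    std::fprintf(stderr, "Unknown reg class!\n");
    std::abort();
  }
}

int main() {
  std::printf("%s\n", storeOpcodeForSize(16)); // VSTMQIA
}
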
@@ -809,6 +844,12 @@ ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
return 0;
}
+unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ const MachineMemOperand *Dummy;
+ return MI->getDesc().mayStore() && hasStoreToStackSlot(MI, Dummy, FrameIndex);
+}
+
void ARMBaseInstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
@@ -826,72 +867,77 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MFI.getObjectSize(FI),
Align);
- // tGPR is used sometimes in ARM instructions that need to avoid using
- // certain registers. Just treat it as GPR here.
- if (RC == ARM::tGPRRegisterClass || RC == ARM::tcGPRRegisterClass
- || RC == ARM::rGPRRegisterClass)
- RC = ARM::GPRRegisterClass;
-
- switch (RC->getID()) {
- case ARM::GPRRegClassID:
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
+ switch (RC->getSize()) {
+ case 4:
+ if (ARM::GPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDRi12), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
- break;
- case ARM::SPRRegClassID:
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+
+ } else if (ARM::SPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else
+ llvm_unreachable("Unknown reg class!");
break;
- case ARM::DPRRegClassID:
- case ARM::DPR_VFP2RegClassID:
- case ARM::DPR_8RegClassID:
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+ case 8:
+ if (ARM::DPRRegClass.hasSubClassEq(RC)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else
+ llvm_unreachable("Unknown reg class!");
break;
- case ARM::QPRRegClassID:
- case ARM::QPR_VFP2RegClassID:
- case ARM::QPR_8RegClassID:
- if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64Pseudo), DestReg)
+ case 16:
+ if (ARM::QPRRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().needsStackRealignment(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64Pseudo), DestReg)
.addFrameIndex(FI).addImm(16)
.addMemOperand(MMO));
- } else {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
- .addFrameIndex(FI)
- .addMemOperand(MMO));
- }
+ } else {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMQIA), DestReg)
+ .addFrameIndex(FI)
+ .addMemOperand(MMO));
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
break;
- case ARM::QQPRRegClassID:
- case ARM::QQPR_VFP2RegClassID:
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
- AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
+ case 32:
+ if (ARM::QQPRRegClass.hasSubClassEq(RC)) {
+ if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
.addFrameIndex(FI).addImm(16)
.addMemOperand(MMO));
- } else {
- MachineInstrBuilder MIB =
+ } else {
+ MachineInstrBuilder MIB =
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
.addFrameIndex(FI))
- .addMemOperand(MMO);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
- AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
- }
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
+ MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
+ }
+ } else
+ llvm_unreachable("Unknown reg class!");
break;
- case ARM::QQQQPRRegClassID: {
- MachineInstrBuilder MIB =
+ case 64:
+ if (ARM::QQQQPRRegClass.hasSubClassEq(RC)) {
+ MachineInstrBuilder MIB =
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDMDIA))
.addFrameIndex(FI))
- .addMemOperand(MMO);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI);
- MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI);
- AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI);
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_0, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_1, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_2, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_3, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_4, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_5, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_6, RegState::Define, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::dsub_7, RegState::Define, TRI);
+ MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
+ } else
+ llvm_unreachable("Unknown reg class!");
break;
- }
default:
llvm_unreachable("Unknown regclass!");
}
@@ -944,6 +990,78 @@ ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
return 0;
}
+unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const {
+ const MachineMemOperand *Dummy;
+ return MI->getDesc().mayLoad() && hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+}
+
+bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
+ // This hook gets to expand COPY instructions before they become
+ // copyPhysReg() calls. Look for VMOVS instructions that can legally be
+ // widened to VMOVD. We prefer the VMOVD when possible because it may be
+ // changed into a VORR that can go down the NEON pipeline.
+ if (!WidenVMOVS || !MI->isCopy())
+ return false;
+
+ // Look for a copy between even S-registers. That is where we keep floats
+ // when using NEON v2f32 instructions for f32 arithmetic.
+ unsigned DstRegS = MI->getOperand(0).getReg();
+ unsigned SrcRegS = MI->getOperand(1).getReg();
+ if (!ARM::SPRRegClass.contains(DstRegS, SrcRegS))
+ return false;
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned DstRegD = TRI->getMatchingSuperReg(DstRegS, ARM::ssub_0,
+ &ARM::DPRRegClass);
+ unsigned SrcRegD = TRI->getMatchingSuperReg(SrcRegS, ARM::ssub_0,
+ &ARM::DPRRegClass);
+ if (!DstRegD || !SrcRegD)
+ return false;
+
+ // We want to widen this into a DstRegD = VMOVD SrcRegD copy. This is only
+ // legal if the COPY already defines the full DstRegD, and it isn't a
+ // sub-register insertion.
+ if (!MI->definesRegister(DstRegD, TRI) || MI->readsRegister(DstRegD, TRI))
+ return false;
+
+ // A dead copy shouldn't show up here, but reject it just in case.
+ if (MI->getOperand(0).isDead())
+ return false;
+
+ // All clear, widen the COPY.
+ DEBUG(dbgs() << "widening: " << *MI);
+
+ // Get rid of the old <imp-def> of DstRegD. Leave it if it defines a Q-reg
+ // or some other super-register.
+ int ImpDefIdx = MI->findRegisterDefOperandIdx(DstRegD);
+ if (ImpDefIdx != -1)
+ MI->RemoveOperand(ImpDefIdx);
+
+ // Change the opcode and operands.
+ MI->setDesc(get(ARM::VMOVD));
+ MI->getOperand(0).setReg(DstRegD);
+ MI->getOperand(1).setReg(SrcRegD);
+ AddDefaultPred(MachineInstrBuilder(MI));
+
+ // We are now reading SrcRegD instead of SrcRegS. This may upset the
+ // register scavenger and machine verifier, so we need to indicate that we
+ // are reading an undefined value from SrcRegD, but a proper value from
+ // SrcRegS.
+ MI->getOperand(1).setIsUndef();
+ MachineInstrBuilder(MI).addReg(SrcRegS, RegState::Implicit);
+
+ // SrcRegD may actually contain an unrelated value in the ssub_1
+ // sub-register. Don't kill it. Only kill the ssub_0 sub-register.
+ if (MI->getOperand(1).isKill()) {
+ MI->getOperand(1).setIsKill(false);
+ MI->addRegisterKilled(SrcRegS, TRI, true);
+ }
+
+ DEBUG(dbgs() << "replaced by: " << *MI);
+ return true;
+}
+
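
[editor's note] The widening above relies on VFP register aliasing: each D register overlaps two consecutive S registers (D0 = S0:S1, D1 = S2:S3, ...), so only a copy whose operands both sit in the even (ssub_0) lane has a matching D super-register. A standalone sketch of that index arithmetic, with plain integers standing in for the register enums; it models S0-S31/D0-D15 only, since higher D registers have no S aliases:

#include <cstdio>

// For an S-register index, return the D register whose low (ssub_0) lane
// it occupies, or -1 if it sits in the high lane and has no such super-reg.
// Mirrors the getMatchingSuperReg(..., ssub_0, ...) check in the pass.
int matchingDReg(int SIdx) {
  return (SIdx % 2 == 0) ? SIdx / 2 : -1;
}

int main() {
  std::printf("S4 -> D%d\n", matchingDReg(4));  // S4 is the low lane of D2
  std::printf("S5 -> D%d\n", matchingDReg(5));  // S5 is a high lane: -1
}
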
MachineInstr*
ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
int FrameIx, uint64_t Offset,
@@ -974,17 +1092,24 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
// instructions, so that's probably OK, but is PIC always correct when
// we get here?
if (ACPV->isGlobalValue())
- NewCPV = new ARMConstantPoolValue(ACPV->getGV(), PCLabelId,
- ARMCP::CPValue, 4);
+ NewCPV = ARMConstantPoolConstant::
+ Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId,
+ ARMCP::CPValue, 4);
else if (ACPV->isExtSymbol())
- NewCPV = new ARMConstantPoolValue(MF.getFunction()->getContext(),
- ACPV->getSymbol(), PCLabelId, 4);
+ NewCPV = ARMConstantPoolSymbol::
+ Create(MF.getFunction()->getContext(),
+ cast<ARMConstantPoolSymbol>(ACPV)->getSymbol(), PCLabelId, 4);
else if (ACPV->isBlockAddress())
- NewCPV = new ARMConstantPoolValue(ACPV->getBlockAddress(), PCLabelId,
- ARMCP::CPBlockAddress, 4);
+ NewCPV = ARMConstantPoolConstant::
+ Create(cast<ARMConstantPoolConstant>(ACPV)->getBlockAddress(), PCLabelId,
+ ARMCP::CPBlockAddress, 4);
else if (ACPV->isLSDA())
- NewCPV = new ARMConstantPoolValue(MF.getFunction(), PCLabelId,
- ARMCP::CPLSDA, 4);
+ NewCPV = ARMConstantPoolConstant::Create(MF.getFunction(), PCLabelId,
+ ARMCP::CPLSDA, 4);
+ else if (ACPV->isMachineBasicBlock())
+ NewCPV = ARMConstantPoolMBB::
+ Create(MF.getFunction()->getContext(),
+ cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4);
else
llvm_unreachable("Unexpected ARM constantpool value type!!");
CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
@@ -1289,7 +1414,7 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
// Attempt to estimate the relative costs of predication versus branching.
unsigned TUnpredCost = Probability.getNumerator() * TCycles;
TUnpredCost /= Probability.getDenominator();
-
+
uint32_t Comp = Probability.getDenominator() - Probability.getNumerator();
unsigned FUnpredCost = Comp * FCycles;
FUnpredCost /= Probability.getDenominator();
@@ -1330,6 +1455,57 @@ int llvm::getMatchingCondBranchOpcode(int Opc) {
}
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether the
+/// instruction is encoded with an 'S' bit is determined by the optional CPSR
+/// def operand.
+///
+/// This will go away once we can teach tblgen how to set the optional CPSR def
+/// operand itself.
+struct AddSubFlagsOpcodePair {
+ unsigned PseudoOpc;
+ unsigned MachineOpc;
+};
+
+static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
+ {ARM::ADDSri, ARM::ADDri},
+ {ARM::ADDSrr, ARM::ADDrr},
+ {ARM::ADDSrsi, ARM::ADDrsi},
+ {ARM::ADDSrsr, ARM::ADDrsr},
+
+ {ARM::SUBSri, ARM::SUBri},
+ {ARM::SUBSrr, ARM::SUBrr},
+ {ARM::SUBSrsi, ARM::SUBrsi},
+ {ARM::SUBSrsr, ARM::SUBrsr},
+
+ {ARM::RSBSri, ARM::RSBri},
+ {ARM::RSBSrr, ARM::RSBrr},
+ {ARM::RSBSrsi, ARM::RSBrsi},
+ {ARM::RSBSrsr, ARM::RSBrsr},
+
+ {ARM::t2ADDSri, ARM::t2ADDri},
+ {ARM::t2ADDSrr, ARM::t2ADDrr},
+ {ARM::t2ADDSrs, ARM::t2ADDrs},
+
+ {ARM::t2SUBSri, ARM::t2SUBri},
+ {ARM::t2SUBSrr, ARM::t2SUBrr},
+ {ARM::t2SUBSrs, ARM::t2SUBrs},
+
+ {ARM::t2RSBSri, ARM::t2RSBri},
+ {ARM::t2RSBSrs, ARM::t2RSBrs},
+};
+
+unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
+ static const int NPairs =
+ sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair);
+ for (AddSubFlagsOpcodePair *OpcPair = &AddSubFlagsOpcodeMap[0],
+ *End = &AddSubFlagsOpcodeMap[NPairs]; OpcPair != End; ++OpcPair) {
+ if (OldOpc == OpcPair->PseudoOpc) {
+ return OpcPair->MachineOpc;
+ }
+ }
+ return 0;
+}
+
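
[editor's note] convertAddSubFlagsOpcode() is a plain linear scan over the static pseudo-to-real table, returning 0 when the opcode is not a flag-setting pseudo. A minimal standalone sketch of the same pattern; the opcode numbers are made up and merely stand in for the ARM::* enum values:

#include <cstdio>

// Hypothetical opcode numbers standing in for ARM::ADDSri, ARM::ADDri, etc.
enum { PSEUDO_ADDSri = 100, REAL_ADDri = 1, PSEUDO_SUBSri = 101, REAL_SUBri = 2 };

struct OpcodePair { unsigned PseudoOpc; unsigned MachineOpc; };

static const OpcodePair Map[] = {
  {PSEUDO_ADDSri, REAL_ADDri},
  {PSEUDO_SUBSri, REAL_SUBri},
};

// Return the real opcode for a flag-setting pseudo, or 0 if Opc is not in
// the table (the same contract as convertAddSubFlagsOpcode).
unsigned convert(unsigned Opc) {
  const unsigned N = sizeof(Map) / sizeof(Map[0]);
  for (unsigned i = 0; i != N; ++i)
    if (Map[i].PseudoOpc == Opc)
      return Map[i].MachineOpc;
  return 0;
}

int main() {
  std::printf("%u %u\n", convert(PSEUDO_ADDSri), convert(42)); // prints "1 0"
}
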
void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI, DebugLoc dl,
unsigned DestReg, unsigned BaseReg, int NumBytes,
@@ -1862,7 +2038,6 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
case ARM::STMIB_UPD:
case ARM::tLDMIA:
case ARM::tLDMIA_UPD:
- case ARM::tSTMIA:
case ARM::tSTMIA_UPD:
case ARM::tPOP_RET:
case ARM::tPOP:
@@ -2128,7 +2303,6 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
case ARM::STMDA_UPD:
case ARM::STMDB_UPD:
case ARM::STMIB_UPD:
- case ARM::tSTMIA:
case ARM::tSTMIA_UPD:
case ARM::tPOP_RET:
case ARM::tPOP:
@@ -2567,6 +2741,15 @@ hasLowDefLatency(const InstrItineraryData *ItinData,
return false;
}
+bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr *MI,
+ StringRef &ErrInfo) const {
+ if (convertAddSubFlagsOpcode(MI->getOpcode())) {
+ ErrInfo = "Pseudo flag setting opcodes only exist in Selection DAG";
+ return false;
+ }
+ return true;
+}
+
bool
ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
unsigned &AddSubOpc,
@@ -2582,3 +2765,66 @@ ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
HasLane = Entry.HasLane;
return true;
}
+
+//===----------------------------------------------------------------------===//
+// Execution domains.
+//===----------------------------------------------------------------------===//
+//
+// Some instructions go down the NEON pipeline, some go down the VFP pipeline,
+// and some can go down both. The vmov instructions go down the VFP pipeline,
+// but they can be changed to vorr equivalents that are executed by the NEON
+// pipeline.
+//
+// We use the following execution domain numbering:
+//
+enum ARMExeDomain {
+ ExeGeneric = 0,
+ ExeVFP = 1,
+ ExeNEON = 2
+};
+//
+// Also see ARMInstrFormats.td and Domain* enums in ARMBaseInfo.h
+//
+std::pair<uint16_t, uint16_t>
+ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
+ // VMOVD is a VFP instruction, but can be changed to NEON if it isn't
+ // predicated.
+ if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
+ return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
+
+ // No other instructions can be swizzled, so just determine their domain.
+ unsigned Domain = MI->getDesc().TSFlags & ARMII::DomainMask;
+
+ if (Domain & ARMII::DomainNEON)
+ return std::make_pair(ExeNEON, 0);
+
+ // Certain instructions can go either way on Cortex-A8.
+ // Treat them as NEON instructions.
+ if ((Domain & ARMII::DomainNEONA8) && Subtarget.isCortexA8())
+ return std::make_pair(ExeNEON, 0);
+
+ if (Domain & ARMII::DomainVFP)
+ return std::make_pair(ExeVFP, 0);
+
+ return std::make_pair(ExeGeneric, 0);
+}
+
+void
+ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
+ // We only know how to change VMOVD into VORR.
+ assert(MI->getOpcode() == ARM::VMOVD && "Can only swizzle VMOVD");
+ if (Domain != ExeNEON)
+ return;
+
+ // Zap the predicate operands.
+ assert(!isPredicated(MI) && "Cannot predicate a VORRd");
+ MI->RemoveOperand(3);
+ MI->RemoveOperand(2);
+
+ // Change to a VORRd which requires two identical use operands.
+ MI->setDesc(get(ARM::VORRd));
+
+ // Add the extra source operand and new predicates.
+ // This will go before any implicit ops.
+ AddDefaultPred(MachineInstrBuilder(MI).addOperand(MI->getOperand(1)));
+}
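
[editor's note] The pair returned by getExecutionDomain() encodes (current domain, bitmask of domains the instruction could be rewritten into); a zero mask means the instruction cannot be swizzled. A small standalone sketch decoding that encoding, reusing the enum values above (the describe() helper is invented for illustration):

#include <cstdio>
#include <cstdint>
#include <utility>

enum ARMExeDomain { ExeGeneric = 0, ExeVFP = 1, ExeNEON = 2 };

// Decode an (active domain, candidate mask) pair as produced by
// getExecutionDomain(). A zero mask means "leave the instruction alone".
void describe(std::pair<uint16_t, uint16_t> D) {
  static const char *Names[] = { "Generic", "VFP", "NEON" };
  std::printf("runs on %s;", Names[D.first]);
  if (D.second == 0) {
    std::printf(" not swizzlable\n");
    return;
  }
  std::printf(" may be rewritten to:");
  for (unsigned Dom = 0; Dom != 3; ++Dom)
    if (D.second & (1u << Dom))
      std::printf(" %s", Names[Dom]);
  std::printf("\n");
}

int main() {
  // An unpredicated VMOVD: currently VFP, convertible to VFP or NEON.
  describe(std::make_pair(ExeVFP, (1 << ExeVFP) | (1 << ExeNEON)));
  // A fixed NEON instruction.
  describe(std::make_pair(ExeNEON, 0));
}
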
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 507e897..0f9f321 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -27,146 +27,6 @@ namespace llvm {
class ARMSubtarget;
class ARMBaseRegisterInfo;
-/// ARMII - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace ARMII {
- enum {
- //===------------------------------------------------------------------===//
- // Instruction Flags.
-
- //===------------------------------------------------------------------===//
- // This four-bit field describes the addressing mode used.
- AddrModeMask = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
-
- // IndexMode - Unindex, pre-indexed, or post-indexed are valid for load
- // and store ops only. Generic "updating" flag is used for ld/st multiple.
- // The index mode enums are declared in ARMBaseInfo.h
- IndexModeShift = 5,
- IndexModeMask = 3 << IndexModeShift,
-
- //===------------------------------------------------------------------===//
- // Instruction encoding formats.
- //
- FormShift = 7,
- FormMask = 0x3f << FormShift,
-
- // Pseudo instructions
- Pseudo = 0 << FormShift,
-
- // Multiply instructions
- MulFrm = 1 << FormShift,
-
- // Branch instructions
- BrFrm = 2 << FormShift,
- BrMiscFrm = 3 << FormShift,
-
- // Data Processing instructions
- DPFrm = 4 << FormShift,
- DPSoRegFrm = 5 << FormShift,
-
- // Load and Store
- LdFrm = 6 << FormShift,
- StFrm = 7 << FormShift,
- LdMiscFrm = 8 << FormShift,
- StMiscFrm = 9 << FormShift,
- LdStMulFrm = 10 << FormShift,
-
- LdStExFrm = 11 << FormShift,
-
- // Miscellaneous arithmetic instructions
- ArithMiscFrm = 12 << FormShift,
- SatFrm = 13 << FormShift,
-
- // Extend instructions
- ExtFrm = 14 << FormShift,
-
- // VFP formats
- VFPUnaryFrm = 15 << FormShift,
- VFPBinaryFrm = 16 << FormShift,
- VFPConv1Frm = 17 << FormShift,
- VFPConv2Frm = 18 << FormShift,
- VFPConv3Frm = 19 << FormShift,
- VFPConv4Frm = 20 << FormShift,
- VFPConv5Frm = 21 << FormShift,
- VFPLdStFrm = 22 << FormShift,
- VFPLdStMulFrm = 23 << FormShift,
- VFPMiscFrm = 24 << FormShift,
-
- // Thumb format
- ThumbFrm = 25 << FormShift,
-
- // Miscelleaneous format
- MiscFrm = 26 << FormShift,
-
- // NEON formats
- NGetLnFrm = 27 << FormShift,
- NSetLnFrm = 28 << FormShift,
- NDupFrm = 29 << FormShift,
- NLdStFrm = 30 << FormShift,
- N1RegModImmFrm= 31 << FormShift,
- N2RegFrm = 32 << FormShift,
- NVCVTFrm = 33 << FormShift,
- NVDupLnFrm = 34 << FormShift,
- N2RegVShLFrm = 35 << FormShift,
- N2RegVShRFrm = 36 << FormShift,
- N3RegFrm = 37 << FormShift,
- N3RegVShFrm = 38 << FormShift,
- NVExtFrm = 39 << FormShift,
- NVMulSLFrm = 40 << FormShift,
- NVTBLFrm = 41 << FormShift,
-
- //===------------------------------------------------------------------===//
- // Misc flags.
-
- // UnaryDP - Indicates this is a unary data processing instruction, i.e.
- // it doesn't have a Rn operand.
- UnaryDP = 1 << 13,
-
- // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
- // a 16-bit Thumb instruction if certain conditions are met.
- Xform16Bit = 1 << 14,
-
- //===------------------------------------------------------------------===//
- // Code domain.
- DomainShift = 15,
- DomainMask = 7 << DomainShift,
- DomainGeneral = 0 << DomainShift,
- DomainVFP = 1 << DomainShift,
- DomainNEON = 2 << DomainShift,
- DomainNEONA8 = 4 << DomainShift,
-
- //===------------------------------------------------------------------===//
- // Field shifts - such shifts are used to set field while generating
- // machine instructions.
- //
- // FIXME: This list will need adjusting/fixing as the MC code emitter
- // takes shape and the ARMCodeEmitter.cpp bits go away.
- ShiftTypeShift = 4,
-
- M_BitShift = 5,
- ShiftImmShift = 5,
- ShiftShift = 7,
- N_BitShift = 7,
- ImmHiShift = 8,
- SoRotImmShift = 8,
- RegRsShift = 8,
- ExtRotImmShift = 10,
- RegRdLoShift = 12,
- RegRdShift = 12,
- RegRdHiShift = 16,
- RegRnShift = 16,
- S_BitShift = 20,
- W_BitShift = 21,
- AM3_I_BitShift = 22,
- D_BitShift = 22,
- U_BitShift = 23,
- P_BitShift = 24,
- I_BitShift = 25,
- CondShift = 28
- };
-}
-
class ARMBaseInstrInfo : public ARMGenInstrInfo {
const ARMSubtarget &Subtarget;
@@ -241,6 +101,10 @@ public:
int &FrameIndex) const;
virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const;
+ virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const;
+ virtual unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
+ int &FrameIndex) const;
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
@@ -259,6 +123,8 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const;
+ virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
int FrameIx,
uint64_t Offset,
@@ -346,6 +212,12 @@ public:
int getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
SDNode *UseNode, unsigned UseIdx) const;
+
+ /// VFP/NEON execution domains.
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr *MI) const;
+ void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+
private:
int getVLDMDefCycle(const InstrItineraryData *ItinData,
const MCInstrDesc &DefMCID,
@@ -382,6 +254,9 @@ private:
bool hasLowDefLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx) const;
+ /// verifyInstruction - Perform target specific instruction verification.
+ bool verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const;
+
private:
/// Modeling special VFP / NEON fp MLA / MLS hazards.
@@ -464,6 +339,12 @@ ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
int getMatchingCondBranchOpcode(int Opc);
+
+/// Map pseudo instructions that imply an 'S' bit onto real opcodes. Whether
+/// the instruction is encoded with an 'S' bit is determined by the optional
+/// CPSR def operand.
+unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
+
/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
/// instructions to materialize a destreg = basereg + immediate in ARM / Thumb2
/// code.
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index ba42295..7c42342 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -12,13 +12,13 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMFrameLowering.h"
#include "ARMInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Support/Debug.h"
@@ -57,7 +56,7 @@ EnableBasePointer("arm-use-base-pointer", cl::Hidden, cl::init(true),
ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
const ARMSubtarget &sti)
- : ARMGenRegisterInfo(), TII(tii), STI(sti),
+ : ARMGenRegisterInfo(ARM::LR), TII(tii), STI(sti),
FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11),
BasePtr(ARM::R6) {
}
@@ -354,7 +353,7 @@ const TargetRegisterClass*
ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
const {
const TargetRegisterClass *Super = RC;
- TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+ TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
do {
switch (Super->getID()) {
case ARM::GPRRegClassID:
@@ -375,6 +374,13 @@ ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const {
return ARM::GPRRegisterClass;
}
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+ if (RC == &ARM::CCRRegClass)
+ return 0; // Can't copy CCR registers.
+ return RC;
+}
+
unsigned
ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
@@ -487,19 +493,19 @@ ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC,
if (!TFI->hasFP(MF)) {
if (!STI.isR9Reserved())
- return ArrayRef<unsigned>(GPREven1);
+ return makeArrayRef(GPREven1);
else
- return ArrayRef<unsigned>(GPREven4);
+ return makeArrayRef(GPREven4);
} else if (FramePtr == ARM::R7) {
if (!STI.isR9Reserved())
- return ArrayRef<unsigned>(GPREven2);
+ return makeArrayRef(GPREven2);
else
- return ArrayRef<unsigned>(GPREven5);
+ return makeArrayRef(GPREven5);
} else { // FramePtr == ARM::R11
if (!STI.isR9Reserved())
- return ArrayRef<unsigned>(GPREven3);
+ return makeArrayRef(GPREven3);
else
- return ArrayRef<unsigned>(GPREven6);
+ return makeArrayRef(GPREven6);
}
} else if (HintType == ARMRI::RegPairOdd) {
if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
@@ -509,19 +515,19 @@ ARMBaseRegisterInfo::getRawAllocationOrder(const TargetRegisterClass *RC,
if (!TFI->hasFP(MF)) {
if (!STI.isR9Reserved())
- return ArrayRef<unsigned>(GPROdd1);
+ return makeArrayRef(GPROdd1);
else
- return ArrayRef<unsigned>(GPROdd4);
+ return makeArrayRef(GPROdd4);
} else if (FramePtr == ARM::R7) {
if (!STI.isR9Reserved())
- return ArrayRef<unsigned>(GPROdd2);
+ return makeArrayRef(GPROdd2);
else
- return ArrayRef<unsigned>(GPROdd5);
+ return makeArrayRef(GPROdd5);
} else { // FramePtr == ARM::R11
if (!STI.isR9Reserved())
- return ArrayRef<unsigned>(GPROdd3);
+ return makeArrayRef(GPROdd3);
else
- return ArrayRef<unsigned>(GPROdd6);
+ return makeArrayRef(GPROdd6);
}
}
return RC->getRawAllocationOrder(MF);
@@ -649,10 +655,6 @@ cannotEliminateFrame(const MachineFunction &MF) const {
|| needsStackRealignment(MF);
}
-unsigned ARMBaseRegisterInfo::getRARegister() const {
- return ARM::LR;
-}
-
unsigned
ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -672,99 +674,54 @@ unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
return 0;
}
-int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int ARMBaseRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
- return ARMGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
-
unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
const MachineFunction &MF) const {
switch (Reg) {
default: break;
// Return 0 if either register of the pair is a special register.
// So no R12, etc.
- case ARM::R1:
- return ARM::R0;
- case ARM::R3:
- return ARM::R2;
- case ARM::R5:
- return ARM::R4;
+ case ARM::R1: return ARM::R0;
+ case ARM::R3: return ARM::R2;
+ case ARM::R5: return ARM::R4;
case ARM::R7:
return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6))
? 0 : ARM::R6;
- case ARM::R9:
- return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8;
- case ARM::R11:
- return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
-
- case ARM::S1:
- return ARM::S0;
- case ARM::S3:
- return ARM::S2;
- case ARM::S5:
- return ARM::S4;
- case ARM::S7:
- return ARM::S6;
- case ARM::S9:
- return ARM::S8;
- case ARM::S11:
- return ARM::S10;
- case ARM::S13:
- return ARM::S12;
- case ARM::S15:
- return ARM::S14;
- case ARM::S17:
- return ARM::S16;
- case ARM::S19:
- return ARM::S18;
- case ARM::S21:
- return ARM::S20;
- case ARM::S23:
- return ARM::S22;
- case ARM::S25:
- return ARM::S24;
- case ARM::S27:
- return ARM::S26;
- case ARM::S29:
- return ARM::S28;
- case ARM::S31:
- return ARM::S30;
-
- case ARM::D1:
- return ARM::D0;
- case ARM::D3:
- return ARM::D2;
- case ARM::D5:
- return ARM::D4;
- case ARM::D7:
- return ARM::D6;
- case ARM::D9:
- return ARM::D8;
- case ARM::D11:
- return ARM::D10;
- case ARM::D13:
- return ARM::D12;
- case ARM::D15:
- return ARM::D14;
- case ARM::D17:
- return ARM::D16;
- case ARM::D19:
- return ARM::D18;
- case ARM::D21:
- return ARM::D20;
- case ARM::D23:
- return ARM::D22;
- case ARM::D25:
- return ARM::D24;
- case ARM::D27:
- return ARM::D26;
- case ARM::D29:
- return ARM::D28;
- case ARM::D31:
- return ARM::D30;
+  case ARM::R9: return isReservedReg(MF, ARM::R9)  ? 0 : ARM::R8;
+ case ARM::R11: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
+
+ case ARM::S1: return ARM::S0;
+ case ARM::S3: return ARM::S2;
+ case ARM::S5: return ARM::S4;
+ case ARM::S7: return ARM::S6;
+ case ARM::S9: return ARM::S8;
+ case ARM::S11: return ARM::S10;
+ case ARM::S13: return ARM::S12;
+ case ARM::S15: return ARM::S14;
+ case ARM::S17: return ARM::S16;
+ case ARM::S19: return ARM::S18;
+ case ARM::S21: return ARM::S20;
+ case ARM::S23: return ARM::S22;
+ case ARM::S25: return ARM::S24;
+ case ARM::S27: return ARM::S26;
+ case ARM::S29: return ARM::S28;
+ case ARM::S31: return ARM::S30;
+
+ case ARM::D1: return ARM::D0;
+ case ARM::D3: return ARM::D2;
+ case ARM::D5: return ARM::D4;
+ case ARM::D7: return ARM::D6;
+ case ARM::D9: return ARM::D8;
+ case ARM::D11: return ARM::D10;
+ case ARM::D13: return ARM::D12;
+ case ARM::D15: return ARM::D14;
+ case ARM::D17: return ARM::D16;
+ case ARM::D19: return ARM::D18;
+ case ARM::D21: return ARM::D20;
+ case ARM::D23: return ARM::D22;
+ case ARM::D25: return ARM::D24;
+ case ARM::D27: return ARM::D26;
+ case ARM::D29: return ARM::D28;
+ case ARM::D31: return ARM::D30;
}
return 0;
@@ -776,85 +733,48 @@ unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
default: break;
// Return 0 if either register of the pair is a special register.
// So no R12, etc.
- case ARM::R0:
- return ARM::R1;
- case ARM::R2:
- return ARM::R3;
- case ARM::R4:
- return ARM::R5;
+ case ARM::R0: return ARM::R1;
+ case ARM::R2: return ARM::R3;
+ case ARM::R4: return ARM::R5;
case ARM::R6:
return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6))
? 0 : ARM::R7;
- case ARM::R8:
- return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9;
- case ARM::R10:
- return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
-
- case ARM::S0:
- return ARM::S1;
- case ARM::S2:
- return ARM::S3;
- case ARM::S4:
- return ARM::S5;
- case ARM::S6:
- return ARM::S7;
- case ARM::S8:
- return ARM::S9;
- case ARM::S10:
- return ARM::S11;
- case ARM::S12:
- return ARM::S13;
- case ARM::S14:
- return ARM::S15;
- case ARM::S16:
- return ARM::S17;
- case ARM::S18:
- return ARM::S19;
- case ARM::S20:
- return ARM::S21;
- case ARM::S22:
- return ARM::S23;
- case ARM::S24:
- return ARM::S25;
- case ARM::S26:
- return ARM::S27;
- case ARM::S28:
- return ARM::S29;
- case ARM::S30:
- return ARM::S31;
-
- case ARM::D0:
- return ARM::D1;
- case ARM::D2:
- return ARM::D3;
- case ARM::D4:
- return ARM::D5;
- case ARM::D6:
- return ARM::D7;
- case ARM::D8:
- return ARM::D9;
- case ARM::D10:
- return ARM::D11;
- case ARM::D12:
- return ARM::D13;
- case ARM::D14:
- return ARM::D15;
- case ARM::D16:
- return ARM::D17;
- case ARM::D18:
- return ARM::D19;
- case ARM::D20:
- return ARM::D21;
- case ARM::D22:
- return ARM::D23;
- case ARM::D24:
- return ARM::D25;
- case ARM::D26:
- return ARM::D27;
- case ARM::D28:
- return ARM::D29;
- case ARM::D30:
- return ARM::D31;
+  case ARM::R8: return isReservedReg(MF, ARM::R9)  ? 0 : ARM::R9;
+ case ARM::R10: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
+
+ case ARM::S0: return ARM::S1;
+ case ARM::S2: return ARM::S3;
+ case ARM::S4: return ARM::S5;
+ case ARM::S6: return ARM::S7;
+ case ARM::S8: return ARM::S9;
+ case ARM::S10: return ARM::S11;
+ case ARM::S12: return ARM::S13;
+ case ARM::S14: return ARM::S15;
+ case ARM::S16: return ARM::S17;
+ case ARM::S18: return ARM::S19;
+ case ARM::S20: return ARM::S21;
+ case ARM::S22: return ARM::S23;
+ case ARM::S24: return ARM::S25;
+ case ARM::S26: return ARM::S27;
+ case ARM::S28: return ARM::S29;
+ case ARM::S30: return ARM::S31;
+
+ case ARM::D0: return ARM::D1;
+ case ARM::D2: return ARM::D3;
+ case ARM::D4: return ARM::D5;
+ case ARM::D6: return ARM::D7;
+ case ARM::D8: return ARM::D9;
+ case ARM::D10: return ARM::D11;
+ case ARM::D12: return ARM::D13;
+ case ARM::D14: return ARM::D15;
+ case ARM::D16: return ARM::D17;
+ case ARM::D18: return ARM::D19;
+ case ARM::D20: return ARM::D21;
+ case ARM::D22: return ARM::D23;
+ case ARM::D24: return ARM::D25;
+ case ARM::D26: return ARM::D27;
+ case ARM::D28: return ARM::D29;
+ case ARM::D30: return ARM::D31;
}
return 0;
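
[editor's note] Numerically, both tables encode one rule: an LDRD/STRD-style pair is {2k, 2k+1}, so an even index pairs upward and an odd index pairs downward, with reserved or special registers mapping to 0. The switches spell every case out because the ARM::* enum values are tablegen-assigned and the code does not do arithmetic on them; as a plain-integer illustration of the underlying rule only:

#include <cstdio>

// Partner of a raw register *index* under the even/odd pairing rule used
// by getRegisterPairEven/getRegisterPairOdd. This ignores the reserved-
// register checks, and real code must switch on enum values instead.
int pairPartner(int Idx) {
  return (Idx % 2 == 0) ? Idx + 1 : Idx - 1;
}

int main() {
  std::printf("partner of 4 is %d\n", pairPartner(4)); // R4 <-> R5
  std::printf("partner of 5 is %d\n", pairPartner(5));
}
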
@@ -1111,11 +1031,11 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this));
- MachineInstrBuilder MIB = BuildMI(*MBB, Ins, DL, MCID, BaseReg)
- .addFrameIndex(FrameIdx).addImm(Offset);
+ MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+ .addFrameIndex(FrameIdx).addImm(Offset));
if (!AFI->isThumb1OnlyFunction())
- AddDefaultCC(AddDefaultPred(MIB));
+ AddDefaultCC(MIB);
}
void
@@ -1143,6 +1063,7 @@ ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
Done = rewriteT2FrameIndex(MI, i, BaseReg, Off, TII);
}
assert (Done && "Unable to resolve frame index!");
+ (void)Done;
}
bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index b4b4059..fee17ff 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -33,19 +33,6 @@ namespace ARMRI {
};
}
-/// isARMLowRegister - Returns true if the register is low register r0-r7.
-///
-static inline bool isARMLowRegister(unsigned Reg) {
- using namespace ARM;
- switch (Reg) {
- case R0: case R1: case R2: case R3:
- case R4: case R5: case R6: case R7:
- return true;
- default:
- return false;
- }
-}
-
/// isARMArea1Register - Returns true if the register is a low register (r0-r7)
/// or a stack/pc register that we should push/pop.
static inline bool isARMArea1Register(unsigned Reg, bool isDarwin) {
@@ -129,6 +116,8 @@ public:
unsigned &NewSubIdx) const;
const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+ const TargetRegisterClass*
+ getCrossCopyRegClass(const TargetRegisterClass *RC) const;
const TargetRegisterClass*
getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
@@ -164,7 +153,6 @@ public:
bool cannotEliminateFrame(const MachineFunction &MF) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
unsigned getBaseRegister() const { return BasePtr; }
@@ -172,9 +160,6 @@ public:
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
bool isLowRegister(unsigned Reg) const;
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index d6fca62..4148d4a 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -14,12 +14,12 @@
#define DEBUG_TYPE "jit"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMConstantPoolValue.h"
#include "ARMInstrInfo.h"
#include "ARMRelocations.h"
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
@@ -161,11 +161,11 @@ namespace {
// are already handled elsewhere. They are placeholders to allow this
// encoder to continue to function until the MC encoder is sufficiently
// far along that this one can be eliminated entirely.
- unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val)
+ unsigned NEONThumb2DataIPostEncoder(const MachineInstr &MI, unsigned Val)
const { return 0; }
- unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val)
+ unsigned NEONThumb2LoadStorePostEncoder(const MachineInstr &MI,unsigned Val)
const { return 0; }
- unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
+ unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
const { return 0; }
unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
const { return 0; }
@@ -189,13 +189,17 @@ namespace {
unsigned Op) const { return 0; }
unsigned getARMBranchTargetOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
+ unsigned getARMBLXTargetOpValue(const MachineInstr &MI, unsigned Op)
+ const { return 0; }
unsigned getCCOutOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getSOImmOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getT2SOImmOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
- unsigned getSORegOpValue(const MachineInstr &MI, unsigned Op)
+ unsigned getSORegRegOpValue(const MachineInstr &MI, unsigned Op)
+ const { return 0; }
+ unsigned getSORegImmOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getThumbAddrModeRegRegOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
@@ -203,8 +207,12 @@ namespace {
const { return 0; }
unsigned getT2AddrModeImm8OpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
+ unsigned getT2Imm8s4OpValue(const MachineInstr &MI, unsigned Op)
+ const { return 0; }
unsigned getT2AddrModeImm8s4OpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
+ unsigned getT2AddrModeImm0_1020s4OpValue(const MachineInstr &MI,unsigned Op)
+ const { return 0; }
unsigned getT2AddrModeImm8OffsetOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getT2AddrModeImm12OffsetOpValue(const MachineInstr &MI,unsigned Op)
@@ -213,10 +221,6 @@ namespace {
const { return 0; }
unsigned getT2SORegOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
- unsigned getRotImmOpValue(const MachineInstr &MI, unsigned Op)
- const { return 0; }
- unsigned getImmMinusOneOpValue(const MachineInstr &MI, unsigned Op)
- const { return 0; }
unsigned getT2AdrLabelOpValue(const MachineInstr &MI, unsigned Op)
const { return 0; }
unsigned getAddrMode6AddressOpValue(const MachineInstr &MI, unsigned Op)
@@ -230,8 +234,6 @@ namespace {
const { return 0; }
unsigned getBitfieldInvertedMaskOpValue(const MachineInstr &MI,
unsigned Op) const { return 0; }
- unsigned getMsbOpValue(const MachineInstr &MI,
- unsigned Op) const { return 0; }
unsigned getSsatBitPosValue(const MachineInstr &MI,
unsigned Op) const { return 0; }
uint32_t getLdStmModeOpValue(const MachineInstr &MI, unsigned OpIdx)
@@ -268,6 +270,8 @@ namespace {
const { return 0;}
uint32_t getAddrMode2OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
const { return 0;}
+ uint32_t getPostIdxRegOpValue(const MachineInstr &MI, unsigned OpIdx)
+ const { return 0;}
uint32_t getAddrMode3OffsetOpValue(const MachineInstr &MI, unsigned OpIdx)
const { return 0;}
uint32_t getAddrMode3OpValue(const MachineInstr &MI, unsigned Op)
@@ -632,15 +636,16 @@ void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
<< (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
assert(ACPV->isGlobalValue() && "unsupported constant pool value");
- const GlobalValue *GV = ACPV->getGV();
+ const GlobalValue *GV = cast<ARMConstantPoolConstant>(ACPV)->getGV();
if (GV) {
Reloc::Model RelocM = TM.getRelocationModel();
emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
isa<Function>(GV),
Subtarget->GVIsIndirectSymbol(GV, RelocM),
(intptr_t)ACPV);
- } else {
- emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute);
+ } else {
+ const char *Sym = cast<ARMConstantPoolSymbol>(ACPV)->getSymbol();
+ emitExternalSymbolAddress(Sym, ARM::reloc_arm_absolute);
}
emitWordLE(0);
} else {
@@ -983,7 +988,7 @@ unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) {
unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI,
const MCInstrDesc &MCID) const {
- for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e; --i){
+ for (unsigned i = MI.getNumOperands(), e = MCID.getNumOperands(); i >= e;--i){
const MachineOperand &MO = MI.getOperand(i-1);
if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)
return 1 << ARMII::S_BitShift;
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index f45ebdc..3e3a413 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -15,10 +15,10 @@
#define DEBUG_TYPE "arm-cp-islands"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMInstrInfo.h"
#include "Thumb2InstrInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -739,7 +739,11 @@ MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
// There doesn't seem to be meaningful DebugInfo available; this doesn't
// correspond to anything in the source.
unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
- BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+ if (!isThumb)
+ BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB);
+ else
+ BuildMI(OrigBB, DebugLoc(), TII->get(Opc)).addMBB(NewBB)
+ .addImm(ARMCC::AL).addReg(0);
++NumSplit;
// Update the CFG. All succs of OrigBB are now succs of NewBB.
@@ -1151,7 +1155,11 @@ void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
// targets will be exchanged, and the altered branch may be out of
// range, so the machinery has to know about it.
int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
- BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+ if (!isThumb)
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+ else
+ BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB)
+ .addImm(ARMCC::AL).addReg(0);
unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
ImmBranches.push_back(ImmBranch(&UserMBB->back(),
MaxDisp, false, UncondBr));
@@ -1512,7 +1520,11 @@ ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) {
.addMBB(NextBB).addImm(CC).addReg(CCReg);
Br.MI = &MBB->back();
BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
- BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+ if (isThumb)
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB)
+ .addImm(ARMCC::AL).addReg(0);
+ else
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
@@ -1891,7 +1903,8 @@ AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB)
// There doesn't seem to be meaningful DebugInfo available; this doesn't
// correspond directly to anything in the source.
assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?");
- BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB);
+ BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB)
+ .addImm(ARMCC::AL).addReg(0);
// Update internal data structures to account for the newly inserted MBB.
MF.RenumberBlocks(NewBB);
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 165a1d8..aadfd47 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -17,79 +17,57 @@
#include "llvm/Constants.h"
#include "llvm/GlobalValue.h"
#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdlib>
using namespace llvm;
-ARMConstantPoolValue::ARMConstantPoolValue(const Constant *cval, unsigned id,
- ARMCP::ARMCPKind K,
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolValue
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolValue::ARMConstantPoolValue(Type *Ty, unsigned id,
+ ARMCP::ARMCPKind kind,
unsigned char PCAdj,
- ARMCP::ARMCPModifier Modif,
- bool AddCA)
- : MachineConstantPoolValue((const Type*)cval->getType()),
- CVal(cval), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj),
- Modifier(Modif), AddCurrentAddress(AddCA) {}
-
-ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C,
- const char *s, unsigned id,
+ ARMCP::ARMCPModifier modifier,
+ bool addCurrentAddress)
+ : MachineConstantPoolValue(Ty), LabelId(id), Kind(kind),
+ PCAdjust(PCAdj), Modifier(modifier),
+ AddCurrentAddress(addCurrentAddress) {}
+
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C, unsigned id,
+ ARMCP::ARMCPKind kind,
unsigned char PCAdj,
- ARMCP::ARMCPModifier Modif,
- bool AddCA)
- : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)),
- CVal(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPExtSymbol),
- PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {}
-
-ARMConstantPoolValue::ARMConstantPoolValue(const GlobalValue *gv,
- ARMCP::ARMCPModifier Modif)
- : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())),
- CVal(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0),
- Modifier(Modif), AddCurrentAddress(false) {}
-
-const GlobalValue *ARMConstantPoolValue::getGV() const {
- return dyn_cast_or_null<GlobalValue>(CVal);
-}
+ ARMCP::ARMCPModifier modifier,
+ bool addCurrentAddress)
+ : MachineConstantPoolValue((Type*)Type::getInt32Ty(C)),
+ LabelId(id), Kind(kind), PCAdjust(PCAdj), Modifier(modifier),
+ AddCurrentAddress(addCurrentAddress) {}
-const BlockAddress *ARMConstantPoolValue::getBlockAddress() const {
- return dyn_cast_or_null<BlockAddress>(CVal);
-}
+ARMConstantPoolValue::~ARMConstantPoolValue() {}
-static bool CPV_streq(const char *S1, const char *S2) {
- if (S1 == S2)
- return true;
- if (S1 && S2 && strcmp(S1, S2) == 0)
- return true;
- return false;
+const char *ARMConstantPoolValue::getModifierText() const {
+ switch (Modifier) {
+ default: llvm_unreachable("Unknown modifier!");
+ // FIXME: Are these case sensitive? It'd be nice to lower-case all the
+ // strings if that's legal.
+ case ARMCP::no_modifier: return "none";
+ case ARMCP::TLSGD: return "tlsgd";
+ case ARMCP::GOT: return "GOT";
+ case ARMCP::GOTOFF: return "GOTOFF";
+ case ARMCP::GOTTPOFF: return "gottpoff";
+ case ARMCP::TPOFF: return "tpoff";
+ }
}
int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment) {
- unsigned AlignMask = Alignment - 1;
- const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
- for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
- if (Constants[i].isMachineConstantPoolEntry() &&
- (Constants[i].getAlignment() & AlignMask) == 0) {
- ARMConstantPoolValue *CPV =
- (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
- if (CPV->CVal == CVal &&
- CPV->LabelId == LabelId &&
- CPV->PCAdjust == PCAdjust &&
- CPV_streq(CPV->S, S) &&
- CPV->Modifier == Modifier)
- return i;
- }
- }
-
+ assert(false && "Shouldn't be calling this directly!");
return -1;
}
-ARMConstantPoolValue::~ARMConstantPoolValue() {
- free((void*)S);
-}
-
void
-ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
- ID.AddPointer(CVal);
- ID.AddPointer(S);
+ARMConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
ID.AddInteger(LabelId);
ID.AddInteger(PCAdjust);
}
@@ -97,9 +75,7 @@ ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
bool
ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
if (ACPV->Kind == Kind &&
- ACPV->CVal == CVal &&
ACPV->PCAdjust == PCAdjust &&
- CPV_streq(ACPV->S, S) &&
ACPV->Modifier == Modifier) {
if (ACPV->LabelId == LabelId)
return true;
@@ -115,12 +91,7 @@ void ARMConstantPoolValue::dump() const {
errs() << " " << *this;
}
-
void ARMConstantPoolValue::print(raw_ostream &O) const {
- if (CVal)
- O << CVal->getName();
- else
- O << S;
if (Modifier) O << "(" << getModifierText() << ")";
if (PCAdjust != 0) {
O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
@@ -128,3 +99,221 @@ void ARMConstantPoolValue::print(raw_ostream &O) const {
O << ")";
}
}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolConstant
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(Type *Ty,
+ const Constant *C,
+ unsigned ID,
+ ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress)
+ : ARMConstantPoolValue(Ty, ID, Kind, PCAdj, Modifier, AddCurrentAddress),
+ CVal(C) {}
+
+ARMConstantPoolConstant::ARMConstantPoolConstant(const Constant *C,
+ unsigned ID,
+ ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress)
+ : ARMConstantPoolValue((Type*)C->getType(), ID, Kind, PCAdj, Modifier,
+ AddCurrentAddress),
+ CVal(C) {}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID) {
+ return new ARMConstantPoolConstant(C, ID, ARMCP::CPValue, 0,
+ ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const GlobalValue *GV,
+ ARMCP::ARMCPModifier Modifier) {
+ return new ARMConstantPoolConstant((Type*)Type::getInt32Ty(GV->getContext()),
+ GV, 0, ARMCP::CPValue, 0,
+ Modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+ ARMCP::ARMCPKind Kind, unsigned char PCAdj) {
+ return new ARMConstantPoolConstant(C, ID, Kind, PCAdj,
+ ARMCP::no_modifier, false);
+}
+
+ARMConstantPoolConstant *
+ARMConstantPoolConstant::Create(const Constant *C, unsigned ID,
+ ARMCP::ARMCPKind Kind, unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress) {
+ return new ARMConstantPoolConstant(C, ID, Kind, PCAdj, Modifier,
+ AddCurrentAddress);
+}
+
+const GlobalValue *ARMConstantPoolConstant::getGV() const {
+ return dyn_cast_or_null<GlobalValue>(CVal);
+}
+
+const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
+ return dyn_cast_or_null<BlockAddress>(CVal);
+}
+
+int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) {
+ unsigned AlignMask = Alignment - 1;
+ const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ if (Constants[i].isMachineConstantPoolEntry() &&
+ (Constants[i].getAlignment() & AlignMask) == 0) {
+ ARMConstantPoolValue *CPV =
+ (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+ ARMConstantPoolConstant *APC = dyn_cast<ARMConstantPoolConstant>(CPV);
+ if (!APC) continue;
+ if (APC->CVal == CVal && equals(APC))
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
+ const ARMConstantPoolConstant *ACPC = dyn_cast<ARMConstantPoolConstant>(ACPV);
+ return ACPC && ACPC->CVal == CVal && ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(CVal);
+ ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolConstant::print(raw_ostream &O) const {
+ O << CVal->getName();
+ ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolSymbol
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, const char *s,
+ unsigned id,
+ unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress)
+ : ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
+ AddCurrentAddress),
+ S(strdup(s)) {}
+
+ARMConstantPoolSymbol::~ARMConstantPoolSymbol() {
+ free((void*)S);
+}
+
+ARMConstantPoolSymbol *
+ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s,
+ unsigned ID, unsigned char PCAdj) {
+ return new ARMConstantPoolSymbol(C, s, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+static bool CPV_streq(const char *S1, const char *S2) {
+ if (S1 == S2)
+ return true;
+ if (S1 && S2 && strcmp(S1, S2) == 0)
+ return true;
+ return false;
+}
+
+int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) {
+ unsigned AlignMask = Alignment - 1;
+ const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ if (Constants[i].isMachineConstantPoolEntry() &&
+ (Constants[i].getAlignment() & AlignMask) == 0) {
+ ARMConstantPoolValue *CPV =
+ (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+ ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(CPV);
+ if (!APS) continue;
+
+ if (CPV_streq(APS->S, S) && equals(APS))
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
+ const ARMConstantPoolSymbol *ACPS = dyn_cast<ARMConstantPoolSymbol>(ACPV);
+ return ACPS && CPV_streq(ACPS->S, S) &&
+ ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(S);
+ ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolSymbol::print(raw_ostream &O) const {
+ O << S;
+ ARMConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// ARMConstantPoolMBB
+//===----------------------------------------------------------------------===//
+
+ARMConstantPoolMBB::ARMConstantPoolMBB(LLVMContext &C,
+ const MachineBasicBlock *mbb,
+ unsigned id, unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress)
+ : ARMConstantPoolValue(C, id, ARMCP::CPMachineBasicBlock, PCAdj,
+ Modifier, AddCurrentAddress),
+ MBB(mbb) {}
+
+ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
+ const MachineBasicBlock *mbb,
+ unsigned ID,
+ unsigned char PCAdj) {
+ return new ARMConstantPoolMBB(C, mbb, ID, PCAdj, ARMCP::no_modifier, false);
+}
+
+int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment) {
+ unsigned AlignMask = Alignment - 1;
+ const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ if (Constants[i].isMachineConstantPoolEntry() &&
+ (Constants[i].getAlignment() & AlignMask) == 0) {
+ ARMConstantPoolValue *CPV =
+ (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+ ARMConstantPoolMBB *APMBB = dyn_cast<ARMConstantPoolMBB>(CPV);
+ if (!APMBB) continue;
+
+ if (APMBB->MBB == MBB && equals(APMBB))
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
+ const ARMConstantPoolMBB *ACPMBB = dyn_cast<ARMConstantPoolMBB>(ACPV);
+ return ACPMBB && ACPMBB->MBB == MBB &&
+ ARMConstantPoolValue::hasSameValue(ACPV);
+}
+
+void ARMConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(MBB);
+ ARMConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void ARMConstantPoolMBB::print(raw_ostream &O) const {
+ ARMConstantPoolValue::print(O);
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index d008811..0d0def3 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -20,17 +20,19 @@
namespace llvm {
-class Constant;
class BlockAddress;
+class Constant;
class GlobalValue;
class LLVMContext;
+class MachineBasicBlock;
namespace ARMCP {
enum ARMCPKind {
CPValue,
CPExtSymbol,
CPBlockAddress,
- CPLSDA
+ CPLSDA,
+ CPMachineBasicBlock
};
enum ARMCPModifier {
@@ -47,8 +49,6 @@ namespace ARMCP {
/// represent PC-relative displacement between the address of the load
/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
class ARMConstantPoolValue : public MachineConstantPoolValue {
- const Constant *CVal; // Constant being loaded.
- const char *S; // ExtSymbol being loaded.
unsigned LabelId; // Label id of the load.
ARMCP::ARMCPKind Kind; // Kind of constant.
unsigned char PCAdjust; // Extra adjustment if constantpool is pc-relative.
@@ -56,60 +56,54 @@ class ARMConstantPoolValue : public MachineConstantPoolValue {
ARMCP::ARMCPModifier Modifier; // GV modifier i.e. (&GV(modifier)-(LPIC+8))
bool AddCurrentAddress;
+protected:
+ ARMConstantPoolValue(Type *Ty, unsigned id, ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress);
+
+ ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress);
public:
- ARMConstantPoolValue(const Constant *cval, unsigned id,
- ARMCP::ARMCPKind Kind = ARMCP::CPValue,
- unsigned char PCAdj = 0,
- ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier,
- bool AddCurrentAddress = false);
- ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id,
- unsigned char PCAdj = 0,
- ARMCP::ARMCPModifier Modifier = ARMCP::no_modifier,
- bool AddCurrentAddress = false);
- ARMConstantPoolValue(const GlobalValue *GV, ARMCP::ARMCPModifier Modifier);
- ARMConstantPoolValue();
- ~ARMConstantPoolValue();
+ virtual ~ARMConstantPoolValue();
- const GlobalValue *getGV() const;
- const char *getSymbol() const { return S; }
- const BlockAddress *getBlockAddress() const;
ARMCP::ARMCPModifier getModifier() const { return Modifier; }
- const char *getModifierText() const {
- switch (Modifier) {
- default: llvm_unreachable("Unknown modifier!");
- // FIXME: Are these case sensitive? It'd be nice to lower-case all the
- // strings if that's legal.
- case ARMCP::no_modifier: return "none";
- case ARMCP::TLSGD: return "tlsgd";
- case ARMCP::GOT: return "GOT";
- case ARMCP::GOTOFF: return "GOTOFF";
- case ARMCP::GOTTPOFF: return "gottpoff";
- case ARMCP::TPOFF: return "tpoff";
- }
- }
+ const char *getModifierText() const;
bool hasModifier() const { return Modifier != ARMCP::no_modifier; }
+
bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+
unsigned getLabelId() const { return LabelId; }
unsigned char getPCAdjustment() const { return PCAdjust; }
+
bool isGlobalValue() const { return Kind == ARMCP::CPValue; }
bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; }
- bool isBlockAddress() { return Kind == ARMCP::CPBlockAddress; }
- bool isLSDA() { return Kind == ARMCP::CPLSDA; }
+ bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; }
+ bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
+ bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; }
virtual unsigned getRelocationInfo() const { return 2; }
virtual int getExistingMachineCPValue(MachineConstantPool *CP,
unsigned Alignment);
- virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
+ virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
- /// hasSameValue - Return true if this ARM constpool value
- /// can share the same constantpool entry as another ARM constpool value.
- bool hasSameValue(ARMConstantPoolValue *ACPV);
+ /// hasSameValue - Return true if this ARM constpool value can share the same
+ /// constantpool entry as another ARM constpool value.
+ virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+ bool equals(const ARMConstantPoolValue *A) const {
+ return this->LabelId == A->LabelId &&
+ this->PCAdjust == A->PCAdjust &&
+ this->Modifier == A->Modifier;
+ }
+ virtual void print(raw_ostream &O) const;
void print(raw_ostream *O) const { if (O) print(*O); }
- void print(raw_ostream &O) const;
void dump() const;
+
+ static bool classof(const ARMConstantPoolValue *) { return true; }
};
inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
@@ -117,6 +111,123 @@ inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
return O;
}
+/// ARMConstantPoolConstant - ARM-specific constant pool values for Constants,
+/// Functions, and BlockAddresses.
+class ARMConstantPoolConstant : public ARMConstantPoolValue {
+ const Constant *CVal; // Constant being loaded.
+
+ ARMConstantPoolConstant(const Constant *C,
+ unsigned ID,
+ ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress);
+ ARMConstantPoolConstant(Type *Ty, const Constant *C,
+ unsigned ID,
+ ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress);
+
+public:
+ static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID);
+ static ARMConstantPoolConstant *Create(const GlobalValue *GV,
+ ARMCP::ARMCPModifier Modifier);
+ static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+ ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj);
+ static ARMConstantPoolConstant *Create(const Constant *C, unsigned ID,
+ ARMCP::ARMCPKind Kind,
+ unsigned char PCAdj,
+ ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress);
+
+ const GlobalValue *getGV() const;
+ const BlockAddress *getBlockAddress() const;
+
+ virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment);
+
+ /// hasSameValue - Return true if this ARM constpool value can share the same
+ /// constantpool entry as another ARM constpool value.
+ virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+ virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+ virtual void print(raw_ostream &O) const;
+ static bool classof(const ARMConstantPoolValue *APV) {
+ return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
+ }
+ static bool classof(const ARMConstantPoolConstant *) { return true; }
+};
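+// Typical use of the factories above, as seen later in this patch (sketch):
+//   ARMConstantPoolValue *CPV =
+//       ARMConstantPoolConstant::Create(GV, Id, ARMCP::CPValue, PCAdj);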
+
+/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
+/// symbols.
+class ARMConstantPoolSymbol : public ARMConstantPoolValue {
+ const char *S; // ExtSymbol being loaded.
+
+ ARMConstantPoolSymbol(LLVMContext &C, const char *s, unsigned id,
+ unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress);
+
+public:
+ ~ARMConstantPoolSymbol();
+
+ static ARMConstantPoolSymbol *Create(LLVMContext &C, const char *s,
+ unsigned ID, unsigned char PCAdj);
+
+ const char *getSymbol() const { return S; }
+
+ virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment);
+
+ virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+ /// hasSameValue - Return true if this ARM constpool value can share the same
+ /// constantpool entry as another ARM constpool value.
+ virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+ virtual void print(raw_ostream &O) const;
+
+ static bool classof(const ARMConstantPoolValue *ACPV) {
+ return ACPV->isExtSymbol();
+ }
+ static bool classof(const ARMConstantPoolSymbol *) { return true; }
+};
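+// A minimal sketch of the LLVM-style RTTI the classof() overloads above
+// enable (IllustrateRTTI is a hypothetical helper, not defined elsewhere):
+//
+//   static const char *IllustrateRTTI(ARMConstantPoolValue *ACPV) {
+//     if (ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(ACPV))
+//       return APS->getSymbol(); // only reached when ACPV->isExtSymbol()
+//     return 0;
+//   }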
+
+/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
+/// block.
+class ARMConstantPoolMBB : public ARMConstantPoolValue {
+ const MachineBasicBlock *MBB; // Machine basic block.
+
+ ARMConstantPoolMBB(LLVMContext &C, const MachineBasicBlock *mbb, unsigned id,
+ unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
+ bool AddCurrentAddress);
+
+public:
+ static ARMConstantPoolMBB *Create(LLVMContext &C,
+ const MachineBasicBlock *mbb,
+ unsigned ID, unsigned char PCAdj);
+
+ const MachineBasicBlock *getMBB() const { return MBB; }
+
+ virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+ unsigned Alignment);
+
+ virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+ /// hasSameValue - Return true if this ARM constpool value can share the same
+ /// constantpool entry as another ARM constpool value.
+ virtual bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+ virtual void print(raw_ostream &O) const;
+
+ static bool classof(const ARMConstantPoolValue *ACPV) {
+ return ACPV->isMachineBasicBlock();
+ }
+ static bool classof(const ARMConstantPoolMBB *) { return true; }
+};
+
} // End llvm namespace
#endif
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 94b72fd..7872cb9 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -16,19 +16,24 @@
#define DEBUG_TYPE "arm-pseudo"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove!
using namespace llvm;
+static cl::opt<bool>
+VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
+ cl::desc("Verify machine code after expanding ARM pseudos"));
+
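+// The option above is cl::Hidden; assuming a typical llc invocation, it can
+// still be enabled explicitly, e.g.:
+//   llc -march=arm -verify-arm-pseudo-expand foo.ll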
namespace {
class ARMExpandPseudo : public MachineFunctionPass {
public:
@@ -741,8 +746,22 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
- case ARM::MOVCCs: {
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+ case ARM::MOVCCsi: {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
+ (MI.getOperand(1).getReg()))
+ .addReg(MI.getOperand(2).getReg(),
+ getKillRegState(MI.getOperand(2).isKill()))
+ .addImm(MI.getOperand(3).getImm())
+ .addImm(MI.getOperand(4).getImm()) // 'pred'
+ .addReg(MI.getOperand(5).getReg())
+ .addReg(0); // 's' bit
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ case ARM::MOVCCsr: {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
(MI.getOperand(1).getReg()))
.addReg(MI.getOperand(2).getReg(),
getKillRegState(MI.getOperand(2).isKill()))
@@ -837,10 +856,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::MOVsrl_flag:
case ARM::MOVsra_flag: {
// These are just fancy MOVs instructions.
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+ AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
MI.getOperand(0).getReg())
.addOperand(MI.getOperand(1))
- .addReg(0)
.addImm(ARM_AM::getSORegOpc((Opcode == ARM::MOVsrl_flag ?
ARM_AM::lsr : ARM_AM::asr),
1)))
@@ -851,10 +869,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case ARM::RRX: {
// This encodes as "MOVs Rd, Rm, rrx".
MachineInstrBuilder MIB =
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVs),
+ AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),TII->get(ARM::MOVsi),
MI.getOperand(0).getReg())
.addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(1))
.addImm(ARM_AM::getSORegOpc(ARM_AM::rrx, 0)))
.addReg(0);
TransferImpOps(MI, MIB, MIB);
@@ -953,34 +970,6 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
ExpandMOV32BitImm(MBB, MBBI);
return true;
- case ARM::VMOVQQ: {
- unsigned DstReg = MI.getOperand(0).getReg();
- bool DstIsDead = MI.getOperand(0).isDead();
- unsigned EvenDst = TRI->getSubReg(DstReg, ARM::qsub_0);
- unsigned OddDst = TRI->getSubReg(DstReg, ARM::qsub_1);
- unsigned SrcReg = MI.getOperand(1).getReg();
- bool SrcIsKill = MI.getOperand(1).isKill();
- unsigned EvenSrc = TRI->getSubReg(SrcReg, ARM::qsub_0);
- unsigned OddSrc = TRI->getSubReg(SrcReg, ARM::qsub_1);
- MachineInstrBuilder Even =
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(ARM::VORRq))
- .addReg(EvenDst,
- RegState::Define | getDeadRegState(DstIsDead))
- .addReg(EvenSrc, getKillRegState(SrcIsKill))
- .addReg(EvenSrc, getKillRegState(SrcIsKill)));
- MachineInstrBuilder Odd =
- AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(ARM::VORRq))
- .addReg(OddDst,
- RegState::Define | getDeadRegState(DstIsDead))
- .addReg(OddSrc, getKillRegState(SrcIsKill))
- .addReg(OddSrc, getKillRegState(SrcIsKill)));
- TransferImpOps(MI, Even, Odd);
- MI.eraseFromParent();
- return true;
- }
-
case ARM::VLDMQIA: {
unsigned NewOpc = ARM::VLDMDIA;
MachineInstrBuilder MIB =
@@ -1316,6 +1305,8 @@ bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
++MFI)
Modified |= ExpandMBB(*MFI);
+ if (VerifyARMPseudo)
+ MF.verify(this, "After expanding ARM pseudo instructions.");
return Modified;
}
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index f469d7e..9bc7ef2 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -14,13 +14,13 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMCallingConv.h"
#include "ARMRegisterInfo.h"
#include "ARMTargetMachine.h"
#include "ARMSubtarget.h"
#include "ARMConstantPoolValue.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CallingConv.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalVariable.h"
@@ -171,8 +171,8 @@ class ARMFastISel : public FastISel {
// Utility routines.
private:
- bool isTypeLegal(const Type *Ty, MVT &VT);
- bool isLoadTypeLegal(const Type *Ty, MVT &VT);
+ bool isTypeLegal(Type *Ty, MVT &VT);
+ bool isLoadTypeLegal(Type *Ty, MVT &VT);
bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr);
bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr);
bool ARMComputeAddress(const Value *Obj, Address &Addr);
@@ -502,11 +502,19 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, EVT VT) {
// This checks to see if we can use VFP3 instructions to materialize
// a constant, otherwise we have to go through the constant pool.
if (TLI.isFPImmLegal(Val, VT)) {
- unsigned Opc = is64bit ? ARM::FCONSTD : ARM::FCONSTS;
+ int Imm;
+ unsigned Opc;
+ if (is64bit) {
+ Imm = ARM_AM::getFP64Imm(Val);
+ Opc = ARM::FCONSTD;
+ } else {
+ Imm = ARM_AM::getFP32Imm(Val);
+ Opc = ARM::FCONSTS;
+ }
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
DestReg)
- .addFPImm(CFP));
+ .addImm(Imm));
return DestReg;
}
@@ -590,8 +598,9 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
// Grab index.
unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb() ? 4 : 8);
unsigned Id = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, Id,
- ARMCP::CPValue, PCAdj);
+ ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
+ ARMCP::CPValue,
+ PCAdj);
unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
// Load value.
@@ -615,8 +624,8 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb)
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::t2LDRi12),
- NewDestReg)
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(ARM::t2LDRi12), NewDestReg)
.addReg(DestReg)
.addImm(0);
else
@@ -673,7 +682,7 @@ unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
return 0;
}
-bool ARMFastISel::isTypeLegal(const Type *Ty, MVT &VT) {
+bool ARMFastISel::isTypeLegal(Type *Ty, MVT &VT) {
EVT evt = TLI.getValueType(Ty, true);
// Only handle simple types.
@@ -685,7 +694,7 @@ bool ARMFastISel::isTypeLegal(const Type *Ty, MVT &VT) {
return TLI.isTypeLegal(VT);
}
-bool ARMFastISel::isLoadTypeLegal(const Type *Ty, MVT &VT) {
+bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
if (isTypeLegal(Ty, VT)) return true;
// If this is a type that can be sign or zero-extended to a basic operation
@@ -714,7 +723,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
U = C;
}
- if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
+ if (PointerType *Ty = dyn_cast<PointerType>(Obj->getType()))
if (Ty->getAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
@@ -749,7 +758,7 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
i != e; ++i, ++GTI) {
const Value *Op = *i;
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
const StructLayout *SL = TD.getStructLayout(STy);
unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
TmpOffset += SL->getElementOffset(Idx);
@@ -946,6 +955,10 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr) {
}
bool ARMFastISel::SelectLoad(const Instruction *I) {
+ // Atomic loads need special handling.
+ if (cast<LoadInst>(I)->isAtomic())
+ return false;
+
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(I->getType(), VT))
@@ -1008,6 +1021,10 @@ bool ARMFastISel::SelectStore(const Instruction *I) {
Value *Op0 = I->getOperand(0);
unsigned SrcReg = 0;
+ // Atomic stores need special handling.
+ if (cast<StoreInst>(I)->isAtomic())
+ return false;
+
// Verify we have a legal type before going any further.
MVT VT;
if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
@@ -1085,7 +1102,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
// TODO: Factor this out.
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
MVT SourceVT;
- const Type *Ty = CI->getOperand(0)->getType();
+ Type *Ty = CI->getOperand(0)->getType();
if (CI->hasOneUse() && (CI->getParent() == I->getParent())
&& isTypeLegal(Ty, SourceVT)) {
bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
@@ -1201,7 +1218,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
const CmpInst *CI = cast<CmpInst>(I);
MVT VT;
- const Type *Ty = CI->getOperand(0)->getType();
+ Type *Ty = CI->getOperand(0)->getType();
if (!isTypeLegal(Ty, VT))
return false;
@@ -1309,7 +1326,7 @@ bool ARMFastISel::SelectSIToFP(const Instruction *I) {
if (!Subtarget->hasVFP2()) return false;
MVT DstVT;
- const Type *Ty = I->getType();
+ Type *Ty = I->getType();
if (!isTypeLegal(Ty, DstVT))
return false;
@@ -1328,7 +1345,7 @@ bool ARMFastISel::SelectSIToFP(const Instruction *I) {
unsigned Opc;
if (Ty->isFloatTy()) Opc = ARM::VSITOS;
else if (Ty->isDoubleTy()) Opc = ARM::VSITOD;
- else return 0;
+ else return false;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc),
@@ -1343,7 +1360,7 @@ bool ARMFastISel::SelectFPToSI(const Instruction *I) {
if (!Subtarget->hasVFP2()) return false;
MVT DstVT;
- const Type *RetTy = I->getType();
+ Type *RetTy = I->getType();
if (!isTypeLegal(RetTy, DstVT))
return false;
@@ -1351,10 +1368,10 @@ bool ARMFastISel::SelectFPToSI(const Instruction *I) {
if (Op == 0) return false;
unsigned Opc;
- const Type *OpTy = I->getOperand(0)->getType();
+ Type *OpTy = I->getOperand(0)->getType();
if (OpTy->isFloatTy()) Opc = ARM::VTOSIZS;
else if (OpTy->isDoubleTy()) Opc = ARM::VTOSIZD;
- else return 0;
+ else return false;
// f64->s32 or f32->s32 both need an intermediate f32 reg.
unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
@@ -1401,7 +1418,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
bool ARMFastISel::SelectSDiv(const Instruction *I) {
MVT VT;
- const Type *Ty = I->getType();
+ Type *Ty = I->getType();
if (!isTypeLegal(Ty, VT))
return false;
@@ -1429,7 +1446,7 @@ bool ARMFastISel::SelectSDiv(const Instruction *I) {
bool ARMFastISel::SelectSRem(const Instruction *I) {
MVT VT;
- const Type *Ty = I->getType();
+ Type *Ty = I->getType();
if (!isTypeLegal(Ty, VT))
return false;
@@ -1456,7 +1473,7 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) {
// operations, but can't figure out how to. Just use the vfp instructions
// if we have them.
// FIXME: It'd be nice to use NEON instructions.
- const Type *Ty = I->getType();
+ Type *Ty = I->getType();
bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
if (isFloat && !Subtarget->hasVFP2())
return false;
@@ -1711,7 +1728,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
- CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, I->getContext());
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,I->getContext());
CCInfo.AnalyzeReturn(Outs, CCAssignFnForCall(CC, true /* is Ret */));
const Value *RV = Ret->getOperand(0);
@@ -1778,7 +1795,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
CallingConv::ID CC = TLI.getLibcallCallingConv(Call);
// Handle *simple* calls for now.
- const Type *RetTy = I->getType();
+ Type *RetTy = I->getType();
MVT RetVT;
if (RetTy->isVoidTy())
RetVT = MVT::isVoid;
@@ -1802,7 +1819,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
unsigned Arg = getRegForValue(Op);
if (Arg == 0) return false;
- const Type *ArgTy = Op->getType();
+ Type *ArgTy = Op->getType();
MVT ArgVT;
if (!isTypeLegal(ArgTy, ArgVT)) return false;
@@ -1870,13 +1887,13 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
// TODO: Avoid some calling conventions?
// Let SDISel handle vararg functions.
- const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PT->getElementType());
if (FTy->isVarArg())
return false;
// Handle *simple* calls for now.
- const Type *RetTy = I->getType();
+ Type *RetTy = I->getType();
MVT RetVT;
if (RetTy->isVoidTy())
RetVT = MVT::isVoid;
@@ -1915,7 +1932,7 @@ bool ARMFastISel::SelectCall(const Instruction *I) {
CS.paramHasAttr(AttrInd, Attribute::ByVal))
return false;
- const Type *ArgTy = (*i)->getType();
+ Type *ArgTy = (*i)->getType();
MVT ArgVT;
if (!isTypeLegal(ArgTy, ArgVT))
return false;
@@ -1969,9 +1986,9 @@ bool ARMFastISel::SelectIntCast(const Instruction *I) {
// On ARM, in general, integer casts don't involve legal types; this code
// handles promotable integers. The high bits for a type smaller than
// the register size are assumed to be undefined.
- const Type *DestTy = I->getType();
+ Type *DestTy = I->getType();
Value *Op = I->getOperand(0);
- const Type *SrcTy = Op->getType();
+ Type *SrcTy = Op->getType();
EVT SrcVT, DestVT;
SrcVT = TLI.getValueType(SrcTy, true);
@@ -2002,16 +2019,18 @@ bool ARMFastISel::SelectIntCast(const Instruction *I) {
switch (SrcVT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i16:
+ if (!Subtarget->hasV6Ops()) return false;
if (isZext)
- Opc = isThumb ? ARM::t2UXTHr : ARM::UXTHr;
+ Opc = isThumb ? ARM::t2UXTH : ARM::UXTH;
else
- Opc = isThumb ? ARM::t2SXTHr : ARM::SXTHr;
+ Opc = isThumb ? ARM::t2SXTH : ARM::SXTH;
break;
case MVT::i8:
+ if (!Subtarget->hasV6Ops()) return false;
if (isZext)
- Opc = isThumb ? ARM::t2UXTBr : ARM::UXTBr;
+ Opc = isThumb ? ARM::t2UXTB : ARM::UXTB;
else
- Opc = isThumb ? ARM::t2SXTBr : ARM::SXTBr;
+ Opc = isThumb ? ARM::t2SXTB : ARM::SXTB;
break;
case MVT::i1:
if (isZext) {
@@ -2033,6 +2052,8 @@ bool ARMFastISel::SelectIntCast(const Instruction *I) {
.addReg(SrcReg);
if (isBoolZext)
MIB.addImm(1);
+ else
+    MIB.addImm(0); // rotate amount for the sxt/uxt instructions ("ror #0")
AddOptionalDefs(MIB);
UpdateValueMap(I, DestReg);
return true;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 381b404..2d1de6f 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -12,10 +12,10 @@
//===----------------------------------------------------------------------===//
#include "ARMFrameLowering.h"
-#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -93,7 +93,8 @@ static bool isCSRestore(MachineInstr *MI,
return false;
return true;
}
- if ((MI->getOpcode() == ARM::LDR_POST ||
+ if ((MI->getOpcode() == ARM::LDR_POST_IMM ||
+ MI->getOpcode() == ARM::LDR_POST_REG ||
MI->getOpcode() == ARM::t2LDR_POST) &&
isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) &&
MI->getOperand(1).getReg() == ARM::SP)
@@ -413,6 +414,9 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
MIB.addExternalSymbol(JumpTarget.getSymbolName(),
JumpTarget.getTargetFlags());
}
+
+ // Add the default predicate in Thumb mode.
+ if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0);
} else if (RetOpcode == ARM::TCRETURNri) {
BuildMI(MBB, MBBI, dl,
TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)).
@@ -502,7 +506,7 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
}
}
} else if (AFI->isThumb2Function()) {
- // Use add <rd>, sp, #<imm8>
+ // Use add <rd>, sp, #<imm8>
// ldr <rd>, [sp, #<imm8>]
// if at all possible to save space.
if (Offset >= 0 && (Offset & 3) == 0 && Offset <= 1020)
@@ -587,14 +591,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc),
ARM::SP)
.addReg(Regs[0].first, getKillRegState(Regs[0].second))
- .addReg(ARM::SP).setMIFlags(MIFlags);
- // ARM mode needs an extra reg0 here due to addrmode2. Will go away once
- // that refactoring is complete (eventually).
- if (StrOpc == ARM::STR_PRE) {
- MIB.addReg(0);
- MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::sub, 4, ARM_AM::no_shift));
- } else
- MIB.addImm(-4);
+ .addReg(ARM::SP).setMIFlags(MIFlags)
+ .addImm(-4);
AddDefaultPred(MIB);
}
Regs.clear();
@@ -651,8 +649,10 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
.addReg(ARM::SP));
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
- if (DeleteRet)
+ if (DeleteRet) {
+ MIB->copyImplicitOps(&*MI);
MI->eraseFromParent();
+ }
MI = MIB;
} else if (Regs.size() == 1) {
// If we adjusted the reg to PC from LR above, switch it back here. We
@@ -665,7 +665,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
.addReg(ARM::SP);
// ARM mode needs an extra reg0 here due to addrmode2. Will go away once
// that refactoring is complete (eventually).
- if (LdrOpc == ARM::LDR_POST) {
+ if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
MIB.addReg(0);
MIB.addImm(ARM_AM::getAM2Opc(ARM_AM::add, 4, ARM_AM::no_shift));
} else
@@ -687,7 +687,8 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned PushOpc = AFI->isThumbFunction() ? ARM::t2STMDB_UPD : ARM::STMDB_UPD;
- unsigned PushOneOpc = AFI->isThumbFunction() ? ARM::t2STR_PRE : ARM::STR_PRE;
+ unsigned PushOneOpc = AFI->isThumbFunction() ?
+ ARM::t2STR_PRE : ARM::STR_PRE_IMM;
unsigned FltOpc = ARM::VSTMDDB_UPD;
emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register,
MachineInstr::FrameSetup);
@@ -711,7 +712,7 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
unsigned PopOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD;
- unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST : ARM::LDR_POST;
+ unsigned LdrOpc = AFI->isThumbFunction() ? ARM::t2LDR_POST :ARM::LDR_POST_IMM;
unsigned FltOpc = ARM::VLDMDIA_UPD;
emitPopInst(MBB, MI, CSI, FltOpc, 0, isVarArg, true, &isARMArea3Register);
emitPopInst(MBB, MI, CSI, PopOpc, LdrOpc, isVarArg, false,
diff --git a/lib/Target/ARM/ARMGlobalMerge.cpp b/lib/Target/ARM/ARMGlobalMerge.cpp
index 8d77b2d..5f863ea 100644
--- a/lib/Target/ARM/ARMGlobalMerge.cpp
+++ b/lib/Target/ARM/ARMGlobalMerge.cpp
@@ -100,8 +100,8 @@ namespace {
GlobalCmp(const TargetData *td) : TD(td) { }
bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) {
- const Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
- const Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
+ Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
+ Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
return (TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2));
}
@@ -123,7 +123,7 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
// FIXME: Find better heuristics
std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD));
- const Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ Type *Int32Ty = Type::getInt32Ty(M.getContext());
for (size_t i = 0, e = Globals.size(); i != e; ) {
size_t j = 0;
@@ -150,7 +150,7 @@ bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
ConstantInt::get(Int32Ty, 0),
ConstantInt::get(Int32Ty, k-i)
};
- Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx, 2);
+ Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(MergedGV, Idx);
Globals[k]->replaceAllUsesWith(GEP);
Globals[k]->eraseFromParent();
}
@@ -176,7 +176,7 @@ bool ARMGlobalMerge::doInitialization(Module &M) {
// Ignore fancy-aligned globals for now.
unsigned Alignment = I->getAlignment();
- const Type *Ty = I->getType()->getElementType();
+ Type *Ty = I->getType()->getElementType();
if (Alignment > TD->getABITypeAlignment(Ty))
continue;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 2c9481b..5ee009c 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -14,8 +14,8 @@
#define DEBUG_TYPE "arm-isel"
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
-#include "ARMAddressingModes.h"
#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
@@ -47,6 +47,11 @@ CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
cl::desc("Check fp vmla / vmls hazard at isel time"),
cl::init(true));
+static cl::opt<bool>
+DisableARMIntABS("disable-arm-int-abs", cl::Hidden,
+ cl::desc("Enable / disable ARM integer abs transform"),
+ cl::init(false));
+
//===--------------------------------------------------------------------===//
/// ARMDAGToDAGISel - ARM specific code to select ARM machine
/// instructions for SelectionDAG operations.
@@ -90,13 +95,20 @@ public:
bool hasNoVMLxHazardUse(SDNode *N) const;
bool isShifterOpProfitable(const SDValue &Shift,
ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
- bool SelectShifterOperandReg(SDValue N, SDValue &A,
+ bool SelectRegShifterOperand(SDValue N, SDValue &A,
SDValue &B, SDValue &C,
bool CheckProfitability = true);
- bool SelectShiftShifterOperandReg(SDValue N, SDValue &A,
+ bool SelectImmShifterOperand(SDValue N, SDValue &A,
+ SDValue &B, bool CheckProfitability = true);
+ bool SelectShiftRegShifterOperand(SDValue N, SDValue &A,
SDValue &B, SDValue &C) {
// Don't apply the profitability check
- return SelectShifterOperandReg(N, A, B, C, false);
+ return SelectRegShifterOperand(N, A, B, C, false);
+ }
+ bool SelectShiftImmShifterOperand(SDValue N, SDValue &A,
+ SDValue &B) {
+ // Don't apply the profitability check
+ return SelectImmShifterOperand(N, A, B, false);
}
bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm);
@@ -122,8 +134,13 @@ public:
return true;
}
- bool SelectAddrMode2Offset(SDNode *Op, SDValue N,
+ bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc);
+ bool SelectAddrOffsetNone(SDValue N, SDValue &Base);
bool SelectAddrMode3(SDValue N, SDValue &Base,
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
@@ -240,8 +257,13 @@ private:
ARMCC::CondCodes CCVal, SDValue CCR,
SDValue InFlag);
+  // Select special operations if the node forms an integer ABS pattern.
+ SDNode *SelectABSOp(SDNode *N);
+
SDNode *SelectConcatVector(SDNode *N);
+ SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions.
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
@@ -291,10 +313,10 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
/// (N * Scale) where N is in [\arg RangeMin, \arg RangeMax).
///
/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
-static bool isScaledConstantInRange(SDValue Node, unsigned Scale,
+static bool isScaledConstantInRange(SDValue Node, int Scale,
int RangeMin, int RangeMax,
int &ScaledConstant) {
- assert(Scale && "Invalid scale!");
+ assert(Scale > 0 && "Invalid scale!");
// Check that this is a constant.
const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Node);
@@ -365,7 +387,30 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
return ShOpcVal == ARM_AM::lsl && ShAmt == 2;
}
-bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N,
+bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
+ SDValue &BaseReg,
+ SDValue &Opc,
+ bool CheckProfitability) {
+ if (DisableShifterOp)
+ return false;
+
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
+
+ // Don't match base register only case. That is matched to a separate
+ // lower complexity pattern with explicit register operand.
+ if (ShOpcVal == ARM_AM::no_shift) return false;
+
+ BaseReg = N.getOperand(0);
+ unsigned ShImmVal = 0;
+ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!RHS) return false;
+ ShImmVal = RHS->getZExtValue() & 31;
+ Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+ MVT::i32);
+ return true;
+}
+
+bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N,
SDValue &BaseReg,
SDValue &ShReg,
SDValue &Opc,
@@ -373,7 +418,7 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N,
if (DisableShifterOp)
return false;
- ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
// Don't match base register only case. That is matched to a separate
// lower complexity pattern with explicit register operand.
@@ -381,19 +426,18 @@ bool ARMDAGToDAGISel::SelectShifterOperandReg(SDValue N,
BaseReg = N.getOperand(0);
unsigned ShImmVal = 0;
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- ShReg = CurDAG->getRegister(0, MVT::i32);
- ShImmVal = RHS->getZExtValue() & 31;
- } else {
- ShReg = N.getOperand(1);
- if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal))
- return false;
- }
+ ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (RHS) return false;
+
+ ShReg = N.getOperand(1);
+ if (CheckProfitability && !isShifterOpProfitable(N, ShOpcVal, ShImmVal))
+ return false;
Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
MVT::i32);
return true;
}
+
bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
SDValue &Base,
SDValue &OffImm) {
@@ -483,13 +527,10 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
return false;
}
- if (Subtarget->isCortexA9() && !N.hasOneUse())
- // Compute R +/- (R << N) and reuse it.
- return false;
-
// Otherwise this is R +/- [possibly shifted] R.
ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::SUB ? ARM_AM::sub:ARM_AM::add;
- ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+ ARM_AM::ShiftOpc ShOpcVal =
+ ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
unsigned ShAmt = 0;
Base = N.getOperand(0);
@@ -515,16 +556,14 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
!(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
- ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+ ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
// fold it.
if (ConstantSDNode *Sh =
dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
ShAmt = Sh->getZExtValue();
- if (!Subtarget->isCortexA9() ||
- (N.hasOneUse() &&
- isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) {
+ if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
Offset = N.getOperand(0).getOperand(0);
Base = N.getOperand(1);
} else {
@@ -630,7 +669,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
// Otherwise this is R +/- [possibly shifted] R.
ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub;
- ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+ ARM_AM::ShiftOpc ShOpcVal =
+ ARM_AM::getShiftOpcForNode(N.getOperand(1).getOpcode());
unsigned ShAmt = 0;
Base = N.getOperand(0);
@@ -656,16 +696,14 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
!(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
- ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+ ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
// fold it.
if (ConstantSDNode *Sh =
dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
ShAmt = Sh->getZExtValue();
- if (!Subtarget->isCortexA9() ||
- (N.hasOneUse() &&
- isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt))) {
+ if (isShifterOpProfitable(N.getOperand(0), ShOpcVal, ShAmt)) {
Offset = N.getOperand(0).getOperand(0);
Base = N.getOperand(1);
} else {
@@ -683,7 +721,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
return AM2_SHOP;
}
-bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
SDValue &Offset, SDValue &Opc) {
unsigned Opcode = Op->getOpcode();
ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
@@ -692,16 +730,11 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
? ARM_AM::add : ARM_AM::sub;
int Val;
- if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
- Offset = CurDAG->getRegister(0, MVT::i32);
- Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
- ARM_AM::no_shift),
- MVT::i32);
- return true;
- }
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val))
+ return false;
Offset = N;
- ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
unsigned ShAmt = 0;
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't fold
@@ -724,6 +757,50 @@ bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
return true;
}
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) {
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+ ? ARM_AM::add : ARM_AM::sub;
+ int Val;
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+ if (AddSub == ARM_AM::sub) Val *= -1;
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ Opc = CurDAG->getTargetConstant(Val, MVT::i32);
+ return true;
+ }
+
+ return false;
+}
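+// Unlike SelectAddrMode2OffsetImm below, which packs the offset into an
+// ARM_AM::getAM2Opc() immediate, the pre-indexed variant above returns a
+// plain signed offset (negated for ARM_AM::sub) to match the addressing mode
+// of the new *_PRE_IMM instructions.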
+
+
+bool ARMDAGToDAGISel::SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
+ SDValue &Offset, SDValue &Opc) {
+ unsigned Opcode = Op->getOpcode();
+ ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+ ? cast<LoadSDNode>(Op)->getAddressingMode()
+ : cast<StoreSDNode>(Op)->getAddressingMode();
+ ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+ ? ARM_AM::add : ARM_AM::sub;
+ int Val;
+ if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) { // 12 bits.
+ Offset = CurDAG->getRegister(0, MVT::i32);
+ Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+ ARM_AM::no_shift),
+ MVT::i32);
+ return true;
+ }
+
+ return false;
+}
+
+bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) {
+ Base = N;
+ return true;
+}
bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
SDValue &Base, SDValue &Offset,
@@ -1079,7 +1156,7 @@ bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg,
if (DisableShifterOp)
return false;
- ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode());
// Don't match base register only case. That is matched to a separate
// lower complexity pattern with explicit register operand.
@@ -1208,21 +1285,15 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
return false;
}
- if (Subtarget->isCortexA9() && !N.hasOneUse()) {
- // Compute R + (R << [1,2,3]) and reuse it.
- Base = N;
- return false;
- }
-
// Look for (R + R) or (R + (R << [1,2,3])).
unsigned ShAmt = 0;
Base = N.getOperand(0);
OffReg = N.getOperand(1);
// Swap if it is ((R << c) + R).
- ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg);
+ ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg.getOpcode());
if (ShOpcVal != ARM_AM::lsl) {
- ShOpcVal = ARM_AM::getShiftOpcForNode(Base);
+ ShOpcVal = ARM_AM::getShiftOpcForNode(Base.getOpcode());
if (ShOpcVal == ARM_AM::lsl)
std::swap(Base, OffReg);
}
@@ -1266,10 +1337,19 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
unsigned Opcode = 0;
bool Match = false;
- if (LoadedVT == MVT::i32 &&
- SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) {
- Opcode = isPre ? ARM::LDR_PRE : ARM::LDR_POST;
+ if (LoadedVT == MVT::i32 && isPre &&
+ SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+ Opcode = ARM::LDR_PRE_IMM;
+ Match = true;
+ } else if (LoadedVT == MVT::i32 && !isPre &&
+ SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+ Opcode = ARM::LDR_POST_IMM;
Match = true;
+ } else if (LoadedVT == MVT::i32 &&
+ SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
+ Opcode = isPre ? ARM::LDR_PRE_REG : ARM::LDR_POST_REG;
+ Match = true;
} else if (LoadedVT == MVT::i16 &&
SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
Match = true;
@@ -1283,20 +1363,37 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
}
} else {
- if (SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) {
+ if (isPre &&
+ SelectAddrMode2OffsetImmPre(N, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = ARM::LDRB_PRE_IMM;
+ } else if (!isPre &&
+ SelectAddrMode2OffsetImm(N, LD->getOffset(), Offset, AMOpc)) {
+ Match = true;
+ Opcode = ARM::LDRB_POST_IMM;
+ } else if (SelectAddrMode2OffsetReg(N, LD->getOffset(), Offset, AMOpc)) {
Match = true;
- Opcode = isPre ? ARM::LDRB_PRE : ARM::LDRB_POST;
+ Opcode = isPre ? ARM::LDRB_PRE_REG : ARM::LDRB_POST_REG;
}
}
}
if (Match) {
- SDValue Chain = LD->getChain();
- SDValue Base = LD->getBasePtr();
- SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
- CurDAG->getRegister(0, MVT::i32), Chain };
- return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32,
- MVT::Other, Ops, 6);
+ if (Opcode == ARM::LDR_PRE_IMM || Opcode == ARM::LDRB_PRE_IMM) {
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, AMOpc, getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+ MVT::i32, MVT::Other, Ops, 5);
+ } else {
+ SDValue Chain = LD->getChain();
+ SDValue Base = LD->getBasePtr();
+ SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32), Chain };
+ return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+ MVT::i32, MVT::Other, Ops, 6);
+ }
}
return NULL;
@@ -1966,7 +2063,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
Srl_imm)) {
assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
- unsigned Width = CountTrailingOnes_32(And_imm);
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = CountTrailingOnes_32(And_imm) - 1;
unsigned LSB = Srl_imm;
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
SDValue Ops[] = { N->getOperand(0).getOperand(0),
@@ -1986,7 +2084,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
unsigned Srl_imm = 0;
if (isInt32Immediate(N->getOperand(1), Srl_imm)) {
assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
- unsigned Width = 32 - Srl_imm;
+ // Note: The width operand is encoded as width-1.
+ unsigned Width = 32 - Srl_imm - 1;
int LSB = Srl_imm - Shl_imm;
if (LSB < 0)
return NULL;
@@ -2034,10 +2133,16 @@ SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
SDValue CPTmp0;
SDValue CPTmp1;
SDValue CPTmp2;
- if (SelectShifterOperandReg(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
+ if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) {
+ SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+ SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag };
+ return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6);
+ }
+
+ if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag };
- return CurDAG->SelectNodeTo(N, ARM::MOVCCs, MVT::i32, Ops, 7);
+ return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7);
}
return 0;
}
@@ -2198,6 +2303,56 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
}
+/// Target-specific DAG combining for ISD::XOR.
+/// Target-independent combining lowers SELECT_CC nodes of the form
+/// select_cc setg[ge] X, 0, X, -X
+/// select_cc setgt X, -1, X, -X
+/// select_cc setl[te] X, 0, -X, X
+/// select_cc setlt X, 1, -X, X
+/// which represent integer ABS, into:
+///   Y = sra (X, size(X)-1); xor (add (X, Y), Y)
+/// ARM instruction selection detects the latter and matches it to an
+/// ARM::ABS or ARM::t2ABS machine node.
+SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N) {
+ SDValue XORSrc0 = N->getOperand(0);
+ SDValue XORSrc1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ if (DisableARMIntABS)
+ return NULL;
+
+ if (Subtarget->isThumb1Only())
+ return NULL;
+
+ if (XORSrc0.getOpcode() != ISD::ADD ||
+ XORSrc1.getOpcode() != ISD::SRA)
+ return NULL;
+
+ SDValue ADDSrc0 = XORSrc0.getOperand(0);
+ SDValue ADDSrc1 = XORSrc0.getOperand(1);
+ SDValue SRASrc0 = XORSrc1.getOperand(0);
+ SDValue SRASrc1 = XORSrc1.getOperand(1);
+ ConstantSDNode *SRAConstant = dyn_cast<ConstantSDNode>(SRASrc1);
+ EVT XType = SRASrc0.getValueType();
+ unsigned Size = XType.getSizeInBits() - 1;
+
+ if (ADDSrc1 == XORSrc1 &&
+ ADDSrc0 == SRASrc0 &&
+ XType.isInteger() &&
+ SRAConstant != NULL &&
+ Size == SRAConstant->getZExtValue()) {
+
+ unsigned Opcode = ARM::ABS;
+ if (Subtarget->isThumb2())
+ Opcode = ARM::t2ABS;
+
+ return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+ }
+
+ return NULL;
+}
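+// A compact, self-contained illustration of the identity matched above
+// (IllustrateABSIdentity is a hypothetical helper for exposition only; the
+// unsigned arithmetic avoids signed overflow, and INT_MIN maps to itself
+// just as the ABS pseudo does):
+static inline int IllustrateABSIdentity(int X) {
+  unsigned Y = (unsigned)(X >> 31);    // Y = sra (X, size(X)-1): 0 or ~0
+  return (int)(((unsigned)X + Y) ^ Y); // xor (add (X, Y), Y) == |X|
+}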
+
SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
// The only time a CONCAT_VECTORS operation can have legal types is when
// two 64-bit vectors are concatenated to a 128-bit vector.
@@ -2207,6 +2362,25 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
return PairDRegs(VT, N->getOperand(0), N->getOperand(1));
}
+SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(Node->getOperand(1)); // Ptr
+ Ops.push_back(Node->getOperand(2)); // Low part of Val1
+ Ops.push_back(Node->getOperand(3)); // High part of Val1
+ if (Opc == ARM::ATOMCMPXCHG6432) {
+ Ops.push_back(Node->getOperand(4)); // Low part of Val2
+ Ops.push_back(Node->getOperand(5)); // High part of Val2
+ }
+ Ops.push_back(Node->getOperand(0)); // Chain
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+ SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+ MVT::i32, MVT::i32, MVT::Other,
+                                           Ops.data(), Ops.size());
+ cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+ return ResNode;
+}
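+// The machine node built above carries three results (the low and high i32
+// halves of the 64-bit value, plus the chain), mirroring how the i64
+// operands arrive split into register-sized halves.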
+
SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
DebugLoc dl = N->getDebugLoc();
@@ -2215,6 +2389,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
switch (N->getOpcode()) {
default: break;
+ case ISD::XOR: {
+    // Select special operations if the XOR node forms an integer ABS pattern.
+ SDNode *ResNode = SelectABSOp(N);
+ if (ResNode)
+ return ResNode;
+ // Other cases are autogenerated.
+ break;
+ }
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
bool UseCP = true;
@@ -2269,8 +2451,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
int FI = cast<FrameIndexSDNode>(N)->getIndex();
SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
if (Subtarget->isThumb1Only()) {
- return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI,
- CurDAG->getTargetConstant(0, MVT::i32));
+ SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+ getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops, 4);
} else {
unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
ARM::t2ADDri : ARM::ADDri);
@@ -2307,7 +2490,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
} else {
SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+ return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops, 7);
}
}
if (isPowerOf2_32(RHSV+1)) { // 2^n-1?
@@ -2323,7 +2506,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6);
} else {
SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
- return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+ return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops, 7);
}
}
}
@@ -2986,6 +3169,23 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::CONCAT_VECTORS:
return SelectConcatVector(N);
+
+ case ARMISD::ATOMOR64_DAG:
+ return SelectAtomic64(N, ARM::ATOMOR6432);
+ case ARMISD::ATOMXOR64_DAG:
+ return SelectAtomic64(N, ARM::ATOMXOR6432);
+ case ARMISD::ATOMADD64_DAG:
+ return SelectAtomic64(N, ARM::ATOMADD6432);
+ case ARMISD::ATOMSUB64_DAG:
+ return SelectAtomic64(N, ARM::ATOMSUB6432);
+ case ARMISD::ATOMNAND64_DAG:
+ return SelectAtomic64(N, ARM::ATOMNAND6432);
+ case ARMISD::ATOMAND64_DAG:
+ return SelectAtomic64(N, ARM::ATOMAND6432);
+ case ARMISD::ATOMSWAP64_DAG:
+ return SelectAtomic64(N, ARM::ATOMSWAP6432);
+ case ARMISD::ATOMCMPXCHG64_DAG:
+ return SelectAtomic64(N, ARM::ATOMCMPXCHG6432);
}
return SelectCode(N);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index cf8c5ba..e44e356 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -14,7 +14,6 @@
#define DEBUG_TYPE "arm-isel"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMCallingConv.h"
#include "ARMConstantPoolValue.h"
#include "ARMISelLowering.h"
@@ -24,6 +23,7 @@
#include "ARMSubtarget.h"
#include "ARMTargetMachine.h"
#include "ARMTargetObjectFile.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
@@ -38,6 +38,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -106,7 +107,7 @@ void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
EVT ElemTy = VT.getVectorElementType();
if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
- setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
+ setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
if (ElemTy != MVT::i32) {
setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
@@ -178,6 +179,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
RegInfo = TM.getRegisterInfo();
Itins = TM.getInstrItineraryData();
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
+
if (Subtarget->isTargetDarwin()) {
// Uses VFP for Thumb libfuncs if available.
if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
@@ -419,6 +422,13 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setLibcallName(RTLIB::MEMSET, "__aeabi_memset");
}
+ // Use divmod compiler-rt calls for iOS 5.0 and later.
+ if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
+ !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
+ setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
+ setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
+ }
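+  // For reference, the compiler-rt entry points selected above have C
+  // signatures along these lines (a sketch of compiler-rt's interface):
+  //   int __divmodsi4(int a, int b, int *rem);  // returns a/b, sets *rem = a%b
+  //   unsigned __udivmodsi4(unsigned a, unsigned b, unsigned *rem);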
+
if (Subtarget->isThumb1Only())
addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
else
@@ -453,7 +463,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
setOperationAction(ISD::FREM, MVT::v2f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
- setOperationAction(ISD::VSETCC, MVT::v2f64, Expand);
+ setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
setOperationAction(ISD::FABS, MVT::v2f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
@@ -485,8 +495,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
- setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
- setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+ setOperationAction(ISD::SETCC, MVT::v1i64, Expand);
+ setOperationAction(ISD::SETCC, MVT::v2i64, Expand);
// Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
// a destination type that is wider than the source.
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
@@ -551,6 +561,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::SRL, MVT::i64, Custom);
setOperationAction(ISD::SRA, MVT::i64, Custom);
+ if (!Subtarget->isThumb1Only()) {
+ // FIXME: We should do this for Thumb1 as well.
+ setOperationAction(ISD::ADDC, MVT::i32, Custom);
+ setOperationAction(ISD::ADDE, MVT::i32, Custom);
+ setOperationAction(ISD::SUBC, MVT::i32, Custom);
+ setOperationAction(ISD::SUBE, MVT::i32, Custom);
+ }
+
// ARM does not have ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Custom);
@@ -596,62 +614,46 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
// the default expansion.
+ // FIXME: This should be checking for v6k, not just v6.
if (Subtarget->hasDataBarrier() ||
(Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
// membarrier needs custom lowering; the rest are legal and handled
// normally.
setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ // Custom lowering for 64-bit ops
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+ // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.

+ setInsertFencesForAtomic(true);
} else {
// Set them all for expansion, which will force libcalls.
setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Expand);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i8, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i16, Expand);
setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+ // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
+ // Unordered/Monotonic case.
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
// Since the libcalls include locking, fold in the fences
setShouldFoldAtomicFences(true);
}
- // 64-bit versions are always libcalls (for now)
- setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Expand);
- setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Expand);
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
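[Aside: marking the i32 atomics above Expand sends them down the libcall path, and because those runtime routines lock internally their ordering is already covered, which is why the fences are folded into them. The semantics being preserved, in portable C++11 terms rather than this file's code:]

    #include <atomic>
    #include <cstdint>

    // ATOMIC_LOAD_ADD has fetch-and-add semantics: the old value is returned
    // and the whole read-modify-write is atomic, ordering included.
    uint32_t atomic_add_example(std::atomic<uint32_t> &counter) {
      return counter.fetch_add(1, std::memory_order_seq_cst);
    }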
@@ -839,6 +841,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG";
case ARMISD::RRX: return "ARMISD::RRX";
+ case ARMISD::ADDC: return "ARMISD::ADDC";
+ case ARMISD::ADDE: return "ARMISD::ADDE";
+ case ARMISD::SUBC: return "ARMISD::SUBC";
+ case ARMISD::SUBE: return "ARMISD::SUBE";
+
case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD";
case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR";
@@ -935,6 +942,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
+EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
+ if (!VT.isVector()) return getPointerTy();
+ return VT.changeVectorElementTypeToInteger();
+}
+
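[Aside: the rule getSetCCResultType encodes is that a scalar compare yields a pointer-sized i32 while a vector compare yields a same-shaped integer mask whose lanes are all-zeros or all-ones, matching the setBooleanVectorContents(ZeroOrNegativeOneBooleanContent) call earlier in the constructor. A portable sketch of that lane convention:]

    #include <cstdint>

    // Lane-wise compare producing the 0 / ~0 masks that NEON vector compares
    // produce, i.e. the ZeroOrNegativeOneBooleanContent convention.
    void vsetcc_mask(const float *a, const float *b, uint32_t *mask, int n) {
      for (int i = 0; i < n; ++i)
        mask[i] = (a[i] < b[i]) ? 0xFFFFFFFFu : 0u;
    }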
/// getRegClassFor - Return the register class that should be used for the
/// specified value type.
TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const {
@@ -1210,8 +1222,8 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
MachineFunction &MF = DAG.getMachineFunction();
bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool IsSibCall = false;
- // Temporarily disable tail calls so things don't break.
- if (!EnableARMTailCalls)
+ // Disable tail calls if they're not supported.
+ if (!EnableARMTailCalls && !Subtarget->supportsTailCall())
isTailCall = false;
if (isTailCall) {
// Check if it's really possible to do a tail call.
@@ -1336,10 +1348,12 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
SDValue Src = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg, SrcOffset);
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset,
MVT::i32);
+ // TODO: Disable AlwaysInline when it becomes possible
+ // to emit a nested call sequence.
MemOpChains.push_back(DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
Flags.getByValAlign(),
/*isVolatile=*/false,
- /*AlwaysInline=*/false,
+ /*AlwaysInline=*/true,
MachinePointerInfo(0),
MachinePointerInfo(0)));
@@ -1404,9 +1418,9 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
const GlobalValue *GV = G->getGlobal();
// Create a constant pool entry for the callee address
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
- ARMPCLabelIndex,
- ARMCP::CPValue, 0);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+
// Get the address of the callee into a register
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -1419,8 +1433,9 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// Create a constant pool entry for the callee address
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
- Sym, ARMPCLabelIndex, 0);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+ ARMPCLabelIndex, 0);
// Get the address of the callee into a register
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -1441,9 +1456,8 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// tBX takes a register source operand.
if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
- ARMPCLabelIndex,
- ARMCP::CPValue, 4);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(getPointerTy(), dl,
@@ -1470,8 +1484,9 @@ ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
const char *Sym = S->getSymbol();
if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
- Sym, ARMPCLabelIndex, 4);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
+ ARMPCLabelIndex, 4);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(getPointerTy(), dl,
@@ -1940,9 +1955,9 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
} else {
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMPCLabelIndex = AFI->createPICLabelUId();
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex,
- ARMCP::CPBlockAddress,
- PCAdj);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
+ ARMCP::CPBlockAddress, PCAdj);
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
}
CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
@@ -1966,8 +1981,8 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
- ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
+ ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
@@ -1982,11 +1997,11 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Argument;
- Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
+ Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
Args.push_back(Entry);
// FIXME: is there useful debug info available here?
std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()),
+ LowerCallTo(Chain, (Type *) Type::getInt32Ty(*DAG.getContext()),
false, false, false, false,
0, CallingConv::C, false, /*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
@@ -2013,8 +2028,9 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
// Initial exec model.
unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
- ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true);
+ ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
+ ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
+ true);
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
@@ -2030,7 +2046,8 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
false, false, 0);
} else {
// local exec model
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, ARMCP::TPOFF);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
@@ -2066,7 +2083,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
if (RelocM == Reloc::PIC_) {
bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+ ARMConstantPoolConstant::Create(GV,
+ UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
@@ -2135,7 +2153,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
ARMPCLabelIndex = AFI->createPICLabelUId();
unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
+ ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue,
+ PCAdj);
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
}
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
@@ -2167,9 +2186,9 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
EVT PtrVT = getPointerTy();
DebugLoc dl = Op.getDebugLoc();
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
- ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
- "_GLOBAL_OFFSET_TABLE_",
- ARMPCLabelIndex, PCAdj);
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
+ ARMPCLabelIndex, PCAdj);
SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -2191,7 +2210,8 @@ SDValue
ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
SDValue Val = DAG.getConstant(0, MVT::i32);
- return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(0),
+ return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
+ DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
Op.getOperand(1), Val);
}
@@ -2224,8 +2244,8 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
unsigned PCAdj = (RelocM != Reloc::PIC_)
? 0 : (Subtarget->isThumb() ? 4 : 8);
ARMConstantPoolValue *CPV =
- new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex,
- ARMCP::CPLSDA, PCAdj);
+ ARMConstantPoolConstant::Create(MF.getFunction(), ARMPCLabelIndex,
+ ARMCP::CPLSDA, PCAdj);
CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result =
@@ -2277,6 +2297,25 @@ static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
DAG.getConstant(DMBOpt, MVT::i32));
}
+
+static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ // FIXME: handle "fence singlethread" more efficiently.
+ DebugLoc dl = Op.getDebugLoc();
+ if (!Subtarget->hasDataBarrier()) {
+ // Some ARMv6 CPUs can support data barriers with an mcr instruction.
+ // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
+ // here.
+ assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
+ "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
+ return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
+ DAG.getConstant(0, MVT::i32));
+ }
+
+ return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
+ DAG.getConstant(ARM_MB::ISH, MVT::i32));
+}
+
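[Aside: the source-level constructs this handles are C/C++11 fences; on targets with a data barrier every ordering is satisfied by a single dmb ish (ARM_MB::ISH above), and barrier-less v6 cores fall back to the mcr encoding. A sketch of the inputs, assuming the usual C++11 mapping:]

    #include <atomic>

    // Each of these lowers through ISD::ATOMIC_FENCE; on ARMv7 all three can
    // be implemented by "dmb ish".
    void fences() {
      std::atomic_thread_fence(std::memory_order_acquire);
      std::atomic_thread_fence(std::memory_order_release);
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }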
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
// ARM pre v5TE and Thumb1 do not have preload instructions.
@@ -2754,7 +2793,7 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue ARMcc;
SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
- return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp);
+ return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
}
ARMCC::CondCodes CondCode, CondCode2;
@@ -2993,8 +3032,8 @@ static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
- EVT OperandVT = Op.getOperand(0).getValueType();
- assert(OperandVT == MVT::v4i16 && "Invalid type for custom lowering!");
+ assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
+ "Invalid type for custom lowering!");
if (VT != MVT::v4f32)
return DAG.UnrollVectorOp(Op.getNode());
@@ -3905,8 +3944,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
// Try an immediate VMVN.
- uint64_t NegatedImm = (SplatBits.getZExtValue() ^
- ((1LL << SplatBitSize) - 1));
+ uint64_t NegatedImm = (~SplatBits).getZExtValue();
Val = isNEONModifiedImm(NegatedImm,
SplatUndef.getZExtValue(), SplatBitSize,
DAG, VmovVT, VT.is128BitVector(),
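[Aside: the VMVN rewrite above is more than style. For a 64-bit splat the old expression shifted 1LL by the full bit width, which C++ leaves undefined, whereas complementing the APInt is well defined at every width. A minimal illustration of the equivalence at 64 bits:]

    #include <cstdint>

    // ~splat equals splat xor an all-ones mask of the value's own width;
    // unlike (1LL << 64) - 1, it never shifts by the full width of the type.
    uint64_t negate_splat(uint64_t splat) {
      return ~splat;
    }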
@@ -4019,6 +4057,14 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
// A shuffle can only come from building a vector from various
// elements of other vectors.
return SDValue();
+ } else if (V.getOperand(0).getValueType().getVectorElementType() !=
+ VT.getVectorElementType()) {
+ // This code doesn't know how to handle shuffles where the vector
+ // element types do not match (this happens because type legalization
+ // promotes the return type of EXTRACT_VECTOR_ELT).
+ // FIXME: It might be appropriate to extend this code to handle
+ // mismatched types.
+ return SDValue();
}
// Record this extraction against the appropriate vector if possible...
@@ -4819,6 +4865,71 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
return N0;
}
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getNode()->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, MVT::i32);
+
+ unsigned Opc;
+ bool ExtraOp = false;
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Invalid code");
+ case ISD::ADDC: Opc = ARMISD::ADDC; break;
+ case ISD::ADDE: Opc = ARMISD::ADDE; ExtraOp = true; break;
+ case ISD::SUBC: Opc = ARMISD::SUBC; break;
+ case ISD::SUBE: Opc = ARMISD::SUBE; ExtraOp = true; break;
+ }
+
+ if (!ExtraOp)
+ return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ Op.getOperand(1));
+ return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+ Op.getOperand(1), Op.getOperand(2));
+}
+
+static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
+ // Monotonic load/store is legal for all targets
+ if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
+ return Op;
+
+ // Acquire/Release load/store is not legal for targets without a
+ // dmb or equivalent available.
+ return SDValue();
+}
+
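[Aside: legal here means a plain ldr/str is enough. Relaxed (Monotonic or weaker) atomic accesses on ARM need no barrier, while stronger orderings return SDValue() above and are expanded with explicit fences. The same distinction in C++11 terms, as a sketch:]

    #include <atomic>
    #include <cstdint>

    // Monotonic corresponds to memory_order_relaxed: a bare ldr/str suffices,
    // so LowerAtomicLoadStore returns the node unchanged for these.
    uint32_t relaxed_load(const std::atomic<uint32_t> &x) {
      return x.load(std::memory_order_relaxed);
    }
    void relaxed_store(std::atomic<uint32_t> &x, uint32_t v) {
      x.store(v, std::memory_order_relaxed);
    }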
+
+static void
+ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
+ SelectionDAG &DAG, unsigned NewOp) {
+ DebugLoc dl = Node->getDebugLoc();
+ assert (Node->getValueType(0) == MVT::i64 &&
+ "Only know how to expand i64 atomics");
+
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(Node->getOperand(0)); // Chain
+ Ops.push_back(Node->getOperand(1)); // Ptr
+ // Low part of Val1
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(0)));
+ // High part of Val1
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(2), DAG.getIntPtrConstant(1)));
+ if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
+ // Low part of Val2
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(3), DAG.getIntPtrConstant(0)));
+ // High part of Val2
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+ Node->getOperand(3), DAG.getIntPtrConstant(1)));
+ }
+ SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ SDValue Result =
+ DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
+ cast<MemSDNode>(Node)->getMemOperand());
+ SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(Result.getValue(2));
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Don't know how to custom lower this!");
@@ -4834,6 +4945,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BR_JT: return LowerBR_JT(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG, Subtarget);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
@@ -4856,7 +4968,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS:
case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
- case ISD::VSETCC: return LowerVSETCC(Op, DAG);
+ case ISD::SETCC: return LowerVSETCC(Op, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
@@ -4865,6 +4977,12 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SDIV: return LowerSDIV(Op, DAG);
case ISD::UDIV: return LowerUDIV(Op, DAG);
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUBC:
+ case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
}
return SDValue();
}
@@ -4886,6 +5004,30 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::SRA:
Res = Expand64BitShift(N, DAG, Subtarget);
break;
+ case ISD::ATOMIC_LOAD_ADD:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_AND:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_NAND:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_OR:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_SUB:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
+ return;
+ case ISD::ATOMIC_LOAD_XOR:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
+ return;
+ case ISD::ATOMIC_SWAP:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
+ return;
+ case ISD::ATOMIC_CMP_SWAP:
+ ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
+ return;
}
if (Res.getNode())
Results.push_back(Res);
@@ -4963,7 +5105,10 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
// cmp dest, oldval
// bne exitMBB
BB = loop1MBB;
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+ MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+ if (ldrOpc == ARM::t2LDREX)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(dest).addReg(oldval));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -4976,8 +5121,10 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
// cmp scratch, #0
// bne loop1MBB
BB = loop2MBB;
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval)
- .addReg(ptr));
+ MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
+ if (strOpc == ARM::t2STREX)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(scratch).addImm(0));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -5063,7 +5210,10 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
// bne- loopMBB
// fallthrough --> exitMBB
BB = loopMBB;
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+ MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+ if (ldrOpc == ARM::t2LDREX)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
if (BinOpcode) {
// operand order needs to go the other way for NAND
if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
@@ -5074,8 +5224,10 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
addReg(dest).addReg(incr)).addReg(0);
}
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
- .addReg(ptr));
+ MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
+ if (strOpc == ARM::t2STREX)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(scratch).addImm(0));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -5125,12 +5277,12 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
case 1:
ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
- extendOpc = isThumb2 ? ARM::t2SXTBr : ARM::SXTBr;
+ extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
break;
case 2:
ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
- extendOpc = isThumb2 ? ARM::t2SXTHr : ARM::SXTHr;
+ extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
break;
case 4:
ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
@@ -5170,12 +5322,17 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
// bne- loopMBB
// fallthrough --> exitMBB
BB = loopMBB;
- AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+ MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+ if (ldrOpc == ARM::t2LDREX)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
// Sign extend the value, if necessary.
if (signExtend && extendOpc) {
oldval = MRI.createVirtualRegister(ARM::GPRRegisterClass);
- AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval).addReg(dest));
+ AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
+ .addReg(dest)
+ .addImm(0));
}
// Build compare and cmov instructions.
@@ -5184,8 +5341,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
.addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
- AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
- .addReg(ptr));
+ MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
+ if (strOpc == ARM::t2STREX)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
.addReg(scratch).addImm(0));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
@@ -5203,79 +5362,596 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
return BB;
}
-static
-MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
- for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
- E = MBB->succ_end(); I != E; ++I)
- if (*I != Succ)
- return *I;
- llvm_unreachable("Expecting a BB with two successors!");
-}
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
+ unsigned Op1, unsigned Op2,
+ bool NeedsCarry, bool IsCmpxchg) const {
+ // This also handles ATOMIC_SWAP, indicated by Op1==0.
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-// FIXME: This opcode table should obviously be expressed in the target
-// description. We probably just need a "machine opcode" value in the pseudo
-// instruction. But the ideal solution maybe to simply remove the "S" version
-// of the opcode altogether.
-struct AddSubFlagsOpcodePair {
- unsigned PseudoOpc;
- unsigned MachineOpc;
-};
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction *MF = BB->getParent();
+ MachineFunction::iterator It = BB;
+ ++It;
-static AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
- {ARM::ADCSri, ARM::ADCri},
- {ARM::ADCSrr, ARM::ADCrr},
- {ARM::ADCSrs, ARM::ADCrs},
- {ARM::SBCSri, ARM::SBCri},
- {ARM::SBCSrr, ARM::SBCrr},
- {ARM::SBCSrs, ARM::SBCrs},
- {ARM::RSBSri, ARM::RSBri},
- {ARM::RSBSrr, ARM::RSBrr},
- {ARM::RSBSrs, ARM::RSBrs},
- {ARM::RSCSri, ARM::RSCri},
- {ARM::RSCSrs, ARM::RSCrs},
- {ARM::t2ADCSri, ARM::t2ADCri},
- {ARM::t2ADCSrr, ARM::t2ADCrr},
- {ARM::t2ADCSrs, ARM::t2ADCrs},
- {ARM::t2SBCSri, ARM::t2SBCri},
- {ARM::t2SBCSrr, ARM::t2SBCrr},
- {ARM::t2SBCSrs, ARM::t2SBCrs},
- {ARM::t2RSBSri, ARM::t2RSBri},
- {ARM::t2RSBSrs, ARM::t2RSBrs},
-};
+ unsigned destlo = MI->getOperand(0).getReg();
+ unsigned desthi = MI->getOperand(1).getReg();
+ unsigned ptr = MI->getOperand(2).getReg();
+ unsigned vallo = MI->getOperand(3).getReg();
+ unsigned valhi = MI->getOperand(4).getReg();
+ DebugLoc dl = MI->getDebugLoc();
+ bool isThumb2 = Subtarget->isThumb2();
-// Convert and Add or Subtract with Carry and Flags to a generic opcode with
-// CPSR<def> operand. e.g. ADCS (...) -> ADC (... CPSR<def>).
-//
-// FIXME: Somewhere we should assert that CPSR<def> is in the correct
-// position to be recognized by the target descrition as the 'S' bit.
-bool ARMTargetLowering::RemapAddSubWithFlags(MachineInstr *MI,
- MachineBasicBlock *BB) const {
- unsigned OldOpc = MI->getOpcode();
- unsigned NewOpc = 0;
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ if (isThumb2) {
+ MRI.constrainRegClass(destlo, ARM::rGPRRegisterClass);
+ MRI.constrainRegClass(desthi, ARM::rGPRRegisterClass);
+ MRI.constrainRegClass(ptr, ARM::rGPRRegisterClass);
+ }
- // This is only called for instructions that need remapping, so iterating over
- // the tiny opcode table is not costly.
- static const int NPairs =
- sizeof(AddSubFlagsOpcodeMap) / sizeof(AddSubFlagsOpcodePair);
- for (AddSubFlagsOpcodePair *Pair = &AddSubFlagsOpcodeMap[0],
- *End = &AddSubFlagsOpcodeMap[NPairs]; Pair != End; ++Pair) {
- if (OldOpc == Pair->PseudoOpc) {
- NewOpc = Pair->MachineOpc;
- break;
+ unsigned ldrOpc = isThumb2 ? ARM::t2LDREXD : ARM::LDREXD;
+ unsigned strOpc = isThumb2 ? ARM::t2STREXD : ARM::STREXD;
+
+ MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *contBB = 0, *cont2BB = 0;
+ if (IsCmpxchg) {
+ contBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ cont2BB = MF->CreateMachineBasicBlock(LLVM_BB);
+ }
+ MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MF->insert(It, loopMBB);
+ if (IsCmpxchg) {
+ MF->insert(It, contBB);
+ MF->insert(It, cont2BB);
+ }
+ MF->insert(It, exitMBB);
+
+ // Transfer the remainder of BB and its successor edges to exitMBB.
+ exitMBB->splice(exitMBB->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ TargetRegisterClass *TRC =
+ isThumb2 ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+ unsigned storesuccess = MRI.createVirtualRegister(TRC);
+
+ // thisMBB:
+ // ...
+ // fallthrough --> loopMBB
+ BB->addSuccessor(loopMBB);
+
+ // loopMBB:
+ // ldrexd r2, r3, ptr
+ // <binopa> r0, r2, incr
+ // <binopb> r1, r3, incr
+ // strexd storesuccess, r0, r1, ptr
+ // cmp storesuccess, #0
+ // bne- loopMBB
+ // fallthrough --> exitMBB
+ //
+ // Note that the registers are explicitly specified because there is no way
+ // to force the register allocator to allocate a register pair.
+ //
+ // FIXME: The hardcoded registers are not necessary for Thumb2, but we
+ // need to properly enforce the restriction that the two output registers
+ // for ldrexd must be different.
+ BB = loopMBB;
+ // Load
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+ .addReg(ARM::R2, RegState::Define)
+ .addReg(ARM::R3, RegState::Define).addReg(ptr));
+ // Copy r2/r3 into dest. (This copy will normally be coalesced.)
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo).addReg(ARM::R2);
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi).addReg(ARM::R3);
+
+ if (IsCmpxchg) {
+ // Add early exit
+ for (unsigned i = 0; i < 2; i++) {
+ AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr :
+ ARM::CMPrr))
+ .addReg(i == 0 ? destlo : desthi)
+ .addReg(i == 0 ? vallo : valhi));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+ BB->addSuccessor(exitMBB);
+ BB->addSuccessor(i == 0 ? contBB : cont2BB);
+ BB = (i == 0 ? contBB : cont2BB);
}
+
+ // Copy to physregs for strexd
+ unsigned setlo = MI->getOperand(5).getReg();
+ unsigned sethi = MI->getOperand(6).getReg();
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(setlo);
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(sethi);
+ } else if (Op1) {
+ // Perform binary operation
+ AddDefaultPred(BuildMI(BB, dl, TII->get(Op1), ARM::R0)
+ .addReg(destlo).addReg(vallo))
+ .addReg(NeedsCarry ? ARM::CPSR : 0, getDefRegState(NeedsCarry));
+ AddDefaultPred(BuildMI(BB, dl, TII->get(Op2), ARM::R1)
+ .addReg(desthi).addReg(valhi)).addReg(0);
+ } else {
+ // Copy to physregs for strexd
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R0).addReg(vallo);
+ BuildMI(BB, dl, TII->get(TargetOpcode::COPY), ARM::R1).addReg(valhi);
}
- if (!NewOpc)
- return false;
+ // Store
+ AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
+ .addReg(ARM::R0).addReg(ARM::R1).addReg(ptr));
+ // Cmp+jump
+ AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+ .addReg(storesuccess).addImm(0));
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+ .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+ BB->addSuccessor(loopMBB);
+ BB->addSuccessor(exitMBB);
+
+ // exitMBB:
+ // ...
+ BB = exitMBB;
+
+ MI->eraseFromParent(); // The instruction is gone now.
+
+ return BB;
+}
+
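[Aside: the machine loop laid out in the loopMBB comments above has the familiar load-linked/store-conditional retry shape. Expressed portably, compare_exchange_weak maps to an ldrexd/strexd pair on ARM, so this sketch mirrors the generated loop:]

    #include <atomic>
    #include <cstdint>

    // Retry until the strexd-equivalent succeeds; on failure the expected
    // value is refreshed, matching the bne back to loopMBB.
    uint64_t atomic_add64(std::atomic<uint64_t> &p, uint64_t incr) {
      uint64_t old = p.load(std::memory_order_relaxed);
      while (!p.compare_exchange_weak(old, old + incr)) {
        // old now holds the current value; retry the add.
      }
      return old;
    }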
+/// EmitBasePointerRecalculation - For functions using a base pointer, we
+/// rematerialize it (via the frame pointer).
+void ARMTargetLowering::
+EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+ MachineFunction &MF = *MI->getParent()->getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+ if (!RI.hasBasePointer(MF)) return;
+
+ MachineBasicBlock::iterator MBBI = MI;
+
+ int32_t NumBytes = AFI->getFramePtrSpillOffset();
+ unsigned FramePtr = RI.getFrameRegister(MF);
+ assert(MF.getTarget().getFrameLowering()->hasFP(MF) &&
+ "Base pointer without frame pointer?");
+
+ if (AFI->isThumb2Function())
+ llvm::emitT2RegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6,
+ FramePtr, -NumBytes, ARMCC::AL, 0, *AII);
+ else if (AFI->isThumbFunction())
+ llvm::emitThumbRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6,
+ FramePtr, -NumBytes, *AII, RI);
+ else
+ llvm::emitARMRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6,
+ FramePtr, -NumBytes, ARMCC::AL, 0, *AII);
+
+ if (!RI.needsStackRealignment(MF)) return;
+
+ // If there's dynamic realignment, adjust for it.
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned MaxAlign = MFI->getMaxAlignment();
+ assert(!AFI->isThumb1OnlyFunction());
+
+ // Emit bic r6, r6, MaxAlign
+ unsigned bicOpc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri;
+ AddDefaultCC(
+ AddDefaultPred(
+ BuildMI(*MBB, MBBI, MI->getDebugLoc(), TII->get(bicOpc), ARM::R6)
+ .addReg(ARM::R6, RegState::Kill)
+ .addImm(MaxAlign - 1)));
+}
+
+/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
+/// registers the function context.
+void ARMTargetLowering::
+SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc dl = MI->getDebugLoc();
- MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
- for (unsigned i = 0; i < MI->getNumOperands(); ++i)
- MIB.addOperand(MI->getOperand(i));
- AddDefaultPred(MIB);
- MIB.addReg(ARM::CPSR, RegState::Define); // S bit
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ MachineConstantPool *MCP = MF->getConstantPool();
+ ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+ const Function *F = MF->getFunction();
+
+ bool isThumb = Subtarget->isThumb();
+ bool isThumb2 = Subtarget->isThumb2();
+
+ unsigned PCLabelId = AFI->createPICLabelUId();
+ unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
+ ARMConstantPoolValue *CPV =
+ ARMConstantPoolMBB::Create(F->getContext(), DispatchBB, PCLabelId, PCAdj);
+ unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+
+ const TargetRegisterClass *TRC =
+ isThumb ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+
+ // Grab constant pool and fixed stack memory operands.
+ MachineMemOperand *CPMMO =
+ MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(),
+ MachineMemOperand::MOLoad, 4, 4);
+
+ MachineMemOperand *FIMMOSt =
+ MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+ MachineMemOperand::MOStore, 4, 4);
+
+ EmitBasePointerRecalculation(MI, MBB, DispatchBB);
+
+ // Load the address of the dispatch MBB into the jump buffer.
+ if (isThumb2) {
+ // Incoming value: jbuf
+ // ldr.n r5, LCPI1_1
+ // orr r5, r5, #1
+ // add r5, pc
+ // str r5, [$jbuf, #+4] ; &jbuf[1]
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO));
+ // Set the low bit because of thumb mode.
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ AddDefaultCC(
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(0x01)));
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
+ .addReg(NewVReg2, RegState::Kill)
+ .addImm(PCLabelId);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
+ .addReg(NewVReg3, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt));
+ } else if (isThumb) {
+ // Incoming value: jbuf
+ // ldr.n r1, LCPI1_4
+ // add r1, pc
+ // mov r2, #1
+ // orrs r1, r2
+ // add r2, $jbuf, #+4 ; &jbuf[1]
+ // str r1, [r2]
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(CPMMO));
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(PCLabelId);
+ // Set the low bit because of thumb mode.
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addImm(1));
+ unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3, RegState::Kill));
+ unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tADDrSPi), NewVReg5)
+ .addFrameIndex(FI)
+ .addImm(36)); // &jbuf[1] :: pc
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
+ .addReg(NewVReg4, RegState::Kill)
+ .addReg(NewVReg5, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(FIMMOSt));
+ } else {
+ // Incoming value: jbuf
+ // ldr r1, LCPI1_1
+ // add r1, pc, r1
+ // str r1, [$jbuf, #+4] ; &jbuf[1]
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
+ .addConstantPoolIndex(CPI)
+ .addImm(0)
+ .addMemOperand(CPMMO));
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
+ .addReg(NewVReg1, RegState::Kill)
+ .addImm(PCLabelId));
+ AddDefaultPred(BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
+ .addReg(NewVReg2, RegState::Kill)
+ .addFrameIndex(FI)
+ .addImm(36) // &jbuf[1] :: pc
+ .addMemOperand(FIMMOSt));
+ }
+}
+
+MachineBasicBlock *ARMTargetLowering::
+EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc dl = MI->getDebugLoc();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ int FI = MFI->getFunctionContextIndex();
+
+ const TargetRegisterClass *TRC =
+ Subtarget->isThumb() ? ARM::tGPRRegisterClass : ARM::GPRRegisterClass;
+
+ // Get a mapping of the call site numbers to all of the landing pads they're
+ // associated with.
+ DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2> > CallSiteNumToLPad;
+ unsigned MaxCSNum = 0;
+ MachineModuleInfo &MMI = MF->getMMI();
+ for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) {
+ if (!BB->isLandingPad()) continue;
+
+ // FIXME: We should assert that the EH_LABEL is the first MI in the landing
+ // pad.
+ for (MachineBasicBlock::iterator
+ II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+ if (!II->isEHLabel()) continue;
+
+ MCSymbol *Sym = II->getOperand(0).getMCSymbol();
+ if (!MMI.hasCallSiteLandingPad(Sym)) continue;
+
+ SmallVectorImpl<unsigned> &CallSiteIdxs = MMI.getCallSiteLandingPad(Sym);
+ for (SmallVectorImpl<unsigned>::iterator
+ CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
+ CSI != CSE; ++CSI) {
+ CallSiteNumToLPad[*CSI].push_back(BB);
+ MaxCSNum = std::max(MaxCSNum, *CSI);
+ }
+ break;
+ }
+ }
+
+ // Get an ordered list of the machine basic blocks for the jump table.
+ std::vector<MachineBasicBlock*> LPadList;
+ SmallPtrSet<MachineBasicBlock*, 64> InvokeBBs;
+ LPadList.reserve(CallSiteNumToLPad.size());
+ for (unsigned I = 1; I <= MaxCSNum; ++I) {
+ SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
+ for (SmallVectorImpl<MachineBasicBlock*>::iterator
+ II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
+ LPadList.push_back(*II);
+ InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
+ }
+ }
+
+ assert(!LPadList.empty() &&
+ "No landing pad destinations for the dispatch jump table!");
+
+ // Create the jump table and associated information.
+ MachineJumpTableInfo *JTI =
+ MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
+ unsigned MJTI = JTI->createJumpTableIndex(LPadList);
+ unsigned UId = AFI->createJumpTableUId();
+
+ // Create the MBBs for the dispatch code.
+
+ // Shove the dispatch's address into the return slot in the function context.
+ MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
+ DispatchBB->setIsLandingPad();
+
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
+ DispatchBB->addSuccessor(TrapBB);
+
+ MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
+ DispatchBB->addSuccessor(DispContBB);
+
+ // Insert and renumber MBBs.
+ MachineBasicBlock *Last = &MF->back();
+ MF->insert(MF->end(), DispatchBB);
+ MF->insert(MF->end(), DispContBB);
+ MF->insert(MF->end(), TrapBB);
+ MF->RenumberBlocks(Last);
+
+ // Insert code into the entry block that creates and registers the function
+ // context.
+ SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
+
+ MachineMemOperand *FIMMOLd =
+ MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+ MachineMemOperand::MOLoad |
+ MachineMemOperand::MOVolatile, 4, 4);
+
+ if (Subtarget->isThumb2()) {
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(FIMMOLd));
+ AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
+ .addReg(NewVReg1)
+ .addImm(LPadList.size()));
+ BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
+ .addMBB(TrapBB)
+ .addImm(ARMCC::HI)
+ .addReg(ARM::CPSR);
+
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT),NewVReg2)
+ .addJumpTableIndex(MJTI)
+ .addImm(UId));
+
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ AddDefaultCC(
+ AddDefaultPred(
+ BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg3)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg1)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+
+ BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
+ .addReg(NewVReg3, RegState::Kill)
+ .addReg(NewVReg1)
+ .addJumpTableIndex(MJTI)
+ .addImm(UId);
+ } else if (Subtarget->isThumb()) {
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(1)
+ .addMemOperand(FIMMOLd));
+
+ AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
+ .addReg(NewVReg1)
+ .addImm(LPadList.size()));
+ BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
+ .addMBB(TrapBB)
+ .addImm(ARMCC::HI)
+ .addReg(ARM::CPSR);
+
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg1)
+ .addImm(2));
+
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
+ .addJumpTableIndex(MJTI)
+ .addImm(UId));
+
+ unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3));
+
+ MachineMemOperand *JTMMOLd =
+ MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+ MachineMemOperand::MOLoad, 4, 4);
+
+ unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
+ .addReg(NewVReg4, RegState::Kill)
+ .addImm(0)
+ .addMemOperand(JTMMOLd));
+
+ unsigned NewVReg6 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(NewVReg5, RegState::Kill)
+ .addReg(NewVReg3));
+
+ BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
+ .addReg(NewVReg6, RegState::Kill)
+ .addJumpTableIndex(MJTI)
+ .addImm(UId);
+ } else {
+ unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
+ .addFrameIndex(FI)
+ .addImm(4)
+ .addMemOperand(FIMMOLd));
+ AddDefaultPred(BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
+ .addReg(NewVReg1)
+ .addImm(LPadList.size()));
+ BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
+ .addMBB(TrapBB)
+ .addImm(ARMCC::HI)
+ .addReg(ARM::CPSR);
+
+ unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
+ AddDefaultCC(
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg2)
+ .addReg(NewVReg1)
+ .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))));
+ unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg3)
+ .addJumpTableIndex(MJTI)
+ .addImm(UId));
+
+ MachineMemOperand *JTMMOLd =
+ MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(),
+ MachineMemOperand::MOLoad, 4, 4);
+ unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
+ AddDefaultPred(
+ BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg4)
+ .addReg(NewVReg2, RegState::Kill)
+ .addReg(NewVReg3)
+ .addImm(0)
+ .addMemOperand(JTMMOLd));
+
+ BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
+ .addReg(NewVReg4, RegState::Kill)
+ .addReg(NewVReg3)
+ .addJumpTableIndex(MJTI)
+ .addImm(UId);
+ }
+
+ // Add the jump table entries as successors to the MBB.
+ MachineBasicBlock *PrevMBB = 0;
+ for (std::vector<MachineBasicBlock*>::iterator
+ I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
+ MachineBasicBlock *CurMBB = *I;
+ if (PrevMBB != CurMBB)
+ DispContBB->addSuccessor(CurMBB);
+ PrevMBB = CurMBB;
+ }
+
+ const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+ const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+ const unsigned *SavedRegs = RI.getCalleeSavedRegs(MF);
+ for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
+ I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
+ MachineBasicBlock *BB = *I;
+
+ // Remove the landing pad successor from the invoke block and replace it
+ // with the new dispatch block.
+ for (MachineBasicBlock::succ_iterator
+ SI = BB->succ_begin(), SE = BB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock *SMBB = *SI;
+ if (SMBB->isLandingPad()) {
+ BB->removeSuccessor(SMBB);
+ SMBB->setIsLandingPad(false);
+ }
+ }
+
+ BB->addSuccessor(DispatchBB);
+
+ // Find the invoke call and mark all of the callee-saved registers as
+ // 'implicit defined' so that they're spilled. This prevents later passes
+ // from moving instructions to before the EH block, where they would never
+ // be executed.
+ for (MachineBasicBlock::reverse_iterator
+ II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
+ if (!II->getDesc().isCall()) continue;
+
+ DenseMap<unsigned, bool> DefRegs;
+ for (MachineInstr::mop_iterator
+ OI = II->operands_begin(), OE = II->operands_end();
+ OI != OE; ++OI) {
+ if (!OI->isReg()) continue;
+ DefRegs[OI->getReg()] = true;
+ }
+
+ MachineInstrBuilder MIB(&*II);
+
+ for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
+ if (!TRC->contains(SavedRegs[i])) continue;
+ if (!DefRegs[SavedRegs[i]])
+ MIB.addReg(SavedRegs[i], RegState::ImplicitDefine | RegState::Dead);
+ }
+
+ break;
+ }
+ }
+
+ // The instruction is gone now.
MI->eraseFromParent();
- return true;
+
+ return MBB;
+}
+
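[Aside: functionally, the dispatch block is a bounds-checked computed jump over the landing-pad table: load the call-site number stored in the function context, trap when it is out of range, otherwise branch through the inline jump table. A loose sketch with the index handling simplified; the names here are illustrative only:]

    #include <cstdlib>

    using Handler = void (*)();

    // cmp NewVReg1, LPadList.size(); bhi TrapBB; otherwise an indexed jump
    // through the table built from LPadList (call-site numbers start at 1).
    void dispatch(unsigned callSite, const Handler *lpadTable,
                  unsigned numLPads) {
      if (callSite == 0 || callSite > numLPads)
        std::abort();            // TrapBB
      lpadTable[callSite - 1](); // DispContBB
    }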
+static
+MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end(); I != E; ++I)
+ if (*I != Succ)
+ return *I;
+ llvm_unreachable("Expecting a BB with two successors!");
}
MachineBasicBlock *
@@ -5286,12 +5962,61 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
bool isThumb2 = Subtarget->isThumb2();
switch (MI->getOpcode()) {
default: {
- if (RemapAddSubWithFlags(MI, BB))
- return BB;
-
MI->dump();
llvm_unreachable("Unexpected instr type to insert");
}
+ // The Thumb2 pre-indexed stores have the same MI operands; the .td files
+ // just define them differently from the isel patterns, so they need
+ // pseudos.
+ case ARM::t2STR_preidx:
+ MI->setDesc(TII->get(ARM::t2STR_PRE));
+ return BB;
+ case ARM::t2STRB_preidx:
+ MI->setDesc(TII->get(ARM::t2STRB_PRE));
+ return BB;
+ case ARM::t2STRH_preidx:
+ MI->setDesc(TII->get(ARM::t2STRH_PRE));
+ return BB;
+
+ case ARM::STRi_preidx:
+ case ARM::STRBi_preidx: {
+ unsigned NewOpc = MI->getOpcode() == ARM::STRi_preidx ?
+ ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM;
+ // Decode the offset.
+ unsigned Offset = MI->getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
+ Offset = ARM_AM::getAM2Offset(Offset);
+ if (isSub)
+ Offset = -Offset;
+
+ MachineMemOperand *MMO = *MI->memoperands_begin();
+ BuildMI(*BB, MI, dl, TII->get(NewOpc))
+ .addOperand(MI->getOperand(0)) // Rn_wb
+ .addOperand(MI->getOperand(1)) // Rt
+ .addOperand(MI->getOperand(2)) // Rn
+ .addImm(Offset) // offset (skip GPR==zero_reg)
+ .addOperand(MI->getOperand(5)) // pred
+ .addOperand(MI->getOperand(6))
+ .addMemOperand(MMO);
+ MI->eraseFromParent();
+ return BB;
+ }
+ case ARM::STRr_preidx:
+ case ARM::STRBr_preidx:
+ case ARM::STRH_preidx: {
+ unsigned NewOpc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("unexpected opcode!");
+ case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
+ case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
+ case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
+ }
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
+ for (unsigned i = 0; i < MI->getNumOperands(); ++i)
+ MIB.addOperand(MI->getOperand(i));
+ MI->eraseFromParent();
+ return BB;
+ }
case ARM::ATOMIC_LOAD_ADD_I8:
return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
case ARM::ATOMIC_LOAD_ADD_I16:
@@ -5370,6 +6095,31 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
+
+ case ARM::ATOMADD6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
+ isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
+ /*NeedsCarry*/ true);
+ case ARM::ATOMSUB6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ true);
+ case ARM::ATOMOR6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
+ isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+ case ARM::ATOMXOR6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
+ isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+ case ARM::ATOMAND6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
+ isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+ case ARM::ATOMSWAP6432:
+ return EmitAtomicBinary64(MI, BB, 0, 0, false);
+ case ARM::ATOMCMPXCHG6432:
+ return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
+ isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
+ /*NeedsCarry*/ false, /*IsCmpxchg*/true);
+
case ARM::tMOVCCr_pseudo: {
// To "insert" a SELECT_CC instruction, we actually have to insert the
// diamond control-flow pattern. The incoming instruction knows the
@@ -5461,13 +6211,159 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
.addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
- BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2B : ARM::B))
- .addMBB(exitMBB);
+ if (isThumb2)
+ AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2B)).addMBB(exitMBB));
+ else
+ BuildMI(BB, dl, TII->get(ARM::B)).addMBB(exitMBB);
MI->eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
+
+ case ARM::ABS:
+ case ARM::t2ABS: {
+ // To insert an ABS instruction, we have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // source vreg to test against 0, the destination vreg to set,
+ // the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ // It transforms
+ // V1 = ABS V0
+ // into
+ // V2 = MOVS V0
+ // BCC (branch to SinkBB if V0 >= 0)
+ // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0)
+ // SinkBB: V1 = PHI(V2, V3)
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator BBI = BB;
+ ++BBI;
+ MachineFunction *Fn = BB->getParent();
+ MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB);
+ Fn->insert(BBI, RSBBB);
+ Fn->insert(BBI, SinkBB);
+
+ unsigned int ABSSrcReg = MI->getOperand(1).getReg();
+ unsigned int ABSDstReg = MI->getOperand(0).getReg();
+ bool isThumb2 = Subtarget->isThumb2();
+ MachineRegisterInfo &MRI = Fn->getRegInfo();
+ // In Thumb mode S must not be specified if source register is the SP or
+ // PC and if destination register is the SP, so restrict register class
+ unsigned NewMovDstReg = MRI.createVirtualRegister(
+ isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass);
+ unsigned NewRsbDstReg = MRI.createVirtualRegister(
+ isThumb2 ? ARM::rGPRRegisterClass : ARM::GPRRegisterClass);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ SinkBB->splice(SinkBB->begin(), BB,
+ llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ SinkBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ BB->addSuccessor(RSBBB);
+ BB->addSuccessor(SinkBB);
+
+ // RSBBB falls through to SinkBB
+ RSBBB->addSuccessor(SinkBB);
+
+ // insert a movs at the end of BB
+ BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVr : ARM::MOVr),
+ NewMovDstReg)
+ .addReg(ABSSrcReg, RegState::Kill)
+ .addImm((unsigned)ARMCC::AL).addReg(0)
+ .addReg(ARM::CPSR, RegState::Define);
+
+ // insert a bcc with opposite CC to ARMCC::MI at the end of BB
+ BuildMI(BB, dl,
+ TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
+ .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
+
+ // insert rsbri in RSBBB
+ // Note: BCC and rsbri will be converted into a predicated rsbmi
+ // by the if-conversion pass.
+ BuildMI(*RSBBB, RSBBB->begin(), dl,
+ TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
+ .addReg(NewMovDstReg, RegState::Kill)
+ .addImm(0).addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+
+ // insert PHI in SinkBB,
+ // reuse ABSDstReg to not change uses of ABS instruction
+ BuildMI(*SinkBB, SinkBB->begin(), dl,
+ TII->get(ARM::PHI), ABSDstReg)
+ .addReg(NewRsbDstReg).addMBB(RSBBB)
+ .addReg(NewMovDstReg).addMBB(BB);
+
+ // remove ABS instruction
+ MI->eraseFromParent();
+
+ // return last added BB
+ return SinkBB;
+ }
+ }
+}
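For orientation, the diamond built above computes an ordinary branchy absolute value. A minimal C++ sketch of the dataflow, with comments mapping each step to the blocks created in the code (an illustration, not the emitted MI sequence):

int absLowered(int V) {
  int Moved = V;        // MOVS: copy the value and set CPSR from it
  if (Moved < 0)        // Bcc skips to SinkBB unless ARMCC::MI holds
    Moved = 0 - Moved;  // RSBri in RSBBB negates on the negative path
  return Moved;         // the PHI in SinkBB merges the two values
}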
+
+void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
+ SDNode *Node) const {
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (!MCID.hasPostISelHook()) {
+ assert(!convertAddSubFlagsOpcode(MI->getOpcode()) &&
+ "Pseudo flag-setting opcodes must be marked with 'hasPostISelHook'");
+ return;
+ }
+
+ // Adjust instructions that may set the 's' bit after isel, i.e. ADC, SBC, RSB,
+ // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
+ // operand is still set to noreg. If needed, set the optional operand's
+ // register to CPSR, and remove the redundant implicit def.
+ //
+ // e.g. ADCS (...opt:%noreg, CPSR<imp-def>) -> ADC (... opt:CPSR<def>).
+
+ // Rename pseudo opcodes.
+ unsigned NewOpc = convertAddSubFlagsOpcode(MI->getOpcode());
+ if (NewOpc) {
+ const ARMBaseInstrInfo *TII =
+ static_cast<const ARMBaseInstrInfo*>(getTargetMachine().getInstrInfo());
+ MI->setDesc(TII->get(NewOpc));
+ }
+ unsigned ccOutIdx = MCID.getNumOperands() - 1;
+
+ // Any ARM instruction that sets the 's' bit should specify an optional
+ // "cc_out" operand in the last operand position.
+ if (!MCID.hasOptionalDef() || !MCID.OpInfo[ccOutIdx].isOptionalDef()) {
+ assert(!NewOpc && "Optional cc_out operand required");
+ return;
+ }
+ // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
+ // since we already have an optional CPSR def.
+ bool definesCPSR = false;
+ bool deadCPSR = false;
+ for (unsigned i = MCID.getNumOperands(), e = MI->getNumOperands();
+ i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
+ definesCPSR = true;
+ if (MO.isDead())
+ deadCPSR = true;
+ MI->RemoveOperand(i);
+ break;
+ }
+ }
+ if (!definesCPSR) {
+ assert(!NewOpc && "Optional cc_out operand required");
+ return;
+ }
+ assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
+ if (deadCPSR) {
+ assert(!MI->getOperand(ccOutIdx).getReg() &&
+ "expect uninitialized optional cc_out operand");
+ return;
}
+
+ // If this instruction was defined with an optional CPSR def and its dag node
+ // had a live implicit CPSR def, then activate the optional CPSR def.
+ MachineOperand &MO = MI->getOperand(ccOutIdx);
+ MO.setReg(ARM::CPSR);
+ MO.setIsDef(true);
}
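The operand surgery above is easy to lose in the MachineInstr plumbing. Below is a stand-alone model of the same decision logic; Operand is a hypothetical stand-in for MachineOperand (a sketch under that assumption, not the real API):

#include <cstddef>
#include <vector>

struct Operand { unsigned Reg; bool IsDef; bool IsDead; };
enum { NoReg = 0, CPSR = 1 };   // cc_out defaults to noreg (0)

// Mirrors AdjustInstrPostInstrSelection: find the implicit CPSR def
// past the static operands, remove it, and either leave cc_out as
// noreg (dead CPSR) or point it at CPSR (live CPSR).
void adjustCCOut(std::vector<Operand> &Ops, unsigned CcOutIdx) {
  bool DefinesCPSR = false, DeadCPSR = false;
  for (std::size_t I = CcOutIdx + 1; I < Ops.size(); ++I) {
    if (Ops[I].IsDef && Ops[I].Reg == CPSR) {
      DefinesCPSR = true;
      DeadCPSR = Ops[I].IsDead;
      Ops.erase(Ops.begin() + I);
      break;
    }
  }
  if (!DefinesCPSR || DeadCPSR)
    return;                                  // cc_out stays noreg
  Ops[CcOutIdx] = {CPSR, /*IsDef=*/true, /*IsDead=*/false};
}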
//===----------------------------------------------------------------------===//
@@ -6975,7 +7871,8 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
SDValue FalseVal = N->getOperand(0);
SDValue TrueVal = N->getOperand(1);
SDValue ARMcc = N->getOperand(2);
- ARMCC::CondCodes CC = (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
+ ARMCC::CondCodes CC =
+ (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
// Simplify
// mov r1, r0
@@ -6995,7 +7892,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
// movne r0, y
/// FIXME: Turn this into a target neutral optimization?
SDValue Res;
- if (CC == ARMCC::NE && FalseVal == RHS) {
+ if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
N->getOperand(3), Cmp);
} else if (CC == ARMCC::EQ && TrueVal == RHS) {
@@ -7235,7 +8132,7 @@ bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const {
+ Type *Ty) const {
EVT VT = getValueType(Ty, true);
if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
return false;
@@ -7351,7 +8248,8 @@ static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
if (Ptr->getOpcode() == ISD::ADD) {
isInc = true;
- ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0));
+ ARM_AM::ShiftOpc ShOpcVal=
+ ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
Base = Ptr->getOperand(1);
Offset = Ptr->getOperand(0);
@@ -7536,7 +8434,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
if (AsmPieces.size() == 3 &&
AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" &&
IA->getConstraintString().compare(0, 4, "=l,l") == 0) {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (Ty && Ty->getBitWidth() == 32)
return IntrinsicLowering::LowerToByteSwap(CI);
}
@@ -7559,6 +8457,9 @@ ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
case 'x': return C_RegisterClass;
case 't': return C_RegisterClass;
case 'j': return C_Other; // Constant for movw.
+ // An address with a single base register. Due to the way we
+ // currently handle addresses, it is the same as an 'r' memory constraint.
+ case 'Q': return C_Memory;
}
} else if (Constraint.size() == 2) {
switch (Constraint[0]) {
@@ -7582,7 +8483,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
// but allow it at the lowest weight.
if (CallOperandVal == NULL)
return CW_Default;
- const Type *type = CallOperandVal->getType();
+ Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
@@ -7618,7 +8519,7 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return RCPair(0U, ARM::GPRRegisterClass);
case 'h': // High regs or no regs.
if (Subtarget->isThumb())
- return RCPair(0U, ARM::hGPRRegisterClass);
+ return RCPair(0U, ARM::hGPRRegisterClass);
break;
case 'r':
return RCPair(0U, ARM::GPRRegisterClass);
@@ -7632,15 +8533,15 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
break;
case 'x':
if (VT == MVT::f32)
- return RCPair(0U, ARM::SPR_8RegisterClass);
+ return RCPair(0U, ARM::SPR_8RegisterClass);
if (VT.getSizeInBits() == 64)
- return RCPair(0U, ARM::DPR_8RegisterClass);
+ return RCPair(0U, ARM::DPR_8RegisterClass);
if (VT.getSizeInBits() == 128)
- return RCPair(0U, ARM::QPR_8RegisterClass);
+ return RCPair(0U, ARM::QPR_8RegisterClass);
break;
case 't':
if (VT == MVT::f32)
- return RCPair(0U, ARM::SPRRegisterClass);
+ return RCPair(0U, ARM::SPRRegisterClass);
break;
}
}
@@ -7680,12 +8581,12 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
switch (ConstraintLetter) {
case 'j':
- // Constant suitable for movw, must be between 0 and
- // 65535.
- if (Subtarget->hasV6T2Ops())
- if (CVal >= 0 && CVal <= 65535)
- break;
- return;
+ // Constant suitable for movw, must be between 0 and
+ // 65535.
+ if (Subtarget->hasV6T2Ops())
+ if (CVal >= 0 && CVal <= 65535)
+ break;
+ return;
case 'I':
if (Subtarget->isThumb1Only()) {
// This must be a constant between 0 and 255, for ADD
@@ -7823,50 +8724,6 @@ ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
return false;
}
-int ARM::getVFPf32Imm(const APFloat &FPImm) {
- APInt Imm = FPImm.bitcastToAPInt();
- uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
- int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
- int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
-
- // We can handle 4 bits of mantissa.
- // mantissa = (16+UInt(e:f:g:h))/16.
- if (Mantissa & 0x7ffff)
- return -1;
- Mantissa >>= 19;
- if ((Mantissa & 0xf) != Mantissa)
- return -1;
-
- // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
- if (Exp < -3 || Exp > 4)
- return -1;
- Exp = ((Exp+3) & 0x7) ^ 4;
-
- return ((int)Sign << 7) | (Exp << 4) | Mantissa;
-}
-
-int ARM::getVFPf64Imm(const APFloat &FPImm) {
- APInt Imm = FPImm.bitcastToAPInt();
- uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
- int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
- uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffLL;
-
- // We can handle 4 bits of mantissa.
- // mantissa = (16+UInt(e:f:g:h))/16.
- if (Mantissa & 0xffffffffffffLL)
- return -1;
- Mantissa >>= 48;
- if ((Mantissa & 0xf) != Mantissa)
- return -1;
-
- // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
- if (Exp < -3 || Exp > 4)
- return -1;
- Exp = ((Exp+3) & 0x7) ^ 4;
-
- return ((int)Sign << 7) | (Exp << 4) | Mantissa;
-}
-
bool ARM::isBitFieldInvertedMask(unsigned v) {
if (v == 0xffffffff)
return 0;
@@ -7889,9 +8746,9 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (!Subtarget->hasVFP3())
return false;
if (VT == MVT::f32)
- return ARM::getVFPf32Imm(Imm) != -1;
+ return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64)
- return ARM::getVFPf64Imm(Imm) != -1;
+ return ARM_AM::getFP64Imm(Imm) != -1;
return false;
}
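The deleted ARM::getVFPf32Imm above (living on as ARM_AM::getFP32Imm) is the whole story of which floats VMOV.f32 can materialize: a sign bit, a 3-bit exponent in [-3,4], and 4 mantissa bits. A self-checking restatement of that logic with a worked example (a sketch mirroring the removed code, not the in-tree implementation):

#include <cassert>
#include <cstdint>
#include <cstring>

static int vfpF32Imm(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t Sign = (Bits >> 31) & 1;
  int32_t Exp = int32_t((Bits >> 23) & 0xff) - 127;
  uint32_t Mantissa = Bits & 0x7fffff;
  if (Mantissa & 0x7ffff)              // only 4 mantissa bits survive
    return -1;
  Mantissa >>= 19;
  if (Exp < -3 || Exp > 4)             // only 3 exponent bits
    return -1;
  Exp = ((Exp + 3) & 0x7) ^ 4;
  return (int(Sign) << 7) | (Exp << 4) | int(Mantissa);
}

int main() {
  assert(vfpF32Imm(1.0f) == 0x70);     // encodable: exp 0, mantissa 0
  assert(vfpF32Imm(0.1f) == -1);       // too many mantissa bits
}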
@@ -7933,7 +8790,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
- const Type *ArgTy = I.getArgOperand(ArgI)->getType();
+ Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 980fb40..5da9b27 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -71,6 +71,11 @@ namespace llvm {
SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out.
RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag.
+ ADDC, // Add with carry
+ ADDE, // Add using carry
+ SUBC, // Sub with carry
+ SUBE, // Sub using carry
+
VMOVRRD, // double to two gprs.
VMOVDRR, // Two gprs to double.
@@ -206,18 +211,22 @@ namespace llvm {
VST4_UPD,
VST2LN_UPD,
VST3LN_UPD,
- VST4LN_UPD
+ VST4LN_UPD,
+
+ // 64-bit atomic ops (value split into two registers)
+ ATOMADD64_DAG,
+ ATOMSUB64_DAG,
+ ATOMOR64_DAG,
+ ATOMXOR64_DAG,
+ ATOMAND64_DAG,
+ ATOMNAND64_DAG,
+ ATOMSWAP64_DAG,
+ ATOMCMPXCHG64_DAG
};
}
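These nodes carry 64-bit atomics with the value split across a lo/hi register pair; EmitAtomicBinary64 expands them into an LDREXD/STREXD retry loop. A rough C++ model of what that loop does for ATOMADD64_DAG, where load_exclusive64/store_exclusive64 are hypothetical stand-ins for the exclusive accesses (not real intrinsics):

#include <cstdint>

extern "C" uint64_t load_exclusive64(volatile uint64_t *Addr);  // LDREXD
extern "C" bool store_exclusive64(volatile uint64_t *Addr,      // STREXD;
                                  uint64_t Val);                // true on success

uint64_t atomicAdd64(volatile uint64_t *Addr, uint64_t Val) {
  uint64_t Old;
  do {
    Old = load_exclusive64(Addr);   // paired load of the lo/hi halves
    // The real expansion adds the halves with ADDS then ADC, which is
    // why EmitAtomicBinary64 takes a NeedsCarry flag.
  } while (!store_exclusive64(Addr, Old + Val));
  return Old;
}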
/// Define some predicates that are used for node matching.
namespace ARM {
- /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be
- /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd)
- /// instruction, returns its 8-bit integer representation. Otherwise,
- /// returns -1.
- int getVFPf32Imm(const APFloat &FPImm);
- int getVFPf64Imm(const APFloat &FPImm);
bool isBitFieldInvertedMask(unsigned v);
}
@@ -240,10 +249,16 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
+ /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+ virtual EVT getSetCCResultType(EVT VT) const;
+
virtual MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *MBB) const;
+ virtual void
+ AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const;
+
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -256,7 +271,7 @@ namespace llvm {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+ virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
/// isLegalICmpImmediate - Return true if the specified immediate is legal
@@ -485,12 +500,28 @@ namespace llvm {
MachineBasicBlock *BB,
unsigned Size,
unsigned BinOpcode) const;
+ MachineBasicBlock *EmitAtomicBinary64(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Op1,
+ unsigned Op2,
+ bool NeedsCarry = false,
+ bool IsCmpxchg = false) const;
MachineBasicBlock * EmitAtomicBinaryMinMax(MachineInstr *MI,
MachineBasicBlock *BB,
unsigned Size,
bool signExtend,
ARMCC::CondCodes Cond) const;
+ void EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB) const;
+
+ void SetupEntryBlockForSjLj(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DispatchBB, int FI) const;
+
+ MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
bool RemapAddSubWithFlags(MachineInstr *MI, MachineBasicBlock *BB) const;
};
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 3ccf22f..7cbc911 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -25,7 +25,7 @@ def BrFrm : Format<2>;
def BrMiscFrm : Format<3>;
def DPFrm : Format<4>;
-def DPSoRegFrm : Format<5>;
+def DPSoRegRegFrm : Format<5>;
def LdFrm : Format<6>;
def StFrm : Format<7>;
@@ -68,6 +68,7 @@ def N3RegVShFrm : Format<38>;
def NVExtFrm : Format<39>;
def NVMulSLFrm : Format<40>;
def NVTBLFrm : Format<41>;
+def DPSoRegImmFrm : Format<42>;
// Misc flags.
@@ -130,39 +131,15 @@ def VFPNeonA8Domain : Domain<5>; // Instructions in VFP & Neon under A8
// ARM special operands.
//
-def CondCodeOperand : AsmOperandClass {
- let Name = "CondCode";
- let SuperClasses = [];
-}
-
-def CCOutOperand : AsmOperandClass {
- let Name = "CCOut";
- let SuperClasses = [];
-}
-
-def MemBarrierOptOperand : AsmOperandClass {
- let Name = "MemBarrierOpt";
- let SuperClasses = [];
- let ParserMethod = "tryParseMemBarrierOptOperand";
-}
-
-def ProcIFlagsOperand : AsmOperandClass {
- let Name = "ProcIFlags";
- let SuperClasses = [];
- let ParserMethod = "tryParseProcIFlagsOperand";
-}
-
-def MSRMaskOperand : AsmOperandClass {
- let Name = "MSRMask";
- let SuperClasses = [];
- let ParserMethod = "tryParseMSRMaskOperand";
-}
-
// ARM imod and iflag operands, used only by the CPS instruction.
def imod_op : Operand<i32> {
let PrintMethod = "printCPSIMod";
}
+def ProcIFlagsOperand : AsmOperandClass {
+ let Name = "ProcIFlags";
+ let ParserMethod = "parseProcIFlagsOperand";
+}
def iflags_op : Operand<i32> {
let PrintMethod = "printCPSIFlag";
let ParserMatchClass = ProcIFlagsOperand;
@@ -170,17 +147,21 @@ def iflags_op : Operand<i32> {
// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
// register whose default is 0 (no register).
-def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
+def CondCodeOperand : AsmOperandClass { let Name = "CondCode"; }
+def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
(ops (i32 14), (i32 zero_reg))> {
let PrintMethod = "printPredicateOperand";
let ParserMatchClass = CondCodeOperand;
+ let DecoderMethod = "DecodePredicateOperand";
}
// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
let EncoderMethod = "getCCOutOpValue";
let PrintMethod = "printSBitModifierOperand";
let ParserMatchClass = CCOutOperand;
+ let DecoderMethod = "DecodeCCOutOperand";
}
// Same as cc_out except it defaults to setting CPSR.
@@ -188,16 +169,27 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
let EncoderMethod = "getCCOutOpValue";
let PrintMethod = "printSBitModifierOperand";
let ParserMatchClass = CCOutOperand;
+ let DecoderMethod = "DecodeCCOutOperand";
}
// ARM special operands for disassembly only.
//
+def SetEndAsmOperand : AsmOperandClass {
+ let Name = "SetEndImm";
+ let ParserMethod = "parseSetEndImm";
+}
def setend_op : Operand<i32> {
let PrintMethod = "printSetendOperand";
+ let ParserMatchClass = SetEndAsmOperand;
}
+def MSRMaskOperand : AsmOperandClass {
+ let Name = "MSRMask";
+ let ParserMethod = "parseMSRMaskOperand";
+}
def msr_mask : Operand<i32> {
let PrintMethod = "printMSRMaskOperand";
+ let DecoderMethod = "DecodeMSRMask";
let ParserMatchClass = MSRMaskOperand;
}
@@ -211,21 +203,40 @@ def msr_mask : Operand<i32> {
// 64 64 - <imm> is encoded in imm6<5:0>
def shr_imm8 : Operand<i32> {
let EncoderMethod = "getShiftRight8Imm";
+ let DecoderMethod = "DecodeShiftRight8Imm";
}
def shr_imm16 : Operand<i32> {
let EncoderMethod = "getShiftRight16Imm";
+ let DecoderMethod = "DecodeShiftRight16Imm";
}
def shr_imm32 : Operand<i32> {
let EncoderMethod = "getShiftRight32Imm";
+ let DecoderMethod = "DecodeShiftRight32Imm";
}
def shr_imm64 : Operand<i32> {
let EncoderMethod = "getShiftRight64Imm";
+ let DecoderMethod = "DecodeShiftRight64Imm";
}
//===----------------------------------------------------------------------===//
+// ARM Assembler alias templates.
+//
+class ARMInstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, Requires<[IsARM]>;
+class tInstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, Requires<[IsThumb]>;
+class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
+class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
+class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
+
+//===----------------------------------------------------------------------===//
// ARM Instruction templates.
//
+
class InstTemplate<AddrMode am, int sz, IndexMode im,
Format f, Domain d, string cstr, InstrItinClass itin>
: Instruction {
@@ -240,17 +251,22 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
Domain D = d;
bit isUnaryDataProc = 0;
bit canXformTo16Bit = 0;
+ // The instruction is a 16-bit flag-setting Thumb instruction. Used
+ // by the parser to determine whether to require the 'S' suffix on the
+ // mnemonic (when not in an IT block) or preclude it (when in an IT block).
+ bit thumbArithFlagSetting = 0;
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
- // The layout of TSFlags should be kept in sync with ARMBaseInstrInfo.h.
+ // The layout of TSFlags should be kept in sync with ARMBaseInfo.h.
let TSFlags{4-0} = AM.Value;
let TSFlags{6-5} = IndexModeBits;
let TSFlags{12-7} = Form;
let TSFlags{13} = isUnaryDataProc;
let TSFlags{14} = canXformTo16Bit;
let TSFlags{17-15} = D.Value;
+ let TSFlags{18} = thumbArithFlagSetting;
let Constraints = cstr;
let Itinerary = itin;
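On the C++ side, each TSFlags field is recovered with a shift and mask at the positions assigned above. A minimal sketch for the new bit (the real accessors and enums live alongside the layout in ARMBaseInfo.h):

static inline bool isThumbArithFlagSetting(uint64_t TSFlags) {
  return (TSFlags >> 18) & 1;   // TSFlags{18} = thumbArithFlagSetting
}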
@@ -262,13 +278,17 @@ class Encoding {
class InstARM<AddrMode am, int sz, IndexMode im,
Format f, Domain d, string cstr, InstrItinClass itin>
- : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding;
+ : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding {
+ let DecoderNamespace = "ARM";
+}
// This Encoding-less class is used by Thumb1 to specify the encoding bits later
// on by adding flavors to specific instructions.
class InstThumb<AddrMode am, int sz, IndexMode im,
Format f, Domain d, string cstr, InstrItinClass itin>
- : InstTemplate<am, sz, im, f, d, cstr, itin>;
+ : InstTemplate<am, sz, im, f, d, cstr, itin> {
+ let DecoderNamespace = "Thumb";
+}
class PseudoInst<dag oops, dag iops, InstrItinClass itin, list<dag> pattern>
: InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo,
@@ -426,11 +446,11 @@ class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
: I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
opc, asm, "", pattern> {
bits<4> Rt;
- bits<4> Rn;
+ bits<4> addr;
let Inst{27-23} = 0b00011;
let Inst{22-21} = opcod;
let Inst{20} = 1;
- let Inst{19-16} = Rn;
+ let Inst{19-16} = addr;
let Inst{15-12} = Rt;
let Inst{11-0} = 0b111110011111;
}
@@ -450,14 +470,14 @@ class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
let Inst{3-0} = Rt;
}
class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
- : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, [$Rn]", pattern> {
+ : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
bits<4> Rt;
bits<4> Rt2;
- bits<4> Rn;
+ bits<4> addr;
let Inst{27-23} = 0b00010;
let Inst{22} = b;
let Inst{21-20} = 0b00;
- let Inst{19-16} = Rn;
+ let Inst{19-16} = addr;
let Inst{15-12} = Rt;
let Inst{11-4} = 0b00001001;
let Inst{3-0} = Rt2;
@@ -515,22 +535,41 @@ class AI2ldstidx<bit isLd, bit isByte, bit isPre, dag oops, dag iops,
let Inst{20} = isLd; // L bit
let Inst{15-12} = Rt;
}
-class AI2stridx<bit isByte, bit isPre, dag oops, dag iops,
+class AI2stridx_reg<bit isByte, bit isPre, dag oops, dag iops,
+ IndexMode im, Format f, InstrItinClass itin, string opc,
+ string asm, string cstr, list<dag> pattern>
+ : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
+ pattern> {
+ // AM2 store w/ two operands: (GPR, am2offset)
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> Rn;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
+ let Inst{19-16} = Rn;
+ let Inst{11-5} = offset{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = offset{3-0};
+}
+
+class AI2stridx_imm<bit isByte, bit isPre, dag oops, dag iops,
IndexMode im, Format f, InstrItinClass itin, string opc,
string asm, string cstr, list<dag> pattern>
: AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
pattern> {
// AM2 store w/ two operands: (GPR, am2offset)
- // {13} 1 == Rm, 0 == imm12
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
bits<4> Rn;
- let Inst{25} = offset{13};
+ let Inst{25} = 0;
let Inst{23} = offset{12};
let Inst{19-16} = Rn;
let Inst{11-0} = offset{11-0};
}
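Both classes consume a 14-bit offset operand whose layout the comments spell out: bit {12} is the add/subtract flag and {11-0} holds either imm12 or Rm plus shift. A packing sketch for the immediate form (illustrative, not the in-tree encoder):

unsigned encodeAM2OffsetImm(int ByteOffset) {
  bool IsAdd = ByteOffset >= 0;
  unsigned Imm12 = unsigned(IsAdd ? ByteOffset : -ByteOffset);
  return (unsigned(IsAdd) << 12) | (Imm12 & 0xfff);
}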
+
+
// FIXME: Merge with the above class when addrmode2 gets used for STR, STRB
// but for now use this class for STRT and STRBT.
class AI2stridxT<bit isByte, bit isPre, dag oops, dag iops,
@@ -568,9 +607,11 @@ class AI3ld<bits<4> op, bit op20, dag oops, dag iops, Format f,
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{7-4} = op;
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
+
+ let DecoderMethod = "DecodeAddrMode3Instruction";
}
-class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
+class AI3ldstidx<bits<4> op, bit op20, bit isPre, dag oops, dag iops,
IndexMode im, Format f, InstrItinClass itin, string opc,
string asm, string cstr, list<dag> pattern>
: I<oops, iops, AddrMode3, 4, im, f, itin,
@@ -586,48 +627,24 @@ class AI3ldstidx<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
// FIXME: Merge with the above class when addrmode2 gets used for LDR, LDRB
// but for now use this class for LDRSBT, LDRHT, LDSHT.
-class AI3ldstidxT<bits<4> op, bit op20, bit isLd, bit isPre, dag oops, dag iops,
+class AI3ldstidxT<bits<4> op, bit isLoad, dag oops, dag iops,
IndexMode im, Format f, InstrItinClass itin, string opc,
string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, 4, im, f, itin,
- opc, asm, cstr, pattern> {
+ : I<oops, iops, AddrMode3, 4, im, f, itin, opc, asm, cstr, pattern> {
// {13} 1 == imm8, 0 == Rm
// {12-9} Rn
// {8} isAdd
// {7-4} imm7_4/zero
// {3-0} imm3_0/Rm
- bits<14> addr;
- bits<4> Rt;
- let Inst{27-25} = 0b000;
- let Inst{24} = isPre; // P bit
- let Inst{23} = addr{8}; // U bit
- let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
- let Inst{20} = op20; // L bit
- let Inst{19-16} = addr{12-9}; // Rn
- let Inst{15-12} = Rt; // Rt
- let Inst{11-8} = addr{7-4}; // imm7_4/zero
- let Inst{7-4} = op;
- let Inst{3-0} = addr{3-0}; // imm3_0/Rm
- let AsmMatchConverter = "CvtLdWriteBackRegAddrMode3";
-}
-
-class AI3stridx<bits<4> op, bit isByte, bit isPre, dag oops, dag iops,
- IndexMode im, Format f, InstrItinClass itin, string opc,
- string asm, string cstr, list<dag> pattern>
- : AI2ldstidx<0, isByte, isPre, oops, iops, im, f, itin, opc, asm, cstr,
- pattern> {
- // AM3 store w/ two operands: (GPR, am3offset)
- bits<14> offset;
+ bits<4> addr;
bits<4> Rt;
- bits<4> Rn;
let Inst{27-25} = 0b000;
- let Inst{23} = offset{8};
- let Inst{22} = offset{9};
- let Inst{19-16} = Rn;
+ let Inst{24} = 0; // P bit
+ let Inst{21} = 1;
+ let Inst{20} = isLoad; // L bit
+ let Inst{19-16} = addr; // Rn
let Inst{15-12} = Rt; // Rt
- let Inst{11-8} = offset{7-4}; // imm7_4/zero
let Inst{7-4} = op;
- let Inst{3-0} = offset{3-0}; // imm3_0/Rm
}
// stores
@@ -648,75 +665,7 @@ class AI3str<bits<4> op, dag oops, dag iops, Format f, InstrItinClass itin,
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{7-4} = op;
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
-}
-
-// Pre-indexed stores
-class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin,
- string opc, string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, 4, IndexModePre, f, itin,
- opc, asm, cstr, pattern> {
- let Inst{4} = 1;
- let Inst{5} = 1; // H bit
- let Inst{6} = 0; // S bit
- let Inst{7} = 1;
- let Inst{20} = 0; // L bit
- let Inst{21} = 1; // W bit
- let Inst{24} = 1; // P bit
- let Inst{27-25} = 0b000;
-}
-class AI3stdpr<dag oops, dag iops, Format f, InstrItinClass itin,
- string opc, string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, 4, IndexModePre, f, itin,
- opc, asm, cstr, pattern> {
- let Inst{4} = 1;
- let Inst{5} = 1; // H bit
- let Inst{6} = 1; // S bit
- let Inst{7} = 1;
- let Inst{20} = 0; // L bit
- let Inst{21} = 1; // W bit
- let Inst{24} = 1; // P bit
- let Inst{27-25} = 0b000;
-}
-
-// Post-indexed stores
-class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
- string opc, string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, 4, IndexModePost, f, itin,
- opc, asm, cstr,pattern> {
- // {13} 1 == imm8, 0 == Rm
- // {12-9} Rn
- // {8} isAdd
- // {7-4} imm7_4/zero
- // {3-0} imm3_0/Rm
- bits<14> addr;
- bits<4> Rt;
- let Inst{3-0} = addr{3-0}; // imm3_0/Rm
- let Inst{4} = 1;
- let Inst{5} = 1; // H bit
- let Inst{6} = 0; // S bit
- let Inst{7} = 1;
- let Inst{11-8} = addr{7-4}; // imm7_4/zero
- let Inst{15-12} = Rt; // Rt
- let Inst{19-16} = addr{12-9}; // Rn
- let Inst{20} = 0; // L bit
- let Inst{21} = 0; // W bit
- let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
- let Inst{23} = addr{8}; // U bit
- let Inst{24} = 0; // P bit
- let Inst{27-25} = 0b000;
-}
-class AI3stdpo<dag oops, dag iops, Format f, InstrItinClass itin,
- string opc, string asm, string cstr, list<dag> pattern>
- : I<oops, iops, AddrMode3, 4, IndexModePost, f, itin,
- opc, asm, cstr, pattern> {
- let Inst{4} = 1;
- let Inst{5} = 1; // H bit
- let Inst{6} = 1; // S bit
- let Inst{7} = 1;
- let Inst{20} = 0; // L bit
- let Inst{21} = 0; // W bit
- let Inst{24} = 0; // P bit
- let Inst{27-25} = 0b000;
+ let DecoderMethod = "DecodeAddrMode3Instruction";
}
// addrmode4 instructions
@@ -843,6 +792,23 @@ class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
}
// PKH instructions
+def PKHLSLAsmOperand : AsmOperandClass {
+ let Name = "PKHLSLImm";
+ let ParserMethod = "parsePKHLSLImm";
+}
+def pkh_lsl_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 32; }]>{
+ let PrintMethod = "printPKHLSLShiftImm";
+ let ParserMatchClass = PKHLSLAsmOperand;
+}
+def PKHASRAsmOperand : AsmOperandClass {
+ let Name = "PKHASRImm";
+ let ParserMethod = "parsePKHASRImm";
+}
+def pkh_asr_amt: Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]>{
+ let PrintMethod = "printPKHASRShiftImm";
+ let ParserMatchClass = PKHASRAsmOperand;
+}
+
class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
@@ -850,11 +816,11 @@ class APKHI<bits<8> opcod, bit tb, dag oops, dag iops, InstrItinClass itin,
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
- bits<8> sh;
+ bits<5> sh;
let Inst{27-20} = opcod;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
- let Inst{11-7} = sh{7-3};
+ let Inst{11-7} = sh;
let Inst{6} = tb;
let Inst{5-4} = 0b01;
let Inst{3-0} = Rm;
@@ -949,7 +915,9 @@ class Thumb1sI<dag oops, dag iops, AddrMode am, int sz,
let InOperandList = !con(iops, (ins pred:$p));
let AsmString = !strconcat(opc, "${s}${p}", asm);
let Pattern = pattern;
+ let thumbArithFlagSetting = 1;
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ let DecoderNamespace = "ThumbSBit";
}
class T1sI<dag oops, dag iops, InstrItinClass itin,
@@ -1071,6 +1039,7 @@ class Thumb2I<dag oops, dag iops, AddrMode am, int sz,
let AsmString = !strconcat(opc, "${p}", asm);
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb2];
+ let DecoderNamespace = "Thumb2";
}
// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as an
@@ -1091,6 +1060,7 @@ class Thumb2sI<dag oops, dag iops, AddrMode am, int sz,
let AsmString = !strconcat(opc, "${s}${p}", asm);
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb2];
+ let DecoderNamespace = "Thumb2";
}
// Special cases
@@ -1103,6 +1073,7 @@ class Thumb2XI<dag oops, dag iops, AddrMode am, int sz,
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb2];
+ let DecoderNamespace = "Thumb2";
}
class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
@@ -1114,6 +1085,7 @@ class ThumbXI<dag oops, dag iops, AddrMode am, int sz,
let AsmString = asm;
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb, IsThumb1Only];
+ let DecoderNamespace = "Thumb";
}
class T2I<dag oops, dag iops, InstrItinClass itin,
@@ -1132,8 +1104,8 @@ class T2Ipc<dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: Thumb2I<oops, iops, AddrModeT2_pc, 4, itin, opc, asm, "", pattern>;
class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
- : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, "",
+ string opc, string asm, string cstr, list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
pattern> {
bits<4> Rt;
bits<4> Rt2;
@@ -1149,6 +1121,26 @@ class T2Ii8s4<bit P, bit W, bit isLoad, dag oops, dag iops, InstrItinClass itin,
let Inst{11-8} = Rt2{3-0};
let Inst{7-0} = addr{7-0};
}
+class T2Ii8s4post<bit P, bit W, bit isLoad, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, string cstr,
+ list<dag> pattern>
+ : Thumb2I<oops, iops, AddrModeT2_i8s4, 4, itin, opc, asm, cstr,
+ pattern> {
+ bits<4> Rt;
+ bits<4> Rt2;
+ bits<4> addr;
+ bits<9> imm;
+ let Inst{31-25} = 0b1110100;
+ let Inst{24} = P;
+ let Inst{23} = imm{8};
+ let Inst{22} = 1;
+ let Inst{21} = W;
+ let Inst{20} = isLoad;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = Rt{3-0};
+ let Inst{11-8} = Rt2{3-0};
+ let Inst{7-0} = imm{7-0};
+}
class T2sI<dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
@@ -1172,8 +1164,8 @@ class T2XIt<dag oops, dag iops, InstrItinClass itin,
string asm, string cstr, list<dag> pattern>
: Thumb2XI<oops, iops, AddrModeNone, 4, itin, asm, cstr, pattern>;
-// T2Iidxldst - Thumb2 indexed load / store instructions.
-class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre,
+// T2Ipreldst - Thumb2 pre-indexed load / store instructions.
+class T2Ipreldst<bit signed, bits<2> opcod, bit load, bit pre,
dag oops, dag iops,
AddrMode am, IndexMode im, InstrItinClass itin,
string opc, string asm, string cstr, list<dag> pattern>
@@ -1183,25 +1175,60 @@ class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre,
let AsmString = !strconcat(opc, "${p}", asm);
let Pattern = pattern;
list<Predicate> Predicates = [IsThumb2];
+ let DecoderNamespace = "Thumb2";
+
+ bits<4> Rt;
+ bits<13> addr;
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = signed;
let Inst{23} = 0;
let Inst{22-21} = opcod;
let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = Rt{3-0};
let Inst{11} = 1;
// (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
let Inst{10} = pre; // The P bit.
+ let Inst{9} = addr{8}; // Sign bit
let Inst{8} = 1; // The W bit.
+ let Inst{7-0} = addr{7-0};
- bits<9> addr;
- let Inst{7-0} = addr{7-0};
- let Inst{9} = addr{8}; // Sign bit
+ let DecoderMethod = "DecodeT2LdStPre";
+}
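The 13-bit addr operand introduced above packs {12-9} Rn, {8} the add/subtract bit, and {7-0} imm8. A decoding sketch under that layout (names are illustrative):

struct T2PreIdxAddr { unsigned Rn; bool IsAdd; unsigned Imm8; };

T2PreIdxAddr decodeT2PreIdxAddr(unsigned Addr) {
  return { (Addr >> 9) & 0xf, ((Addr >> 8) & 1) != 0, Addr & 0xff };
}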
+
+// T2Ipostldst - Thumb2 post-indexed load / store instructions.
+class T2Ipostldst<bit signed, bits<2> opcod, bit load, bit pre,
+ dag oops, dag iops,
+ AddrMode am, IndexMode im, InstrItinClass itin,
+ string opc, string asm, string cstr, list<dag> pattern>
+ : InstARM<am, 4, im, ThumbFrm, GenericDomain, cstr, itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p));
+ let AsmString = !strconcat(opc, "${p}", asm);
+ let Pattern = pattern;
+ list<Predicate> Predicates = [IsThumb2];
+ let DecoderNamespace = "Thumb2";
bits<4> Rt;
bits<4> Rn;
+ bits<9> offset;
+ let Inst{31-27} = 0b11111;
+ let Inst{26-25} = 0b00;
+ let Inst{24} = signed;
+ let Inst{23} = 0;
+ let Inst{22-21} = opcod;
+ let Inst{20} = load;
+ let Inst{19-16} = Rn;
let Inst{15-12} = Rt{3-0};
- let Inst{19-16} = Rn{3-0};
+ let Inst{11} = 1;
+ // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+ let Inst{10} = pre; // The P bit.
+ let Inst{9} = offset{8}; // Sign bit
+ let Inst{8} = 1; // The W bit.
+ let Inst{7-0} = offset{7-0};
+
+ let DecoderMethod = "DecodeT2LdStPre";
}
// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode.
@@ -1242,6 +1269,7 @@ class VFPI<dag oops, dag iops, AddrMode am, int sz,
let AsmString = !strconcat(opc, "${p}", asm);
let Pattern = pattern;
let PostEncoderMethod = "VFPThumb2PostEncoder";
+ let DecoderNamespace = "VFP";
list<Predicate> Predicates = [HasVFP2];
}
@@ -1257,6 +1285,7 @@ class VFPXI<dag oops, dag iops, AddrMode am, int sz,
let AsmString = asm;
let Pattern = pattern;
let PostEncoderMethod = "VFPThumb2PostEncoder";
+ let DecoderNamespace = "VFP";
list<Predicate> Predicates = [HasVFP2];
}
@@ -1574,6 +1603,7 @@ class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
let AsmString = !strconcat(opc, "${p}", ".", dt, "\t", asm);
let Pattern = pattern;
list<Predicate> Predicates = [HasNEON];
+ let DecoderNamespace = "NEON";
}
// Same as NeonI except it does not have a "data type" specifier.
@@ -1586,6 +1616,7 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
let AsmString = !strconcat(opc, "${p}", "\t", asm);
let Pattern = pattern;
list<Predicate> Predicates = [HasNEON];
+ let DecoderNamespace = "NEON";
}
class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
@@ -1600,6 +1631,7 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
let Inst{7-4} = op7_4;
let PostEncoderMethod = "NEONThumb2LoadStorePostEncoder";
+ let DecoderNamespace = "NEONLoadStore";
bits<5> Vd;
bits<6> Rn;
@@ -1643,6 +1675,7 @@ class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
pattern> {
let Inst{31-25} = 0b1111001;
let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+ let DecoderNamespace = "NEONData";
}
class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
@@ -1651,6 +1684,7 @@ class NDataXI<dag oops, dag iops, Format f, InstrItinClass itin,
cstr, pattern> {
let Inst{31-25} = 0b1111001;
let PostEncoderMethod = "NEONThumb2DataIPostEncoder";
+ let DecoderNamespace = "NEONData";
}
// NEON "one register and a modified immediate" format.
@@ -1677,6 +1711,7 @@ class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
let Inst{24} = SIMM{7};
let Inst{18-16} = SIMM{6-4};
let Inst{3-0} = SIMM{3-0};
+ let DecoderMethod = "DecodeNEONModImmInstruction";
}
// NEON 2 vector register format.
@@ -1874,6 +1909,7 @@ class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
list<Predicate> Predicates = [HasNEON];
let PostEncoderMethod = "NEONThumb2DupPostEncoder";
+ let DecoderNamespace = "NEONDup";
bits<5> V;
bits<4> R;
@@ -1915,7 +1951,6 @@ class NVDupLane<bits<4> op19_16, bit op6, dag oops, dag iops,
bits<5> Vd;
bits<5> Vm;
- bits<4> lane;
let Inst{22} = Vd{4};
let Inst{15-12} = Vd{3-0};
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index adcbf18..48da03f 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -13,8 +13,8 @@
#include "ARMInstrInfo.h"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMMachineFunctionInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -30,14 +30,18 @@ ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
switch (Opc) {
default: break;
- case ARM::LDR_PRE:
- case ARM::LDR_POST:
+ case ARM::LDR_PRE_IMM:
+ case ARM::LDR_PRE_REG:
+ case ARM::LDR_POST_IMM:
+ case ARM::LDR_POST_REG:
return ARM::LDRi12;
case ARM::LDRH_PRE:
case ARM::LDRH_POST:
return ARM::LDRH;
- case ARM::LDRB_PRE:
- case ARM::LDRB_POST:
+ case ARM::LDRB_PRE_IMM:
+ case ARM::LDRB_PRE_REG:
+ case ARM::LDRB_POST_IMM:
+ case ARM::LDRB_POST_REG:
return ARM::LDRBi12;
case ARM::LDRSH_PRE:
case ARM::LDRSH_POST:
@@ -45,14 +49,18 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
case ARM::LDRSB_PRE:
case ARM::LDRSB_POST:
return ARM::LDRSB;
- case ARM::STR_PRE:
- case ARM::STR_POST:
+ case ARM::STR_PRE_IMM:
+ case ARM::STR_PRE_REG:
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
return ARM::STRi12;
case ARM::STRH_PRE:
case ARM::STRH_POST:
return ARM::STRH;
- case ARM::STRB_PRE:
- case ARM::STRB_POST:
+ case ARM::STRB_PRE_IMM:
+ case ARM::STRB_PRE_REG:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG:
return ARM::STRBi12;
}
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index a42dd1a..2cf0f09 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -70,6 +70,18 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, CPSR = op LHS, RHS, CPSR
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+ [SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>,
+ SDTCisInt<0>,
+ SDTCisVT<1, i32>,
+ SDTCisVT<4, i32>]>;
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>;
@@ -120,6 +132,12 @@ def ARMsrl_flag : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMsra_flag : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutGlue]>;
def ARMrrx : SDNode<"ARMISD::RRX" , SDTIntUnaryOp, [SDNPInGlue ]>;
+def ARMaddc : SDNode<"ARMISD::ADDC", SDTBinaryArithWithFlags,
+ [SDNPCommutative]>;
+def ARMsubc : SDNode<"ARMISD::SUBC", SDTBinaryArithWithFlags>;
+def ARMadde : SDNode<"ARMISD::ADDE", SDTBinaryArithWithFlagsInOut>;
+def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>;
+
def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>;
@@ -187,10 +205,16 @@ def IsThumb : Predicate<"Subtarget->isThumb()">,
def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
AssemblerPredicate<"ModeThumb,FeatureThumb2">;
+def IsMClass : Predicate<"Subtarget->isMClass()">,
+ AssemblerPredicate<"FeatureMClass">;
+def IsARClass : Predicate<"!Subtarget->isMClass()">,
+ AssemblerPredicate<"!FeatureMClass">;
def IsARM : Predicate<"!Subtarget->isThumb()">,
AssemblerPredicate<"!ModeThumb">;
def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">,
+ AssemblerPredicate<"ModeNaCl">;
// FIXME: Eventually this will be just "hasV6T2Ops".
def UseMovt : Predicate<"Subtarget->useMovt()">;
@@ -263,24 +287,11 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm0_65535AsmOperand;
}
+class BinOpWithFlagFrag<dag res> :
+ PatFrag<(ops node:$LHS, node:$RHS, node:$FLAG), res>;
class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
-/// adde and sube predicates - True based on whether the carry flag output
-/// will be needed or not.
-def adde_dead_carry :
- PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS),
- [{return !N->hasAnyUseOfValue(1);}]>;
-def sube_dead_carry :
- PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS),
- [{return !N->hasAnyUseOfValue(1);}]>;
-def adde_live_carry :
- PatFrag<(ops node:$LHS, node:$RHS), (adde node:$LHS, node:$RHS),
- [{return N->hasAnyUseOfValue(1);}]>;
-def sube_live_carry :
- PatFrag<(ops node:$LHS, node:$RHS), (sube node:$LHS, node:$RHS),
- [{return N->hasAnyUseOfValue(1);}]>;
-
// An 'and' node with a single use.
def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
return N->hasOneUse();
@@ -315,6 +326,7 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
def brtarget : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let OperandType = "OPERAND_PCREL";
+ let DecoderMethod = "DecodeT2BROperand";
}
// FIXME: get rid of this one?
@@ -345,39 +357,35 @@ def bl_target : Operand<i32> {
let OperandType = "OPERAND_PCREL";
}
-
-// A list of registers separated by comma. Used by load/store multiple.
-def RegListAsmOperand : AsmOperandClass {
- let Name = "RegList";
- let SuperClasses = [];
-}
-
-def DPRRegListAsmOperand : AsmOperandClass {
- let Name = "DPRRegList";
- let SuperClasses = [];
-}
-
-def SPRRegListAsmOperand : AsmOperandClass {
- let Name = "SPRRegList";
- let SuperClasses = [];
+def blx_target : Operand<i32> {
+ // Encoded the same as branch targets.
+ let EncoderMethod = "getARMBLXTargetOpValue";
+ let OperandType = "OPERAND_PCREL";
}
+// A list of registers separated by comma. Used by load/store multiple.
+def RegListAsmOperand : AsmOperandClass { let Name = "RegList"; }
def reglist : Operand<i32> {
let EncoderMethod = "getRegisterListOpValue";
let ParserMatchClass = RegListAsmOperand;
let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeRegListOperand";
}
+def DPRRegListAsmOperand : AsmOperandClass { let Name = "DPRRegList"; }
def dpr_reglist : Operand<i32> {
let EncoderMethod = "getRegisterListOpValue";
let ParserMatchClass = DPRRegListAsmOperand;
let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeDPRRegListOperand";
}
+def SPRRegListAsmOperand : AsmOperandClass { let Name = "SPRRegList"; }
def spr_reglist : Operand<i32> {
let EncoderMethod = "getRegisterListOpValue";
let ParserMatchClass = SPRRegListAsmOperand;
let PrintMethod = "printRegisterList";
+ let DecoderMethod = "DecodeSPRRegListOperand";
}
// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
@@ -397,56 +405,99 @@ def adrlabel : Operand<i32> {
def neon_vcvt_imm32 : Operand<i32> {
let EncoderMethod = "getNEONVcvtImm32OpValue";
+ let DecoderMethod = "DecodeVCVTImmOperand";
}
// rot_imm: An integer that encodes a rotate amount. Must be 8, 16, or 24.
-def rot_imm : Operand<i32>, ImmLeaf<i32, [{
- int32_t v = (int32_t)Imm;
- return v == 8 || v == 16 || v == 24; }]> {
- let EncoderMethod = "getRotImmOpValue";
+def rot_imm_XFORM: SDNodeXForm<imm, [{
+ switch (N->getZExtValue()){
+ default: assert(0);
+ case 0: return CurDAG->getTargetConstant(0, MVT::i32);
+ case 8: return CurDAG->getTargetConstant(1, MVT::i32);
+ case 16: return CurDAG->getTargetConstant(2, MVT::i32);
+ case 24: return CurDAG->getTargetConstant(3, MVT::i32);
+ }
+}]>;
+def RotImmAsmOperand : AsmOperandClass {
+ let Name = "RotImm";
+ let ParserMethod = "parseRotImm";
}
-
-def ShifterAsmOperand : AsmOperandClass {
- let Name = "Shifter";
- let SuperClasses = [];
+def rot_imm : Operand<i32>, PatLeaf<(i32 imm), [{
+ int32_t v = N->getZExtValue();
+ return v == 8 || v == 16 || v == 24; }],
+ rot_imm_XFORM> {
+ let PrintMethod = "printRotImmOperand";
+ let ParserMatchClass = RotImmAsmOperand;
}
// shift_imm: An integer that encodes a shift amount and the type of shift
-// (currently either asr or lsl) using the same encoding used for the
-// immediates in so_reg operands.
+// (asr or lsl). The 6-bit immediate encodes as:
+// {5} 0 ==> lsl
+// 1 ==> asr
+// {4-0} imm5 shift amount.
+// asr #32 is encoded as imm5 == 0.
+def ShifterImmAsmOperand : AsmOperandClass {
+ let Name = "ShifterImm";
+ let ParserMethod = "parseShifterImm";
+}
def shift_imm : Operand<i32> {
let PrintMethod = "printShiftImmOperand";
- let ParserMatchClass = ShifterAsmOperand;
+ let ParserMatchClass = ShifterImmAsmOperand;
}
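A sketch of the 6-bit packing the comment above describes (illustrative only; the actual encoding is done by the TableGen-driven encoder):

unsigned encodeShiftImm(bool IsASR, unsigned Amt) {
  if (IsASR && Amt == 32)
    Amt = 0;                        // asr #32 is encoded as imm5 == 0
  return (unsigned(IsASR) << 5) | (Amt & 0x1f);
}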
-def ShiftedRegAsmOperand : AsmOperandClass {
- let Name = "ShiftedReg";
+// shifter_operand operands: so_reg_reg, so_reg_imm, and so_imm.
+def ShiftedRegAsmOperand : AsmOperandClass { let Name = "RegShiftedReg"; }
+def so_reg_reg : Operand<i32>, // reg reg imm
+ ComplexPattern<i32, 3, "SelectRegShifterOperand",
+ [shl, srl, sra, rotr]> {
+ let EncoderMethod = "getSORegRegOpValue";
+ let PrintMethod = "printSORegRegOperand";
+ let DecoderMethod = "DecodeSORegRegOperand";
+ let ParserMatchClass = ShiftedRegAsmOperand;
+ let MIOperandInfo = (ops GPRnopc, GPRnopc, i32imm);
}
-// shifter_operand operands: so_reg and so_imm.
-def so_reg : Operand<i32>, // reg reg imm
- ComplexPattern<i32, 3, "SelectShifterOperandReg",
- [shl,srl,sra,rotr]> {
- let EncoderMethod = "getSORegOpValue";
- let PrintMethod = "printSORegOperand";
- let ParserMatchClass = ShiftedRegAsmOperand;
- let MIOperandInfo = (ops GPR, GPR, shift_imm);
+def ShiftedImmAsmOperand : AsmOperandClass { let Name = "RegShiftedImm"; }
+def so_reg_imm : Operand<i32>, // reg imm
+ ComplexPattern<i32, 2, "SelectImmShifterOperand",
+ [shl, srl, sra, rotr]> {
+ let EncoderMethod = "getSORegImmOpValue";
+ let PrintMethod = "printSORegImmOperand";
+ let DecoderMethod = "DecodeSORegImmOperand";
+ let ParserMatchClass = ShiftedImmAsmOperand;
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: Does this need to be distinct from so_reg?
+def shift_so_reg_reg : Operand<i32>, // reg reg imm
+ ComplexPattern<i32, 3, "SelectShiftRegShifterOperand",
+ [shl,srl,sra,rotr]> {
+ let EncoderMethod = "getSORegRegOpValue";
+ let PrintMethod = "printSORegRegOperand";
+ let DecoderMethod = "DecodeSORegRegOperand";
+ let MIOperandInfo = (ops GPR, GPR, i32imm);
}
+
// FIXME: Does this need to be distinct from so_reg?
-def shift_so_reg : Operand<i32>, // reg reg imm
- ComplexPattern<i32, 3, "SelectShiftShifterOperandReg",
+def shift_so_reg_imm : Operand<i32>, // reg reg imm
+ ComplexPattern<i32, 2, "SelectShiftImmShifterOperand",
[shl,srl,sra,rotr]> {
- let EncoderMethod = "getSORegOpValue";
- let PrintMethod = "printSORegOperand";
- let MIOperandInfo = (ops GPR, GPR, shift_imm);
+ let EncoderMethod = "getSORegImmOpValue";
+ let PrintMethod = "printSORegImmOperand";
+ let DecoderMethod = "DecodeSORegImmOperand";
+ let MIOperandInfo = (ops GPR, i32imm);
}
+
// so_imm - Match a 32-bit shifter_operand immediate operand, which is an
// 8-bit immediate rotated by an arbitrary number of bits.
+def SOImmAsmOperand: AsmOperandClass { let Name = "ARMSOImm"; }
def so_imm : Operand<i32>, ImmLeaf<i32, [{
return ARM_AM::getSOImmVal(Imm) != -1;
}]> {
let EncoderMethod = "getSOImmOpValue";
+ let ParserMatchClass = SOImmAsmOperand;
+ let DecoderMethod = "DecodeSOImmOperand";
}
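The predicate above leans on ARM_AM::getSOImmVal; the underlying criterion is that the value be an 8-bit immediate rotated right by an even amount. A brute-force restatement of that test (a sketch, not the in-tree implementation):

#include <cstdint>

bool isSOImmValue(uint32_t V) {
  for (unsigned Rot = 0; Rot < 32; Rot += 2) {
    // Rotate left by Rot to undo a rotate-right-by-Rot encoding.
    uint32_t Unrotated = (V << Rot) | (V >> ((32 - Rot) & 31));
    if (Unrotated <= 0xff)
      return true;
  }
  return false;
}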
// Break so_imm's up into two pieces. This handles immediates with up to 16
@@ -464,7 +515,7 @@ def arm_i32imm : PatLeaf<(imm), [{
return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
}]>;
-/// imm0_7 predicate - Immediate in the range [0,31].
+/// imm0_7 predicate - Immediate in the range [0,7].
def Imm0_7AsmOperand: AsmOperandClass { let Name = "Imm0_7"; }
def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 8;
@@ -472,7 +523,7 @@ def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm0_7AsmOperand;
}
-/// imm0_15 predicate - Immediate in the range [0,31].
+/// imm0_15 predicate - Immediate in the range [0,15].
def Imm0_15AsmOperand: AsmOperandClass { let Name = "Imm0_15"; }
def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 16;
@@ -481,68 +532,83 @@ def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
}
/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def Imm0_31AsmOperand: AsmOperandClass { let Name = "Imm0_31"; }
def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
return Imm >= 0 && Imm < 32;
-}]>;
-
-/// imm0_31_m1 - Matches and prints like imm0_31, but encodes as 'value - 1'.
-def imm0_31_m1 : Operand<i32>, ImmLeaf<i32, [{
- return Imm >= 0 && Imm < 32;
}]> {
- let EncoderMethod = "getImmMinusOneOpValue";
+ let ParserMatchClass = Imm0_31AsmOperand;
+}
+
+/// imm0_255 predicate - Immediate in the range [0,255].
+def Imm0_255AsmOperand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
+ let ParserMatchClass = Imm0_255AsmOperand;
}
-// i32imm_hilo16 - For movt/movw - sets the MC Encoder method.
-// The imm is split into imm{15-12}, imm{11-0}
+// imm0_65535_expr - For movt/movw - 16-bit immediate that can also reference
+// a relocatable expression.
//
-def i32imm_hilo16 : Operand<i32> {
+// FIXME: This really needs a Thumb version separate from the ARM version.
+// While the range is the same, and can thus use the same match class,
+// the encoding is different so it should have a different encoder method.
+def Imm0_65535ExprAsmOperand: AsmOperandClass { let Name = "Imm0_65535Expr"; }
+def imm0_65535_expr : Operand<i32> {
let EncoderMethod = "getHiLo16ImmOpValue";
+ let ParserMatchClass = Imm0_65535ExprAsmOperand;
}
+/// imm24b - True if the 32-bit immediate is encodable in 24 bits.
+def Imm24bitAsmOperand: AsmOperandClass { let Name = "Imm24bit"; }
+def imm24b : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm <= 0xffffff;
+}]> {
+ let ParserMatchClass = Imm24bitAsmOperand;
+}
+
+
/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width bitfield
/// e.g., 0xf000ffff
+def BitfieldAsmOperand : AsmOperandClass {
+ let Name = "Bitfield";
+ let ParserMethod = "parseBitfield";
+}
def bf_inv_mask_imm : Operand<i32>,
PatLeaf<(imm), [{
return ARM::isBitFieldInvertedMask(N->getZExtValue());
}] > {
let EncoderMethod = "getBitfieldInvertedMaskOpValue";
let PrintMethod = "printBitfieldInvMaskImmOperand";
+ let DecoderMethod = "DecodeBitfieldMaskOperand";
+ let ParserMatchClass = BitfieldAsmOperand;
}
-/// lsb_pos_imm - position of the lsb bit, used by BFI4p and t2BFI4p
-def lsb_pos_imm : Operand<i32>, ImmLeaf<i32, [{
- return isInt<5>(Imm);
+def imm1_32_XFORM: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
}]>;
-
-/// width_imm - number of bits to be copied, used by BFI4p and t2BFI4p
-def width_imm : Operand<i32>, ImmLeaf<i32, [{
- return Imm > 0 && Imm <= 32;
-}] > {
- let EncoderMethod = "getMsbOpValue";
-}
-
-def ssat_imm : Operand<i32>, ImmLeaf<i32, [{
- return Imm > 0 && Imm <= 32;
-}]> {
- let EncoderMethod = "getSsatBitPosValue";
+def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
+ uint64_t Imm = N->getZExtValue();
+ return Imm > 0 && Imm <= 32;
+ }],
+ imm1_32_XFORM> {
+ let PrintMethod = "printImmPlusOneOperand";
+ let ParserMatchClass = Imm1_32AsmOperand;
+}
+
+def imm1_16_XFORM: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, MVT::i32);
+}]>;
+def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
+ imm1_16_XFORM> {
+ let PrintMethod = "printImmPlusOneOperand";
+ let ParserMatchClass = Imm1_16AsmOperand;
}
// Define ARM specific addressing modes.
-
-def MemMode2AsmOperand : AsmOperandClass {
- let Name = "MemMode2";
- let SuperClasses = [];
- let ParserMethod = "tryParseMemMode2Operand";
-}
-
-def MemMode3AsmOperand : AsmOperandClass {
- let Name = "MemMode3";
- let SuperClasses = [];
- let ParserMethod = "tryParseMemMode3Operand";
-}
-
// addrmode_imm12 := reg +/- imm12
//
+def MemImm12OffsetAsmOperand : AsmOperandClass { let Name = "MemImm12Offset"; }
def addrmode_imm12 : Operand<i32>,
ComplexPattern<i32, 2, "SelectAddrModeImm12", []> {
// 12-bit immediate operand. Note that instructions using this encode
@@ -551,53 +617,129 @@ def addrmode_imm12 : Operand<i32>,
let EncoderMethod = "getAddrModeImm12OpValue";
let PrintMethod = "printAddrModeImm12Operand";
+ let DecoderMethod = "DecodeAddrModeImm12Operand";
+ let ParserMatchClass = MemImm12OffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
}
// ldst_so_reg := reg +/- reg shop imm
//
+def MemRegOffsetAsmOperand : AsmOperandClass { let Name = "MemRegOffset"; }
def ldst_so_reg : Operand<i32>,
ComplexPattern<i32, 3, "SelectLdStSOReg", []> {
let EncoderMethod = "getLdStSORegOpValue";
// FIXME: Simplify the printer
let PrintMethod = "printAddrMode2Operand";
- let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+ let DecoderMethod = "DecodeSORegMemOperand";
+ let ParserMatchClass = MemRegOffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, GPRnopc:$offsreg, i32imm:$shift);
+}
+
+// postidx_imm8 := +/- [0,255]
+//
+// 9-bit value:
+// {8} 1 if imm8 is non-negative, 0 otherwise.
+// {7-0} [0,255] imm8 value.
+def PostIdxImm8AsmOperand : AsmOperandClass { let Name = "PostIdxImm8"; }
+def postidx_imm8 : Operand<i32> {
+ let PrintMethod = "printPostIdxImm8Operand";
+ let ParserMatchClass = PostIdxImm8AsmOperand;
+ let MIOperandInfo = (ops i32imm);
}
+// postidx_imm8s4 := +/- [0,1020]
+//
+// 9-bit value:
+// {8} 1 if imm8 is non-negative, 0 otherwise.
+// {7-0} [0,255] imm8 value, scaled by 4.
+def PostIdxImm8s4AsmOperand : AsmOperandClass { let Name = "PostIdxImm8s4"; }
+def postidx_imm8s4 : Operand<i32> {
+ let PrintMethod = "printPostIdxImm8s4Operand";
+ let ParserMatchClass = PostIdxImm8s4AsmOperand;
+ let MIOperandInfo = (ops i32imm);
+}
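Per the layout comments, both post-index immediates put the add/subtract flag in bit {8}; the s4 variant additionally scales a word-aligned byte offset down by 4. A packing sketch (illustrative only):

unsigned encodePostIdxImm8s4(int ByteOffset) {
  bool IsAdd = ByteOffset >= 0;
  unsigned Imm8 = unsigned(IsAdd ? ByteOffset : -ByteOffset) / 4;
  return (unsigned(IsAdd) << 8) | (Imm8 & 0xff);
}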
+
+
+// postidx_reg := +/- reg
+//
+def PostIdxRegAsmOperand : AsmOperandClass {
+ let Name = "PostIdxReg";
+ let ParserMethod = "parsePostIdxReg";
+}
+def postidx_reg : Operand<i32> {
+ let EncoderMethod = "getPostIdxRegOpValue";
+ let DecoderMethod = "DecodePostIdxReg";
+ let PrintMethod = "printPostIdxRegOperand";
+ let ParserMatchClass = PostIdxRegAsmOperand;
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+
// addrmode2 := reg +/- imm12
// := reg +/- reg shop imm
//
+// FIXME: addrmode2 should be refactored the rest of the way to always
+// use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg).
+def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; }
def addrmode2 : Operand<i32>,
ComplexPattern<i32, 3, "SelectAddrMode2", []> {
let EncoderMethod = "getAddrMode2OpValue";
let PrintMethod = "printAddrMode2Operand";
- let ParserMatchClass = MemMode2AsmOperand;
+ let ParserMatchClass = AddrMode2AsmOperand;
let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
}
-def am2offset : Operand<i32>,
- ComplexPattern<i32, 2, "SelectAddrMode2Offset",
+def PostIdxRegShiftedAsmOperand : AsmOperandClass {
+ let Name = "PostIdxRegShifted";
+ let ParserMethod = "parsePostIdxReg";
+}
+def am2offset_reg : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrMode2OffsetReg",
+ [], [SDNPWantRoot]> {
+ let EncoderMethod = "getAddrMode2OffsetOpValue";
+ let PrintMethod = "printAddrMode2OffsetOperand";
+ // When using this for assembly, it's always as a post-index offset.
+ let ParserMatchClass = PostIdxRegShiftedAsmOperand;
+ let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// FIXME: am2offset_imm should only need the immediate, not the GPR. Having
+// the GPR is purely vestigial at this point.
+def AM2OffsetImmAsmOperand : AsmOperandClass { let Name = "AM2OffsetImm"; }
+def am2offset_imm : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectAddrMode2OffsetImm",
[], [SDNPWantRoot]> {
let EncoderMethod = "getAddrMode2OffsetOpValue";
let PrintMethod = "printAddrMode2OffsetOperand";
+ let ParserMatchClass = AM2OffsetImmAsmOperand;
let MIOperandInfo = (ops GPR, i32imm);
}
+
// addrmode3 := reg +/- reg
// addrmode3 := reg +/- imm8
//
+// FIXME: split into imm vs. reg versions.
+def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; }
def addrmode3 : Operand<i32>,
ComplexPattern<i32, 3, "SelectAddrMode3", []> {
let EncoderMethod = "getAddrMode3OpValue";
let PrintMethod = "printAddrMode3Operand";
- let ParserMatchClass = MemMode3AsmOperand;
+ let ParserMatchClass = AddrMode3AsmOperand;
let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
}
+// FIXME: split into imm vs. reg versions.
+// FIXME: parser method to handle +/- register.
+def AM3OffsetAsmOperand : AsmOperandClass {
+ let Name = "AM3Offset";
+ let ParserMethod = "parseAM3Offset";
+}
def am3offset : Operand<i32>,
ComplexPattern<i32, 2, "SelectAddrMode3Offset",
[], [SDNPWantRoot]> {
let EncoderMethod = "getAddrMode3OffsetOpValue";
let PrintMethod = "printAddrMode3OffsetOperand";
+ let ParserMatchClass = AM3OffsetAsmOperand;
let MIOperandInfo = (ops GPR, i32imm);
}
@@ -608,28 +750,28 @@ def ldstm_mode : OptionalDefOperand<OtherVT, (ops i32), (ops (i32 1))> {
let PrintMethod = "printLdStmModeOperand";
}
-def MemMode5AsmOperand : AsmOperandClass {
- let Name = "MemMode5";
- let SuperClasses = [];
-}
-
// addrmode5 := reg +/- imm8*4
//
+def AddrMode5AsmOperand : AsmOperandClass { let Name = "AddrMode5"; }
def addrmode5 : Operand<i32>,
ComplexPattern<i32, 2, "SelectAddrMode5", []> {
let PrintMethod = "printAddrMode5Operand";
- let MIOperandInfo = (ops GPR:$base, i32imm);
- let ParserMatchClass = MemMode5AsmOperand;
let EncoderMethod = "getAddrMode5OpValue";
+ let DecoderMethod = "DecodeAddrMode5Operand";
+ let ParserMatchClass = AddrMode5AsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm);
}
// addrmode6 := reg with optional alignment
//
+def AddrMode6AsmOperand : AsmOperandClass { let Name = "AlignedMemory"; }
def addrmode6 : Operand<i32>,
ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
let PrintMethod = "printAddrMode6Operand";
- let MIOperandInfo = (ops GPR:$addr, i32imm);
+ let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
let EncoderMethod = "getAddrMode6AddressOpValue";
+ let DecoderMethod = "DecodeAddrMode6Operand";
+ let ParserMatchClass = AddrMode6AsmOperand;
}
def am6offset : Operand<i32>,
@@ -638,6 +780,7 @@ def am6offset : Operand<i32>,
let PrintMethod = "printAddrMode6OffsetOperand";
let MIOperandInfo = (ops GPR);
let EncoderMethod = "getAddrMode6OffsetOpValue";
+ let DecoderMethod = "DecodeGPRRegisterClass";
}
// Special version of addrmode6 to handle alignment encoding for VST1/VLD1
@@ -666,19 +809,15 @@ def addrmodepc : Operand<i32>,
let MIOperandInfo = (ops GPR, i32imm);
}
-def MemMode7AsmOperand : AsmOperandClass {
- let Name = "MemMode7";
- let SuperClasses = [];
-}
-
-// addrmode7 := reg
-// Used by load/store exclusive instructions. Useful to enable right assembly
-// parsing and printing. Not used for any codegen matching.
+// addr_offset_none := reg
//
-def addrmode7 : Operand<i32> {
+def MemNoOffsetAsmOperand : AsmOperandClass { let Name = "MemNoOffset"; }
+def addr_offset_none : Operand<i32>,
+ ComplexPattern<i32, 1, "SelectAddrOffsetNone", []> {
let PrintMethod = "printAddrMode7Operand";
- let MIOperandInfo = (ops GPR);
- let ParserMatchClass = MemMode7AsmOperand;
+ let DecoderMethod = "DecodeAddrMode7Operand";
+ let ParserMatchClass = MemNoOffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base);
}
def nohash_imm : Operand<i32> {
@@ -687,25 +826,30 @@ def nohash_imm : Operand<i32> {
def CoprocNumAsmOperand : AsmOperandClass {
let Name = "CoprocNum";
- let SuperClasses = [];
- let ParserMethod = "tryParseCoprocNumOperand";
-}
-
-def CoprocRegAsmOperand : AsmOperandClass {
- let Name = "CoprocReg";
- let SuperClasses = [];
- let ParserMethod = "tryParseCoprocRegOperand";
+ let ParserMethod = "parseCoprocNumOperand";
}
-
def p_imm : Operand<i32> {
let PrintMethod = "printPImmediate";
let ParserMatchClass = CoprocNumAsmOperand;
+ let DecoderMethod = "DecodeCoprocessor";
}
+def CoprocRegAsmOperand : AsmOperandClass {
+ let Name = "CoprocReg";
+ let ParserMethod = "parseCoprocRegOperand";
+}
def c_imm : Operand<i32> {
let PrintMethod = "printCImmediate";
let ParserMatchClass = CoprocRegAsmOperand;
}
+def CoprocOptionAsmOperand : AsmOperandClass {
+ let Name = "CoprocOption";
+ let ParserMethod = "parseCoprocOptionOperand";
+}
+def coproc_option_imm : Operand<i32> {
+ let PrintMethod = "printCoprocOptionImm";
+ let ParserMatchClass = CoprocOptionAsmOperand;
+}
//===----------------------------------------------------------------------===//
@@ -748,16 +892,37 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
let Inst{11-4} = 0b00000000;
let Inst{3-0} = Rm;
}
- def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm,
+
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_imm:$shift))]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+ }
+
+ def rsr : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
iis, opc, "\t$Rd, $Rn, $shift",
- [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> {
+ [(set GPR:$Rd, (opnode GPR:$Rn, so_reg_reg:$shift))]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
- let Inst{11-0} = shift;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
}
// Assembly aliases for optional destination operand when it's the same
@@ -773,56 +938,172 @@ multiclass AsI1_bin_irs<bits<4> opcod, string opc,
cc_out:$s)>,
Requires<[IsARM]>;
def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPR:$Rdn, GPR:$Rdn,
- so_reg:$shift, pred:$p,
+ (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_imm:$shift, pred:$p,
cc_out:$s)>,
Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_reg:$shift, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+
}
-/// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
-/// instruction modifies the CPSR register.
-let isCodeGenOnly = 1, Defs = [CPSR] in {
-multiclass AI1_bin_s_irs<bits<4> opcod, string opc,
+/// AsI1_rbin_irs - Same as AsI1_bin_irs except the order of the operands is
+/// reversed. The 'rr' form is only defined for the disassembler; for codegen
+/// it is equivalent to the AsI1_bin_irs counterpart.
+multiclass AsI1_rbin_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Commutable = 0> {
- def ri : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+ PatFrag opnode, string baseOpc, bit Commutable = 0> {
+ // The register-immediate version is re-materializable. This is useful
+ // in particular for taking the address of a local.
+ let isReMaterializable = 1 in {
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
iii, opc, "\t$Rd, $Rn, $imm",
- [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]> {
+ [(set GPR:$Rd, (opnode so_imm:$imm, GPR:$Rn))]> {
bits<4> Rd;
bits<4> Rn;
bits<12> imm;
let Inst{25} = 1;
- let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
let Inst{11-0} = imm;
}
- def rr : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+ }
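+  // (Rematerialization lets the register allocator simply recompute the
+  // reg+imm result when it is needed, rather than spilling and reloading it.)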
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
iir, opc, "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> {
+ [/* pattern left blank */]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
- let isCommutable = Commutable;
+ let Inst{11-4} = 0b00000000;
+ let Inst{25} = 0;
+ let Inst{3-0} = Rm;
+ let Inst{15-12} = Rd;
+ let Inst{19-16} = Rn;
+ }
+
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, (opnode so_reg_imm:$shift, GPR:$Rn))]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
let Inst{25} = 0;
- let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
- let Inst{11-4} = 0b00000000;
- let Inst{3-0} = Rm;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
}
- def rs : AI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm,
+
+ def rsr : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
iis, opc, "\t$Rd, $Rn, $shift",
- [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]> {
+ [(set GPR:$Rd, (opnode so_reg_reg:$shift, GPR:$Rn))]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
- let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
- let Inst{11-0} = shift;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
}
+
+ // Assembly aliases for optional destination operand when it's the same
+ // as the source operand.
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
+ so_imm:$imm, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
+ GPR:$Rm, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_imm:$shift, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_reg:$shift, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+
+}
+
+/// AsI1_rbin_s_is - Same as AsI1_rbin_irs except it sets the 's' bit by default.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
+multiclass AsI1_rbin_s_is<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+ PatFrag opnode, bit Commutable = 0> {
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+ iii, opc, "\t$Rd, $Rn, $imm",
+ [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn))]>;
+
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+ iir, opc, "\t$Rd, $Rn, $Rm",
+ [/* pattern left blank */]>;
+
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn))]>;
+
+ def rsr : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn))]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
+ }
+}
+}
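+
+// Illustrative sketch (opcode names assumed from the rest of the backend,
+// not defined in this hunk): after ISel, AdjustInstrPostInstrSelection
+// rewrites an s-setting pseudo such as SUBSri to the plain SUBri opcode and
+// binds its optional cc_out operand to CPSR.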
+
+/// AsI1_bin_s_irs - Same as AsI1_bin_irs except it sets the 's' bit by default.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
+multiclass AsI1_bin_s_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+ PatFrag opnode, bit Commutable = 0> {
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
+ iii, opc, "\t$Rd, $Rn, $imm",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm))]>;
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
+ iir, opc, "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm))]>;
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift))]>;
+
+ def rsr : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm,
+ iis, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift))]>;
}
}
@@ -857,128 +1138,190 @@ multiclass AI1_cmp_irs<bits<4> opcod, string opc,
let Inst{11-4} = 0b00000000;
let Inst{3-0} = Rm;
}
- def rs : AI1<opcod, (outs), (ins GPR:$Rn, so_reg:$shift), DPSoRegFrm, iis,
+ def rsi : AI1<opcod, (outs),
+ (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, iis,
+ opc, "\t$Rn, $shift",
+ [(opnode GPR:$Rn, so_reg_imm:$shift)]> {
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b0000;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+ }
+ def rsr : AI1<opcod, (outs),
+ (ins GPR:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, iis,
opc, "\t$Rn, $shift",
- [(opnode GPR:$Rn, so_reg:$shift)]> {
+ [(opnode GPR:$Rn, so_reg_reg:$shift)]> {
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
let Inst{20} = 1;
let Inst{19-16} = Rn;
let Inst{15-12} = 0b0000;
- let Inst{11-0} = shift;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
}
+
}
}
/// AI_ext_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
-multiclass AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode> {
- def r : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm),
- IIC_iEXTr, opc, "\t$Rd, $Rm",
- [(set GPR:$Rd, (opnode GPR:$Rm))]>,
- Requires<[IsARM, HasV6]> {
- bits<4> Rd;
- bits<4> Rm;
- let Inst{19-16} = 0b1111;
- let Inst{15-12} = Rd;
- let Inst{11-10} = 0b00;
- let Inst{3-0} = Rm;
- }
- def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot),
- IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot",
- [(set GPR:$Rd, (opnode (rotr GPR:$Rm, rot_imm:$rot)))]>,
- Requires<[IsARM, HasV6]> {
- bits<4> Rd;
- bits<4> Rm;
- bits<2> rot;
- let Inst{19-16} = 0b1111;
- let Inst{15-12} = Rd;
- let Inst{11-10} = rot;
- let Inst{3-0} = Rm;
- }
+class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+ [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+ Requires<[IsARM, HasV6]> {
+ bits<4> Rd;
+ bits<4> Rm;
+ bits<2> rot;
+ let Inst{19-16} = 0b1111;
+ let Inst{15-12} = Rd;
+ let Inst{11-10} = rot;
+ let Inst{3-0} = Rm;
}
-multiclass AI_ext_rrot_np<bits<8> opcod, string opc> {
- def r : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm),
- IIC_iEXTr, opc, "\t$Rd, $Rm",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsARM, HasV6]> {
- let Inst{19-16} = 0b1111;
- let Inst{11-10} = 0b00;
- }
- def r_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rm, rot_imm:$rot),
- IIC_iEXTr, opc, "\t$Rd, $Rm, ror $rot",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsARM, HasV6]> {
- bits<2> rot;
- let Inst{19-16} = 0b1111;
- let Inst{11-10} = rot;
- }
+class AI_ext_rrot_np<bits<8> opcod, string opc>
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
+ Requires<[IsARM, HasV6]> {
+ bits<2> rot;
+ let Inst{19-16} = 0b1111;
+ let Inst{11-10} = rot;
}
/// AI_exta_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
-multiclass AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode> {
- def rr : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM, HasV6]> {
+class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
+ [(set GPRnopc:$Rd, (opnode GPR:$Rn,
+ (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
+ Requires<[IsARM, HasV6]> {
+ bits<4> Rd;
+ bits<4> Rm;
+ bits<4> Rn;
+ bits<2> rot;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-10} = rot;
+ let Inst{9-4} = 0b000111;
+ let Inst{3-0} = Rm;
+}
+
+class AI_exta_rrot_np<bits<8> opcod, string opc>
+ : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
+ IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
+ Requires<[IsARM, HasV6]> {
+ bits<4> Rn;
+ bits<2> rot;
+ let Inst{19-16} = Rn;
+ let Inst{11-10} = rot;
+}
+
+/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+ string baseOpc, bit Commutable = 0> {
+ let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
+ def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
+ DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_imm:$imm, CPSR))]>,
+ Requires<[IsARM]> {
bits<4> Rd;
- bits<4> Rm;
bits<4> Rn;
- let Inst{19-16} = Rn;
+ bits<12> imm;
+ let Inst{25} = 1;
let Inst{15-12} = Rd;
- let Inst{11-10} = 0b00;
- let Inst{9-4} = 0b000111;
- let Inst{3-0} = Rm;
- }
- def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm,
- rot_imm:$rot),
- IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
- [(set GPR:$Rd, (opnode GPR:$Rn,
- (rotr GPR:$Rm, rot_imm:$rot)))]>,
- Requires<[IsARM, HasV6]> {
+ let Inst{19-16} = Rn;
+ let Inst{11-0} = imm;
+ }
+ def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
+ DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, GPR:$Rm, CPSR))]>,
+ Requires<[IsARM]> {
bits<4> Rd;
+ bits<4> Rn;
bits<4> Rm;
+ let Inst{11-4} = 0b00000000;
+ let Inst{25} = 0;
+ let isCommutable = Commutable;
+ let Inst{3-0} = Rm;
+ let Inst{15-12} = Rd;
+ let Inst{19-16} = Rn;
+ }
+ def rsi : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_imm:$shift),
+ DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_imm:$shift, CPSR))]>,
+ Requires<[IsARM]> {
+ bits<4> Rd;
bits<4> Rn;
- bits<2> rot;
+ bits<12> shift;
+ let Inst{25} = 0;
let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
- let Inst{11-10} = rot;
- let Inst{9-4} = 0b000111;
- let Inst{3-0} = Rm;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
}
-}
-
-// For disassembly only.
-multiclass AI_exta_rrot_np<bits<8> opcod, string opc> {
- def rr : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsARM, HasV6]> {
- let Inst{11-10} = 0b00;
- }
- def rr_rot : AExtI<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm,
- rot_imm:$rot),
- IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsARM, HasV6]> {
+ def rsr : AsI1<opcod, (outs GPR:$Rd),
+ (ins GPR:$Rn, so_reg_reg:$shift),
+ DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode GPR:$Rn, so_reg_reg:$shift, CPSR))]>,
+ Requires<[IsARM]> {
+ bits<4> Rd;
bits<4> Rn;
- bits<2> rot;
+ bits<12> shift;
+ let Inst{25} = 0;
let Inst{19-16} = Rn;
- let Inst{11-10} = rot;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
+ }
}
+
+ // Assembly aliases for optional destination operand when it's the same
+ // as the source operand.
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPR:$Rdn, GPR:$Rdn,
+ so_imm:$imm, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $Rm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rr")) GPR:$Rdn, GPR:$Rdn,
+ GPR:$Rm, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_imm:$shift, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_reg:$shift, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
}
-/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
-multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
- string baseOpc, bit Commutable = 0> {
- let Uses = [CPSR] in {
+/// AI1_rsc_irs - Define instructions and patterns for rsc.
+multiclass AI1_rsc_irs<bits<4> opcod, string opc, PatFrag opnode,
+ string baseOpc> {
+ let hasPostISelHook = 1, Defs = [CPSR], Uses = [CPSR] in {
def ri : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
DPFrm, IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
- [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>,
+ [(set GPR:$Rd, CPSR, (opnode so_imm:$imm, GPR:$Rn, CPSR))]>,
Requires<[IsARM]> {
bits<4> Rd;
bits<4> Rn;
@@ -990,31 +1333,48 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
}
def rr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
DPFrm, IIC_iALUr, opc, "\t$Rd, $Rn, $Rm",
- [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]>,
- Requires<[IsARM]> {
+ [/* pattern left blank */]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
let Inst{11-4} = 0b00000000;
let Inst{25} = 0;
- let isCommutable = Commutable;
let Inst{3-0} = Rm;
let Inst{15-12} = Rd;
let Inst{19-16} = Rn;
}
- def rs : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
- DPSoRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
- [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>,
+ def rsi : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_imm:$shift),
+ DPSoRegImmFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode so_reg_imm:$shift, GPR:$Rn, CPSR))]>,
Requires<[IsARM]> {
bits<4> Rd;
bits<4> Rn;
bits<12> shift;
let Inst{25} = 0;
- let Inst{11-0} = shift;
+ let Inst{19-16} = Rn;
let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+ }
+ def rsr : AsI1<opcod, (outs GPR:$Rd), (ins GPR:$Rn, so_reg_reg:$shift),
+ DPSoRegRegFrm, IIC_iALUsr, opc, "\t$Rd, $Rn, $shift",
+ [(set GPR:$Rd, CPSR, (opnode so_reg_reg:$shift, GPR:$Rn, CPSR))]>,
+ Requires<[IsARM]> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<12> shift;
+ let Inst{25} = 0;
let Inst{19-16} = Rn;
+ let Inst{15-12} = Rd;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
}
}
+
// Assembly aliases for optional destination operand when it's the same
// as the source operand.
def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
@@ -1028,28 +1388,15 @@ multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
cc_out:$s)>,
Requires<[IsARM]>;
def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
- (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPR:$Rdn, GPR:$Rdn,
- so_reg:$shift, pred:$p,
+ (!cast<Instruction>(!strconcat(baseOpc, "rsi")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_imm:$shift, pred:$p,
+ cc_out:$s)>,
+ Requires<[IsARM]>;
+ def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rsr")) GPR:$Rdn, GPR:$Rdn,
+ so_reg_reg:$shift, pred:$p,
cc_out:$s)>,
Requires<[IsARM]>;
-}
-
-// Carry setting variants
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1 in {
-multiclass AI1_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> {
- def ri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
- 4, IIC_iALUi,
- [(set GPR:$Rd, (opnode GPR:$Rn, so_imm:$imm))]>;
- def rr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- 4, IIC_iALUr,
- [(set GPR:$Rd, (opnode GPR:$Rn, GPR:$Rm))]> {
- let isCommutable = Commutable;
- }
- def rs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
- 4, IIC_iALUsr,
- [(set GPR:$Rd, (opnode GPR:$Rn, so_reg:$shift))]>;
-}
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
@@ -1082,6 +1429,37 @@ multiclass AI_ldr1<bit isByte, string opc, InstrItinClass iii,
}
}
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+multiclass AI_ldr1nopc<bit isByte, string opc, InstrItinClass iii,
+ InstrItinClass iir, PatFrag opnode> {
+ // Note: We use the complex addrmode_imm12 rather than just an input
+ // GPR and a constrained immediate so that we can use this to match
+ // frame index references and avoid matching constant pool references.
+ def i12: AI2ldst<0b010, 1, isByte, (outs GPRnopc:$Rt), (ins addrmode_imm12:$addr),
+ AddrMode_i12, LdFrm, iii, opc, "\t$Rt, $addr",
+ [(set GPRnopc:$Rt, (opnode addrmode_imm12:$addr))]> {
+ bits<4> Rt;
+ bits<17> addr;
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm12
+ }
+ def rs : AI2ldst<0b011, 1, isByte, (outs GPRnopc:$Rt), (ins ldst_so_reg:$shift),
+ AddrModeNone, LdFrm, iir, opc, "\t$Rt, $shift",
+ [(set GPRnopc:$Rt, (opnode ldst_so_reg:$shift))]> {
+ bits<4> Rt;
+ bits<17> shift;
+ let shift{4} = 0; // Inst{4} = 0
+ let Inst{23} = shift{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = shift{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = shift{11-0};
+ }
+}
+}
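+
+// e.g. a byte load of a stack local can be matched through addrmode_imm12,
+// letting the FrameIndex fold into the address (registers illustrative):
+//   ldrb r0, [sp, #12]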
+
+
multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
InstrItinClass iir, PatFrag opnode> {
// Note: We use the complex addrmode_imm12 rather than just an input
@@ -1110,6 +1488,37 @@ multiclass AI_str1<bit isByte, string opc, InstrItinClass iii,
let Inst{11-0} = shift{11-0};
}
}
+
+multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii,
+ InstrItinClass iir, PatFrag opnode> {
+ // Note: We use the complex addrmode_imm12 rather than just an input
+ // GPR and a constrained immediate so that we can use this to match
+ // frame index references and avoid matching constant pool references.
+ def i12 : AI2ldst<0b010, 0, isByte, (outs),
+ (ins GPRnopc:$Rt, addrmode_imm12:$addr),
+ AddrMode_i12, StFrm, iii, opc, "\t$Rt, $addr",
+ [(opnode GPRnopc:$Rt, addrmode_imm12:$addr)]> {
+ bits<4> Rt;
+ bits<17> addr;
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = addr{11-0}; // imm12
+ }
+ def rs : AI2ldst<0b011, 0, isByte, (outs), (ins GPRnopc:$Rt, ldst_so_reg:$shift),
+ AddrModeNone, StFrm, iir, opc, "\t$Rt, $shift",
+ [(opnode GPRnopc:$Rt, ldst_so_reg:$shift)]> {
+ bits<4> Rt;
+ bits<17> shift;
+ let shift{4} = 0; // Inst{4} = 0
+ let Inst{23} = shift{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = shift{16-13}; // Rn
+ let Inst{15-12} = Rt;
+ let Inst{11-0} = shift{11-0};
+ }
+}
+
+
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
@@ -1140,42 +1549,66 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
[(ARMcallseq_start timm:$amt)]>;
}
-def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "",
- [/* For disassembly only; pattern left blank */]>,
+// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops.
+// (These pseudos use hand-written selection code.)
+let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in {
+def ATOMOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2),
+ NoItinerary, []>;
+def ATOMXOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2),
+ NoItinerary, []>;
+def ATOMADD6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2),
+ NoItinerary, []>;
+def ATOMSUB6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2),
+ NoItinerary, []>;
+def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2),
+ NoItinerary, []>;
+def ATOMAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2),
+ NoItinerary, []>;
+def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$src1, GPR:$src2),
+ NoItinerary, []>;
+def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
+ (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
+ GPR:$set1, GPR:$set2),
+ NoItinerary, []>;
+}
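+
+// A sketch of the loop each pseudo is lowered to (the real expansion is
+// emitted by the custom inserter; register choices are illustrative):
+//   loop:
+//     ldrexd r2, r3, [r0]      @ load-exclusive the 64-bit value
+//     <op>   r2, r3, r4, r5    @ apply the operation to the pair
+//     strexd r6, r2, r3, [r0]  @ attempt the store-exclusive
+//     cmp    r6, #0
+//     bne    loop              @ retry if the exclusive store failed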
+
+def NOP : AI<(outs), (ins), MiscFrm, NoItinerary, "nop", "", []>,
Requires<[IsARM, HasV6T2]> {
let Inst{27-16} = 0b001100100000;
let Inst{15-8} = 0b11110000;
let Inst{7-0} = 0b00000000;
}
-def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "",
- [/* For disassembly only; pattern left blank */]>,
+def YIELD : AI<(outs), (ins), MiscFrm, NoItinerary, "yield", "", []>,
Requires<[IsARM, HasV6T2]> {
let Inst{27-16} = 0b001100100000;
let Inst{15-8} = 0b11110000;
let Inst{7-0} = 0b00000001;
}
-def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "",
- [/* For disassembly only; pattern left blank */]>,
+def WFE : AI<(outs), (ins), MiscFrm, NoItinerary, "wfe", "", []>,
Requires<[IsARM, HasV6T2]> {
let Inst{27-16} = 0b001100100000;
let Inst{15-8} = 0b11110000;
let Inst{7-0} = 0b00000010;
}
-def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "",
- [/* For disassembly only; pattern left blank */]>,
+def WFI : AI<(outs), (ins), MiscFrm, NoItinerary, "wfi", "", []>,
Requires<[IsARM, HasV6T2]> {
let Inst{27-16} = 0b001100100000;
let Inst{15-8} = 0b11110000;
let Inst{7-0} = 0b00000011;
}
-def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel",
- "\t$dst, $a, $b",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsARM, HasV6]> {
+def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
+ "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
bits<4> Rd;
bits<4> Rn;
bits<4> Rm;
@@ -1188,8 +1621,7 @@ def SEL : AI<(outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm, NoItinerary, "sel",
}
def SEV : AI<(outs), (ins), MiscFrm, NoItinerary, "sev", "",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsARM, HasV6T2]> {
+ []>, Requires<[IsARM, HasV6T2]> {
let Inst{27-16} = 0b001100100000;
let Inst{15-8} = 0b11110000;
let Inst{7-0} = 0b00000100;
@@ -1206,14 +1638,11 @@ def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
let Inst{7-4} = 0b0111;
}
-// Change Processor State is a system instruction -- for disassembly and
-// parsing only.
-// FIXME: Since the asm parser has currently no clean way to handle optional
-// operands, create 3 versions of the same instruction. Once there's a clean
-// framework to represent optional operands, change this behavior.
+// Change Processor State
+// FIXME: We should use InstAlias to handle the optional operands.
class CPS<dag iops, string asm_ops>
: AXI<(outs), iops, MiscFrm, NoItinerary, !strconcat("cps", asm_ops),
- [/* For disassembly only; pattern left blank */]>, Requires<[IsARM]> {
+ []>, Requires<[IsARM]> {
bits<2> imod;
bits<3> iflags;
bits<5> mode;
@@ -1229,17 +1658,18 @@ class CPS<dag iops, string asm_ops>
let Inst{4-0} = mode;
}
+let DecoderMethod = "DecodeCPSInstruction" in {
let M = 1 in
- def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
+ def CPS3p : CPS<(ins imod_op:$imod, iflags_op:$iflags, imm0_31:$mode),
"$imod\t$iflags, $mode">;
let mode = 0, M = 0 in
def CPS2p : CPS<(ins imod_op:$imod, iflags_op:$iflags), "$imod\t$iflags">;
let imod = 0, iflags = 0, M = 1 in
- def CPS1p : CPS<(ins i32imm:$mode), "\t$mode">;
+ def CPS1p : CPS<(ins imm0_31:$mode), "\t$mode">;
+}
// Preload signals the memory system of possible future data/instruction access.
-// These are for disassembly only.
multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload,
@@ -1271,6 +1701,7 @@ multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
let Inst{19-16} = shift{16-13}; // Rn
let Inst{15-12} = 0b1111;
let Inst{11-0} = shift{11-0};
+ let Inst{4} = 0;
}
}
@@ -1278,10 +1709,8 @@ defm PLD : APreLoad<1, 1, "pld">, Requires<[IsARM]>;
defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
defm PLI : APreLoad<1, 0, "pli">, Requires<[IsARM,HasV7]>;
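+
+// Usage examples (standard ARM assembly syntax):
+//   pld  [r1, #128]   @ preload data, for read
+//   pldw [r1, #128]   @ preload data, for write (v7, MP extensions)
+//   pli  [r1, #64]    @ preload instruction stream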
-def SETEND : AXI<(outs),(ins setend_op:$end), MiscFrm, NoItinerary,
- "setend\t$end",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsARM]> {
+def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
+ "setend\t$end", []>, Requires<[IsARM]> {
bits<1> end;
let Inst{31-10} = 0b1111000100000001000000;
let Inst{9} = end;
@@ -1351,14 +1780,17 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in
// the instruction. The {24-21} opcode bits are set by the fixup, as we don't
// know until then which form of the instruction will be used.
def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
- MiscFrm, IIC_iALUi, "adr", "\t$Rd, #$label", []> {
+ MiscFrm, IIC_iALUi, "adr", "\t$Rd, $label", []> {
bits<4> Rd;
- bits<12> label;
+ bits<14> label;
let Inst{27-25} = 0b001;
+ let Inst{24} = 0;
+ let Inst{23-22} = label{13-12};
+ let Inst{21} = 0;
let Inst{20} = 0;
let Inst{19-16} = 0b1111;
let Inst{15-12} = Rd;
- let Inst{11-0} = label;
+ let Inst{11-0} = label{11-0};
}
def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
4, IIC_iALUi, []>;
@@ -1424,6 +1856,7 @@ let isCall = 1,
let Inst{31-28} = 0b1110;
bits<24> func;
let Inst{23-0} = func;
+ let DecoderMethod = "DecodeBranchImmInstruction";
}
def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func, variable_ops),
@@ -1432,6 +1865,7 @@ let isCall = 1,
Requires<[IsARM, IsNotDarwin]> {
bits<24> func;
let Inst{23-0} = func;
+ let DecoderMethod = "DecodeBranchImmInstruction";
}
// ARMv5T and above
@@ -1516,6 +1950,7 @@ let isBranch = 1, isTerminator = 1 in {
[/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> {
bits<24> target;
let Inst{23-0} = target;
+ let DecoderMethod = "DecodeBranchImmInstruction";
}
let isBarrier = 1 in {
@@ -1549,9 +1984,9 @@ let isBranch = 1, isTerminator = 1 in {
}
-// BLX (immediate) -- for disassembly only
-def BLXi : AXI<(outs), (ins br_target:$target), BrMiscFrm, NoItinerary,
- "blx\t$target", [/* pattern left blank */]>,
+// BLX (immediate)
+def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
+ "blx\t$target", []>,
Requires<[IsARM, HasV5T]> {
let Inst{31-25} = 0b1111101;
bits<25> target;
@@ -1614,64 +2049,100 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
}
}
-
-
-
-
-// Secure Monitor Call is a system instruction -- for disassembly only
-def SMC : ABI<0b0001, (outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
- [/* For disassembly only; pattern left blank */]> {
+// Secure Monitor Call is a system instruction.
+def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
+ []> {
bits<4> opt;
let Inst{23-4} = 0b01100000000000000111;
let Inst{3-0} = opt;
}
-// Supervisor Call (Software Interrupt) -- for disassembly only
+// Supervisor Call (Software Interrupt)
let isCall = 1, Uses = [SP] in {
-def SVC : ABI<0b1111, (outs), (ins i32imm:$svc), IIC_Br, "svc", "\t$svc",
- [/* For disassembly only; pattern left blank */]> {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []> {
bits<24> svc;
let Inst{23-0} = svc;
}
}
-// Store Return State is a system instruction -- for disassembly only
-let isCodeGenOnly = 1 in { // FIXME: This should not use submode!
-def SRSW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode),
- NoItinerary, "srs${amode}\tsp!, $mode",
- [/* For disassembly only; pattern left blank */]> {
+// Store Return State
+class SRSI<bit wb, string asm>
+ : XI<(outs), (ins imm0_31:$mode), AddrModeNone, 4, IndexModeNone, BrFrm,
+ NoItinerary, asm, "", []> {
+ bits<5> mode;
let Inst{31-28} = 0b1111;
- let Inst{22-20} = 0b110; // W = 1
- let Inst{19-8} = 0xd05;
- let Inst{7-5} = 0b000;
+ let Inst{27-25} = 0b100;
+ let Inst{22} = 1;
+ let Inst{21} = wb;
+ let Inst{20} = 0;
+ let Inst{19-16} = 0b1101; // SP
+ let Inst{15-5} = 0b00000101000;
+ let Inst{4-0} = mode;
}
-def SRS : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, i32imm:$mode),
- NoItinerary, "srs${amode}\tsp, $mode",
- [/* For disassembly only; pattern left blank */]> {
- let Inst{31-28} = 0b1111;
- let Inst{22-20} = 0b100; // W = 0
- let Inst{19-8} = 0xd05;
- let Inst{7-5} = 0b000;
+def SRSDA : SRSI<0, "srsda\tsp, $mode"> {
+ let Inst{24-23} = 0;
+}
+def SRSDA_UPD : SRSI<1, "srsda\tsp!, $mode"> {
+ let Inst{24-23} = 0;
+}
+def SRSDB : SRSI<0, "srsdb\tsp, $mode"> {
+ let Inst{24-23} = 0b10;
+}
+def SRSDB_UPD : SRSI<1, "srsdb\tsp!, $mode"> {
+ let Inst{24-23} = 0b10;
+}
+def SRSIA : SRSI<0, "srsia\tsp, $mode"> {
+ let Inst{24-23} = 0b01;
+}
+def SRSIA_UPD : SRSI<1, "srsia\tsp!, $mode"> {
+ let Inst{24-23} = 0b01;
+}
+def SRSIB : SRSI<0, "srsib\tsp, $mode"> {
+ let Inst{24-23} = 0b11;
+}
+def SRSIB_UPD : SRSI<1, "srsib\tsp!, $mode"> {
+ let Inst{24-23} = 0b11;
}
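+
+// e.g. (standard ARM syntax; mode 19 = Supervisor):
+//   srsdb sp!, #19   @ store return state, decrement-before, with writeback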
-// Return From Exception is a system instruction -- for disassembly only
-def RFEW : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
- NoItinerary, "rfe${amode}\t$base!",
- [/* For disassembly only; pattern left blank */]> {
+// Return From Exception
+class RFEI<bit wb, string asm>
+ : XI<(outs), (ins GPR:$Rn), AddrModeNone, 4, IndexModeNone, BrFrm,
+ NoItinerary, asm, "", []> {
+ bits<4> Rn;
let Inst{31-28} = 0b1111;
- let Inst{22-20} = 0b011; // W = 1
- let Inst{15-0} = 0x0a00;
+ let Inst{27-25} = 0b100;
+ let Inst{22} = 0;
+ let Inst{21} = wb;
+ let Inst{20} = 1;
+ let Inst{19-16} = Rn;
+ let Inst{15-0} = 0xa00;
}
-def RFE : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
- NoItinerary, "rfe${amode}\t$base",
- [/* For disassembly only; pattern left blank */]> {
- let Inst{31-28} = 0b1111;
- let Inst{22-20} = 0b001; // W = 0
- let Inst{15-0} = 0x0a00;
+def RFEDA : RFEI<0, "rfeda\t$Rn"> {
+ let Inst{24-23} = 0;
+}
+def RFEDA_UPD : RFEI<1, "rfeda\t$Rn!"> {
+ let Inst{24-23} = 0;
+}
+def RFEDB : RFEI<0, "rfedb\t$Rn"> {
+ let Inst{24-23} = 0b10;
+}
+def RFEDB_UPD : RFEI<1, "rfedb\t$Rn!"> {
+ let Inst{24-23} = 0b10;
+}
+def RFEIA : RFEI<0, "rfeia\t$Rn"> {
+ let Inst{24-23} = 0b01;
+}
+def RFEIA_UPD : RFEI<1, "rfeia\t$Rn!"> {
+ let Inst{24-23} = 0b01;
+}
+def RFEIB : RFEI<0, "rfeib\t$Rn"> {
+ let Inst{24-23} = 0b11;
+}
+def RFEIB_UPD : RFEI<1, "rfeib\t$Rn!"> {
+ let Inst{24-23} = 0b11;
}
-} // isCodeGenOnly = 1
//===----------------------------------------------------------------------===//
// Load / store Instructions.
@@ -1682,16 +2153,16 @@ def RFE : ABXI<{1,0,0,?}, (outs), (ins ldstm_mode:$amode, GPR:$base),
defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
UnOpFrag<(load node:$Src)>>;
-defm LDRB : AI_ldr1<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
+defm LDRB : AI_ldr1nopc<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
UnOpFrag<(zextloadi8 node:$Src)>>;
defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
BinOpFrag<(store node:$LHS, node:$RHS)>>;
-defm STRB : AI_str1<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
+defm STRB : AI_str1nopc<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
// Special LDR for loads from non-pc-relative constpools.
let canFoldAsLoad = 1, mayLoad = 1, neverHasSideEffects = 1,
- isReMaterializable = 1 in
+ isReMaterializable = 1, isCodeGenOnly = 1 in
def LDRcp : AI2ldst<0b010, 1, 0, (outs GPR:$Rt), (ins addrmode_imm12:$addr),
AddrMode_i12, LdFrm, IIC_iLoad_r, "ldr", "\t$Rt, $addr",
[]> {
@@ -1727,34 +2198,65 @@ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
// Indexed loads
multiclass AI2_ldridx<bit isByte, string opc, InstrItinClass itin> {
- def _PRE : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
- (ins addrmode2:$addr), IndexModePre, LdFrm, itin,
+ def _PRE_IMM : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addrmode_imm12:$addr), IndexModePre, LdFrm, itin,
opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
- // {17-14} Rn
- // {13} 1 == Rm, 0 == imm12
- // {12} isAdd
- // {11-0} imm12/Rm
- bits<18> addr;
- let Inst{25} = addr{13};
+ bits<17> addr;
+ let Inst{25} = 0;
+ let Inst{23} = addr{12};
+ let Inst{19-16} = addr{16-13};
+ let Inst{11-0} = addr{11-0};
+ let DecoderMethod = "DecodeLDRPreImm";
+ let AsmMatchConverter = "cvtLdWriteBackRegAddrModeImm12";
+ }
+
+ def _PRE_REG : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins ldst_so_reg:$addr), IndexModePre, LdFrm, itin,
+ opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+ bits<17> addr;
+ let Inst{25} = 1;
let Inst{23} = addr{12};
- let Inst{19-16} = addr{17-14};
+ let Inst{19-16} = addr{16-13};
let Inst{11-0} = addr{11-0};
- let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2";
+ let Inst{4} = 0;
+ let DecoderMethod = "DecodeLDRPreReg";
+ let AsmMatchConverter = "cvtLdWriteBackRegAddrMode2";
}
- def _POST : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
- (ins GPR:$Rn, am2offset:$offset),
+
+ def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, LdFrm, itin,
+ opc, "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+ }
+
+ def _POST_IMM : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_imm:$offset),
IndexModePost, LdFrm, itin,
- opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> {
- // {13} 1 == Rm, 0 == imm12
+ opc, "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
bits<14> offset;
- bits<4> Rn;
- let Inst{25} = offset{13};
+ bits<4> addr;
+ let Inst{25} = 0;
let Inst{23} = offset{12};
- let Inst{19-16} = Rn;
+ let Inst{19-16} = addr;
let Inst{11-0} = offset{11-0};
+
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
+
}
let mayLoad = 1, neverHasSideEffects = 1 in {
@@ -1762,8 +2264,8 @@ defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_ru>;
defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_ru>;
}
-multiclass AI3_ldridx<bits<4> op, bit op20, string opc, InstrItinClass itin> {
- def _PRE : AI3ldstidx<op, op20, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
+multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
+ def _PRE : AI3ldstidx<op, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins addrmode3:$addr), IndexModePre,
LdMiscFrm, itin,
opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
@@ -1773,27 +2275,31 @@ multiclass AI3_ldridx<bits<4> op, bit op20, string opc, InstrItinClass itin> {
let Inst{19-16} = addr{12-9}; // Rn
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
+ let AsmMatchConverter = "cvtLdWriteBackRegAddrMode3";
+ let DecoderMethod = "DecodeAddrMode3Instruction";
}
- def _POST : AI3ldstidx<op, op20, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
- (ins GPR:$Rn, am3offset:$offset), IndexModePost,
- LdMiscFrm, itin,
- opc, "\t$Rt, [$Rn], $offset", "$Rn = $Rn_wb", []> {
+ def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am3offset:$offset),
+ IndexModePost, LdMiscFrm, itin,
+ opc, "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+ []> {
bits<10> offset;
- bits<4> Rn;
+ bits<4> addr;
let Inst{23} = offset{8}; // U bit
let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
- let Inst{19-16} = Rn;
+ let Inst{19-16} = addr;
let Inst{11-8} = offset{7-4}; // imm7_4/zero
let Inst{3-0} = offset{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
}
}
let mayLoad = 1, neverHasSideEffects = 1 in {
-defm LDRH : AI3_ldridx<0b1011, 1, "ldrh", IIC_iLoad_bh_ru>;
-defm LDRSH : AI3_ldridx<0b1111, 1, "ldrsh", IIC_iLoad_bh_ru>;
-defm LDRSB : AI3_ldridx<0b1101, 1, "ldrsb", IIC_iLoad_bh_ru>;
+defm LDRH : AI3_ldridx<0b1011, "ldrh", IIC_iLoad_bh_ru>;
+defm LDRSH : AI3_ldridx<0b1111, "ldrsh", IIC_iLoad_bh_ru>;
+defm LDRSB : AI3_ldridx<0b1101, "ldrsb", IIC_iLoad_bh_ru>;
let hasExtraDefRegAllocReq = 1 in {
-def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
(ins addrmode3:$addr), IndexModePre,
LdMiscFrm, IIC_iLoad_d_ru,
"ldrd", "\t$Rt, $Rt2, $addr!",
@@ -1804,70 +2310,128 @@ def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
let Inst{19-16} = addr{12-9}; // Rn
let Inst{11-8} = addr{7-4}; // imm7_4/zero
let Inst{3-0} = addr{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+ let AsmMatchConverter = "cvtLdrdPre";
}
-def LDRD_POST: AI3ldstidx<0b1101, 0, 1, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
- (ins GPR:$Rn, am3offset:$offset), IndexModePost,
- LdMiscFrm, IIC_iLoad_d_ru,
- "ldrd", "\t$Rt, $Rt2, [$Rn], $offset",
- "$Rn = $Rn_wb", []> {
+def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am3offset:$offset),
+ IndexModePost, LdMiscFrm, IIC_iLoad_d_ru,
+ "ldrd", "\t$Rt, $Rt2, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
bits<10> offset;
- bits<4> Rn;
+ bits<4> addr;
let Inst{23} = offset{8}; // U bit
let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
- let Inst{19-16} = Rn;
+ let Inst{19-16} = addr;
let Inst{11-8} = offset{7-4}; // imm7_4/zero
let Inst{3-0} = offset{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
}
} // hasExtraDefRegAllocReq = 1
} // mayLoad = 1, neverHasSideEffects = 1
-// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT are for disassembly only.
+// LDRT, LDRBT, LDRSBT, LDRHT, LDRSHT (unprivileged loads).
let mayLoad = 1, neverHasSideEffects = 1 in {
-def LDRT : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$base_wb),
- (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_ru,
- "ldrt", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
- // {17-14} Rn
- // {13} 1 == Rm, 0 == imm12
+def LDRT_POST_REG : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_ru,
+ "ldrt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
- bits<18> addr;
- let Inst{25} = addr{13};
- let Inst{23} = addr{12};
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
- let Inst{19-16} = addr{17-14};
- let Inst{11-0} = addr{11-0};
- let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2";
-}
-def LDRBT : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
- (ins addrmode2:$addr), IndexModePost, LdFrm, IIC_iLoad_bh_ru,
- "ldrbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
- // {17-14} Rn
- // {13} 1 == Rm, 0 == imm12
+ let Inst{19-16} = addr;
+ let Inst{11-5} = offset{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = offset{3-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRT_POST_IMM : AI2ldstidx<1, 0, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_ru,
+ "ldrt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
// {12} isAdd
// {11-0} imm12/Rm
- bits<18> addr;
- let Inst{25} = addr{13};
- let Inst{23} = addr{12};
- let Inst{21} = 1; // overwrite
- let Inst{19-16} = addr{17-14};
- let Inst{11-0} = addr{11-0};
- let AsmMatchConverter = "CvtLdWriteBackRegAddrMode2";
-}
-def LDRSBT : AI3ldstidxT<0b1101, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
- (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru,
- "ldrsbt", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 0;
+ let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
-def LDRHT : AI3ldstidxT<0b1011, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
- (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru,
- "ldrht", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
+
+def LDRBT_POST_REG : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+ "ldrbt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
-}
-def LDRSHT : AI3ldstidxT<0b1111, 1, 1, 0, (outs GPR:$Rt, GPR:$base_wb),
- (ins addrmode3:$addr), IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru,
- "ldrsht", "\t$Rt, $addr", "$addr.base = $base_wb", []> {
+ let Inst{19-16} = addr;
+ let Inst{11-5} = offset{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = offset{3-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def LDRBT_POST_IMM : AI2ldstidx<1, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, LdFrm, IIC_iLoad_bh_ru,
+ "ldrbt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 0;
+ let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+multiclass AI3ldrT<bits<4> op, string opc> {
+ def i : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+ (ins addr_offset_none:$addr, postidx_imm8:$offset),
+ IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+ "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+ bits<9> offset;
+ let Inst{23} = offset{8};
+ let Inst{22} = 1;
+ let Inst{11-8} = offset{7-4};
+ let Inst{3-0} = offset{3-0};
+ let AsmMatchConverter = "cvtLdExtTWriteBackImm";
+ }
+ def r : AI3ldstidxT<op, 1, (outs GPR:$Rt, GPR:$base_wb),
+ (ins addr_offset_none:$addr, postidx_reg:$Rm),
+ IndexModePost, LdMiscFrm, IIC_iLoad_bh_ru, opc,
+ "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+ bits<5> Rm;
+ let Inst{23} = Rm{4};
+ let Inst{22} = 0;
+ let Inst{11-8} = 0;
+ let Inst{3-0} = Rm{3-0};
+ let AsmMatchConverter = "cvtLdExtTWriteBackReg";
+ }
}
+
+defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
+defm LDRHT : AI3ldrT<0b1011, "ldrht">;
+defm LDRSHT : AI3ldrT<0b1111, "ldrsht">;
}
// Store
@@ -1881,98 +2445,302 @@ def STRH : AI3str<0b1011, (outs), (ins GPR:$Rt, addrmode3:$addr), StMiscFrm,
let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
def STRD : AI3str<0b1111, (outs), (ins GPR:$Rt, GPR:$src2, addrmode3:$addr),
StMiscFrm, IIC_iStore_d_r,
- "strd", "\t$Rt, $src2, $addr", []>, Requires<[IsARM, HasV5TE]>;
+ "strd", "\t$Rt, $src2, $addr", []>,
+ Requires<[IsARM, HasV5TE]> {
+ let Inst{21} = 0;
+}
// Indexed stores
-def STR_PRE : AI2stridx<0, 1, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
- IndexModePre, StFrm, IIC_iStore_ru,
- "str", "\t$Rt, [$Rn, $offset]!",
- "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
- [(set GPR:$Rn_wb,
- (pre_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
-
-def STR_POST : AI2stridx<0, 0, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
- IndexModePost, StFrm, IIC_iStore_ru,
- "str", "\t$Rt, [$Rn], $offset",
- "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
- [(set GPR:$Rn_wb,
- (post_store GPR:$Rt, GPR:$Rn, am2offset:$offset))]>;
-
-def STRB_PRE : AI2stridx<1, 1, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
- IndexModePre, StFrm, IIC_iStore_bh_ru,
- "strb", "\t$Rt, [$Rn, $offset]!",
- "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
- [(set GPR:$Rn_wb, (pre_truncsti8 GPR:$Rt,
- GPR:$Rn, am2offset:$offset))]>;
-def STRB_POST: AI2stridx<1, 0, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, GPR:$Rn, am2offset:$offset),
- IndexModePost, StFrm, IIC_iStore_bh_ru,
- "strb", "\t$Rt, [$Rn], $offset",
- "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
- [(set GPR:$Rn_wb, (post_truncsti8 GPR:$Rt,
- GPR:$Rn, am2offset:$offset))]>;
-
-def STRH_PRE : AI3stridx<0b1011, 0, 1, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
- IndexModePre, StMiscFrm, IIC_iStore_ru,
- "strh", "\t$Rt, [$Rn, $offset]!",
- "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
- [(set GPR:$Rn_wb,
- (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
-
-def STRH_POST: AI3stridx<0b1011, 0, 0, (outs GPR:$Rn_wb),
- (ins GPR:$Rt, GPR:$Rn, am3offset:$offset),
- IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
- "strh", "\t$Rt, [$Rn], $offset",
- "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
- [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
- GPR:$Rn, am3offset:$offset))]>;
-
-// For disassembly only
+multiclass AI2_stridx<bit isByte, string opc, InstrItinClass itin> {
+ def _PRE_IMM : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addrmode_imm12:$addr), IndexModePre,
+ StFrm, itin,
+ opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+ bits<17> addr;
+ let Inst{25} = 0;
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{11-0} = addr{11-0}; // imm12
+ let AsmMatchConverter = "cvtStWriteBackRegAddrModeImm12";
+ let DecoderMethod = "DecodeSTRPreImm";
+ }
+
+ def _PRE_REG : AI2ldstidx<0, isByte, 1, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, ldst_so_reg:$addr),
+ IndexModePre, StFrm, itin,
+ opc, "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+ bits<17> addr;
+ let Inst{25} = 1;
+ let Inst{23} = addr{12}; // U (add = ('U' == 1))
+ let Inst{19-16} = addr{16-13}; // Rn
+ let Inst{11-0} = addr{11-0};
+ let Inst{4} = 0; // Inst{4} = 0
+ let AsmMatchConverter = "cvtStWriteBackRegAddrMode2";
+ let DecoderMethod = "DecodeSTRPreReg";
+ }
+ def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, StFrm, itin,
+ opc, "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+ }
+
+ def _POST_IMM : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, StFrm, itin,
+ opc, "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 0;
+ let Inst{23} = offset{12};
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+ }
+}
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+defm STR : AI2_stridx<0, "str", IIC_iStore_ru>;
+defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_ru>;
+}
+
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+ am2offset_reg:$offset),
+ (STR_POST_REG GPR:$Rt, addr_offset_none:$addr,
+ am2offset_reg:$offset)>;
+def : ARMPat<(post_store GPR:$Rt, addr_offset_none:$addr,
+ am2offset_imm:$offset),
+ (STR_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+ am2offset_imm:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+ am2offset_reg:$offset),
+ (STRB_POST_REG GPR:$Rt, addr_offset_none:$addr,
+ am2offset_reg:$offset)>;
+def : ARMPat<(post_truncsti8 GPR:$Rt, addr_offset_none:$addr,
+ am2offset_imm:$offset),
+ (STRB_POST_IMM GPR:$Rt, addr_offset_none:$addr,
+ am2offset_imm:$offset)>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not the single
+// complex operand with which the instructions themselves are represented.
+// The pseudos map between the two.
+let usesCustomInserter = 1,
+ Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def STRi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+ (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPR:$Rn_wb,
+ (pre_store GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+ (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPR:$Rn_wb,
+ (pre_store GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRBi_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+ (ins GPR:$Rt, GPR:$Rn, am2offset_imm:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPR:$Rn_wb,
+ (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_imm:$offset))]>;
+def STRBr_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+ (ins GPR:$Rt, GPR:$Rn, am2offset_reg:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPR:$Rn_wb,
+ (pre_truncsti8 GPR:$Rt, GPR:$Rn, am2offset_reg:$offset))]>;
+def STRH_preidx: ARMPseudoInst<(outs GPR:$Rn_wb),
+ (ins GPR:$Rt, GPR:$Rn, am3offset:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPR:$Rn_wb,
+ (pre_truncsti16 GPR:$Rt, GPR:$Rn, am3offset:$offset))]>;
+}
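+
+// e.g. (a sketch): a pre_store node with separate base and am2offset_imm
+// operands selects STRi_preidx; the custom inserter then rewrites it to the
+// real pre-indexed store (STR_PRE_IMM above), whose single complex operand
+// re-combines the base and offset.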
+
+
+
+def STRH_PRE : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addrmode3:$addr), IndexModePre,
+ StMiscFrm, IIC_iStore_bh_ru,
+ "strh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []> {
+ bits<14> addr;
+ let Inst{23} = addr{8}; // U bit
+ let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{11-8} = addr{7-4}; // imm7_4/zero
+ let Inst{3-0} = addr{3-0}; // imm3_0/Rm
+ let AsmMatchConverter = "cvtStWriteBackRegAddrMode3";
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
+def STRH_POST : AI3ldstidx<0b1011, 0, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am3offset:$offset),
+ IndexModePost, StMiscFrm, IIC_iStore_bh_ru,
+ "strh", "\t$Rt, $addr, $offset", "$addr.base = $Rn_wb",
+ [(set GPR:$Rn_wb, (post_truncsti16 GPR:$Rt,
+ addr_offset_none:$addr,
+ am3offset:$offset))]> {
+ bits<10> offset;
+ bits<4> addr;
+ let Inst{23} = offset{8}; // U bit
+ let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr;
+ let Inst{11-8} = offset{7-4}; // imm7_4/zero
+ let Inst{3-0} = offset{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+}
+
let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
-def STRD_PRE : AI3stdpr<(outs GPR:$base_wb),
- (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset),
- StMiscFrm, IIC_iStore_d_ru,
- "strd", "\t$src1, $src2, [$base, $offset]!",
- "$base = $base_wb", []>;
-
-// For disassembly only
-def STRD_POST: AI3stdpo<(outs GPR:$base_wb),
- (ins GPR:$src1, GPR:$src2, GPR:$base, am3offset:$offset),
- StMiscFrm, IIC_iStore_d_ru,
- "strd", "\t$src1, $src2, [$base], $offset",
- "$base = $base_wb", []>;
+def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
+ IndexModePre, StMiscFrm, IIC_iStore_d_ru,
+ "strd", "\t$Rt, $Rt2, $addr!",
+ "$addr.base = $Rn_wb", []> {
+ bits<14> addr;
+ let Inst{23} = addr{8}; // U bit
+ let Inst{22} = addr{13}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{11-8} = addr{7-4}; // imm7_4/zero
+ let Inst{3-0} = addr{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+ let AsmMatchConverter = "cvtStrdPre";
+}
+
+def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr,
+ am3offset:$offset),
+ IndexModePost, StMiscFrm, IIC_iStore_d_ru,
+ "strd", "\t$Rt, $Rt2, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ bits<10> offset;
+ bits<4> addr;
+ let Inst{23} = offset{8}; // U bit
+ let Inst{22} = offset{9}; // 1 == imm8, 0 == Rm
+ let Inst{19-16} = addr;
+ let Inst{11-8} = offset{7-4}; // imm7_4/zero
+ let Inst{3-0} = offset{3-0}; // imm3_0/Rm
+ let DecoderMethod = "DecodeAddrMode3Instruction";
+}
} // mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1
-// STRT, STRBT, and STRHT are for disassembly only.
+// STRT, STRBT, and STRHT: unprivileged (user-mode) store variants.
-def STRT : AI2stridxT<0, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr),
- IndexModePost, StFrm, IIC_iStore_ru,
- "strt", "\t$Rt, $addr", "$addr.base = $Rn_wb",
- [/* For disassembly only; pattern left blank */]> {
+def STRBT_POST_REG : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, StFrm, IIC_iStore_bh_ru,
+ "strbt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
- let AsmMatchConverter = "CvtStWriteBackRegAddrMode2";
+ let Inst{19-16} = addr;
+ let Inst{11-5} = offset{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = offset{3-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRBT_POST_IMM : AI2ldstidx<0, 1, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, StFrm, IIC_iStore_bh_ru,
+ "strbt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 0;
+ let Inst{23} = offset{12};
+ let Inst{21} = 1; // overwrite
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
}
-def STRBT : AI2stridxT<1, 0, (outs GPR:$Rn_wb), (ins GPR:$Rt, addrmode2:$addr),
- IndexModePost, StFrm, IIC_iStore_bh_ru,
- "strbt", "\t$Rt, $addr", "$addr.base = $Rn_wb",
- [/* For disassembly only; pattern left blank */]> {
+let mayStore = 1, neverHasSideEffects = 1 in {
+def STRT_POST_REG : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_reg:$offset),
+ IndexModePost, StFrm, IIC_iStore_ru,
+ "strt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 1;
+ let Inst{23} = offset{12};
+ let Inst{21} = 1; // overwrite
+ let Inst{19-16} = addr;
+ let Inst{11-5} = offset{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = offset{3-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
+
+def STRT_POST_IMM : AI2ldstidx<0, 0, 0, (outs GPR:$Rn_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, am2offset_imm:$offset),
+ IndexModePost, StFrm, IIC_iStore_ru,
+ "strt", "\t$Rt, $addr, $offset",
+ "$addr.base = $Rn_wb", []> {
+ // {12} isAdd
+ // {11-0} imm12/Rm
+ bits<14> offset;
+ bits<4> addr;
+ let Inst{25} = 0;
+ let Inst{23} = offset{12};
let Inst{21} = 1; // overwrite
- let AsmMatchConverter = "CvtStWriteBackRegAddrMode2";
+ let Inst{19-16} = addr;
+ let Inst{11-0} = offset{11-0};
+ let DecoderMethod = "DecodeAddrMode2IdxInstruction";
+}
}
-def STRHT: AI3sthpo<(outs GPR:$base_wb), (ins GPR:$Rt, addrmode3:$addr),
- StMiscFrm, IIC_iStore_bh_ru,
- "strht", "\t$Rt, $addr", "$addr.base = $base_wb",
- [/* For disassembly only; pattern left blank */]> {
- let Inst{21} = 1; // overwrite
- let AsmMatchConverter = "CvtStWriteBackRegAddrMode3";
+
+multiclass AI3strT<bits<4> op, string opc> {
+ def i : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, postidx_imm8:$offset),
+ IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+ "\t$Rt, $addr, $offset", "$addr.base = $base_wb", []> {
+ bits<9> offset;
+ let Inst{23} = offset{8};
+ let Inst{22} = 1;
+ let Inst{11-8} = offset{7-4};
+ let Inst{3-0} = offset{3-0};
+ let AsmMatchConverter = "cvtStExtTWriteBackImm";
+ }
+ def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
+ (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
+ IndexModePost, StMiscFrm, IIC_iStore_bh_ru, opc,
+ "\t$Rt, $addr, $Rm", "$addr.base = $base_wb", []> {
+ bits<5> Rm;
+ let Inst{23} = Rm{4};
+ let Inst{22} = 0;
+ let Inst{11-8} = 0;
+ let Inst{3-0} = Rm{3-0};
+ let AsmMatchConverter = "cvtStExtTWriteBackReg";
+ }
}
+
+defm STRHT : AI3strT<0b1011, "strht">;
+
+
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -1996,6 +2764,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
let Inst{24-23} = 0b01; // Increment After
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
+
+ let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
def DA :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
@@ -2012,6 +2782,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
let Inst{24-23} = 0b00; // Decrement After
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
+
+ let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
def DB :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
@@ -2028,6 +2800,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
let Inst{24-23} = 0b10; // Decrement Before
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
+
+ let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
def IB :
AXI4<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
@@ -2044,6 +2818,8 @@ multiclass arm_ldst_mult<string asm, bit L_bit, Format f,
let Inst{24-23} = 0b11; // Increment Before
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
+
+ let DecoderMethod = "DecodeMemMultipleWritebackInstruction";
}
}
@@ -2084,6 +2860,9 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
let Inst{15-12} = Rd;
}
+def : ARMInstAlias<"movs${p} $Rd, $Rm",
+ (MOVr GPR:$Rd, GPR:$Rm, pred:$p, CPSR)>;
+
// A version for the smaller set of tail call registers.
let neverHasSideEffects = 1 in
def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
@@ -2097,15 +2876,33 @@ def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
let Inst{15-12} = Rd;
}
-def MOVs : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg:$src),
- DPSoRegFrm, IIC_iMOVsr,
- "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg:$src)]>,
+def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
+ DPSoRegRegFrm, IIC_iMOVsr,
+ "mov", "\t$Rd, $src",
+ [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP {
+ bits<4> Rd;
+ bits<12> src;
+ let Inst{15-12} = Rd;
+ let Inst{19-16} = 0b0000;
+ let Inst{11-8} = src{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = src{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = src{3-0};
+ let Inst{25} = 0;
+}
+
+def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
+ DPSoRegImmFrm, IIC_iMOVsr,
+ "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
UnaryDP {
bits<4> Rd;
bits<12> src;
let Inst{15-12} = Rd;
let Inst{19-16} = 0b0000;
- let Inst{11-0} = src;
+ let Inst{11-5} = src{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = src{3-0};
let Inst{25} = 0;
}
@@ -2121,7 +2918,7 @@ def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
-def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins i32imm_hilo16:$imm),
+def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
DPFrm, IIC_iMOVi,
"movw", "\t$Rd, $imm",
[(set GPR:$Rd, imm0_65535:$imm)]>,
@@ -2133,16 +2930,22 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins i32imm_hilo16:$imm),
let Inst{19-16} = imm{15-12};
let Inst{20} = 0;
let Inst{25} = 1;
+ let DecoderMethod = "DecodeArmMOVTWInstruction";
}
+def : InstAlias<"mov${p} $Rd, $imm",
+ (MOVi16 GPR:$Rd, imm0_65535_expr:$imm, pred:$p)>,
+ Requires<[IsARM]>;
+
def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
(ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
let Constraints = "$src = $Rd" in {
-def MOVTi16 : AI1<0b1010, (outs GPR:$Rd), (ins GPR:$src, i32imm_hilo16:$imm),
+def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
+ (ins GPR:$src, imm0_65535_expr:$imm),
DPFrm, IIC_iMOVi,
"movt", "\t$Rd, $imm",
- [(set GPR:$Rd,
+ [(set GPRnopc:$Rd,
(or (and GPR:$src, 0xffff),
lo16AllZero:$imm))]>, UnaryDP,
Requires<[IsARM, HasV6T2]> {
@@ -2153,6 +2956,7 @@ def MOVTi16 : AI1<0b1010, (outs GPR:$Rd), (ins GPR:$src, i32imm_hilo16:$imm),
let Inst{19-16} = imm{15-12};
let Inst{20} = 0;
let Inst{25} = 1;
+ let DecoderMethod = "DecodeArmMOVTWInstruction";
}
def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
@@ -2186,30 +2990,28 @@ def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
// Sign extenders
-defm SXTB : AI_ext_rrot<0b01101010,
+def SXTB : AI_ext_rrot<0b01101010,
"sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
-defm SXTH : AI_ext_rrot<0b01101011,
+def SXTH : AI_ext_rrot<0b01101011,
"sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
-defm SXTAB : AI_exta_rrot<0b01101010,
+def SXTAB : AI_exta_rrot<0b01101010,
"sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-defm SXTAH : AI_exta_rrot<0b01101011,
+def SXTAH : AI_exta_rrot<0b01101011,
"sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
-// For disassembly only
-defm SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">;
+def SXTB16 : AI_ext_rrot_np<0b01101000, "sxtb16">;
-// For disassembly only
-defm SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
+def SXTAB16 : AI_exta_rrot_np<0b01101000, "sxtab16">;
// Zero extenders
let AddedComplexity = 16 in {
-defm UXTB : AI_ext_rrot<0b01101110,
+def UXTB : AI_ext_rrot<0b01101110,
"uxtb" , UnOpFrag<(and node:$Src, 0x000000FF)>>;
-defm UXTH : AI_ext_rrot<0b01101111,
+def UXTH : AI_ext_rrot<0b01101111,
"uxth" , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-defm UXTB16 : AI_ext_rrot<0b01101100,
+def UXTB16 : AI_ext_rrot<0b01101100,
"uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
@@ -2217,23 +3019,22 @@ defm UXTB16 : AI_ext_rrot<0b01101100,
// instead so we can include a check for masking back in the upper
// eight bits of the source into the lower eight bits of the result.
//def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
-// (UXTB16r_rot GPR:$Src, 24)>;
+// (UXTB16r_rot GPR:$Src, 3)>;
def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
- (UXTB16r_rot GPR:$Src, 8)>;
+ (UXTB16 GPR:$Src, 1)>;
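
The rotate operand changing from 8 to 1 reflects the operand now counting in byte units (1 == ror #8). The equivalence the rewritten pattern relies on can be checked in plain C++; the helper names here are illustrative:

    #include <cassert>
    #include <cstdint>

    static uint32_t ror32(uint32_t x, unsigned n) {      // rotate right
      return n ? (x >> n) | (x << (32 - n)) : x;
    }
    static uint32_t uxtb16(uint32_t x, unsigned rot) {   // rot counts bytes
      return ror32(x, 8 * rot) & 0x00FF00FFu;
    }

    int main() {
      // (and (srl x, 8), 0xFF00FF) == UXTB16 x, ror #8: the byte rotated
      // into the top lane is discarded by the 0x00FF00FF mask anyway.
      uint32_t x = 0xDEADBEEFu;
      assert(((x >> 8) & 0x00FF00FFu) == uxtb16(x, 1));
    }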
-defm UXTAB : AI_exta_rrot<0b01101110, "uxtab",
+def UXTAB : AI_exta_rrot<0b01101110, "uxtab",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-defm UXTAH : AI_exta_rrot<0b01101111, "uxtah",
+def UXTAH : AI_exta_rrot<0b01101111, "uxtah",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
}
// This isn't safe in general, the add is two 16-bit units, not a 32-bit add.
-// For disassembly only
-defm UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
+def UXTAB16 : AI_exta_rrot_np<0b01101100, "uxtab16">;
-def SBFX : I<(outs GPR:$Rd),
- (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width),
+def SBFX : I<(outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, imm0_31:$lsb, imm1_32:$width),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"sbfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
Requires<[IsARM, HasV6T2]> {
@@ -2250,7 +3051,7 @@ def SBFX : I<(outs GPR:$Rd),
}
def UBFX : I<(outs GPR:$Rd),
- (ins GPR:$Rn, imm0_31:$lsb, imm0_31_m1:$width),
+ (ins GPR:$Rn, imm0_31:$lsb, imm1_32:$width),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"ubfx", "\t$Rd, $Rn, $lsb, $width", "", []>,
Requires<[IsARM, HasV6T2]> {
@@ -2278,148 +3079,58 @@ defm SUB : AsI1_bin_irs<0b0010, "sub",
BinOpFrag<(sub node:$LHS, node:$RHS)>, "SUB">;
// ADD and SUB with 's' bit set.
-defm ADDS : AI1_bin_s_irs<0b0100, "adds",
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
+defm ADDS : AsI1_bin_s_irs<0b0100, "add",
IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
-defm SUBS : AI1_bin_s_irs<0b0010, "subs",
+ BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
+defm SUBS : AsI1_bin_s_irs<0b0010, "sub",
IIC_iALUi, IIC_iALUr, IIC_iALUsr,
- BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+ BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
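
A minimal C++ model of what the ARMaddc/ARMsubc nodes described in the comment above produce: a value plus a flags result, where the flags result is what the post-isel adjustment keeps or drops depending on CPSR liveness. This is a conceptual sketch, not the LLVM pass:

    #include <cstdint>
    #include <utility>

    // ARMaddc-style node: two results, the 32-bit sum and the carry-out.
    // If no later node consumes the flag, the "s" bit can be dropped.
    static std::pair<uint32_t, bool> addc(uint32_t a, uint32_t b) {
      uint32_t sum = a + b;
      return {sum, sum < a};            // carry out of bit 31
    }

    // ARMsubc: ARM sets C to NOT borrow on subtraction.
    static std::pair<uint32_t, bool> subc(uint32_t a, uint32_t b) {
      return {a - b, a >= b};           // carry == !borrow
    }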
defm ADC : AI1_adde_sube_irs<0b0101, "adc",
- BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>,
+ BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>,
"ADC", 1>;
defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
- BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>,
+ BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>,
"SBC">;
-// ADC and SUBC with 's' bit set.
-let usesCustomInserter = 1 in {
-defm ADCS : AI1_adde_sube_s_irs<
- BinOpFrag<(adde_live_carry node:$LHS, node:$RHS)>, 1>;
-defm SBCS : AI1_adde_sube_s_irs<
- BinOpFrag<(sube_live_carry node:$LHS, node:$RHS) >>;
-}
-
-def RSBri : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm), DPFrm,
- IIC_iALUi, "rsb", "\t$Rd, $Rn, $imm",
- [(set GPR:$Rd, (sub so_imm:$imm, GPR:$Rn))]> {
- bits<4> Rd;
- bits<4> Rn;
- bits<12> imm;
- let Inst{25} = 1;
- let Inst{15-12} = Rd;
- let Inst{19-16} = Rn;
- let Inst{11-0} = imm;
-}
-
-// The reg/reg form is only defined for the disassembler; for codegen it is
-// equivalent to SUBrr.
-def RSBrr : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm,
- IIC_iALUr, "rsb", "\t$Rd, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]> {
- bits<4> Rd;
- bits<4> Rn;
- bits<4> Rm;
- let Inst{11-4} = 0b00000000;
- let Inst{25} = 0;
- let Inst{3-0} = Rm;
- let Inst{15-12} = Rd;
- let Inst{19-16} = Rn;
-}
-
-def RSBrs : AsI1<0b0011, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
- DPSoRegFrm, IIC_iALUsr, "rsb", "\t$Rd, $Rn, $shift",
- [(set GPR:$Rd, (sub so_reg:$shift, GPR:$Rn))]> {
- bits<4> Rd;
- bits<4> Rn;
- bits<12> shift;
- let Inst{25} = 0;
- let Inst{11-0} = shift;
- let Inst{15-12} = Rd;
- let Inst{19-16} = Rn;
-}
+defm RSB : AsI1_rbin_irs <0b0011, "rsb",
+ IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+ BinOpFrag<(sub node:$LHS, node:$RHS)>, "RSB">;
-// RSB with 's' bit set.
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1 in {
-def RSBSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
- 4, IIC_iALUi,
- [(set GPR:$Rd, (subc so_imm:$imm, GPR:$Rn))]>;
-def RSBSrr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- 4, IIC_iALUr,
- [/* For disassembly only; pattern left blank */]>;
-def RSBSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
- 4, IIC_iALUsr,
- [(set GPR:$Rd, (subc so_reg:$shift, GPR:$Rn))]>;
-}
-
-let Uses = [CPSR] in {
-def RSCri : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
- DPFrm, IIC_iALUi, "rsc", "\t$Rd, $Rn, $imm",
- [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>,
- Requires<[IsARM]> {
- bits<4> Rd;
- bits<4> Rn;
- bits<12> imm;
- let Inst{25} = 1;
- let Inst{15-12} = Rd;
- let Inst{19-16} = Rn;
- let Inst{11-0} = imm;
-}
-// The reg/reg form is only defined for the disassembler; for codegen it is
-// equivalent to SUBrr.
-def RSCrr : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- DPFrm, IIC_iALUr, "rsc", "\t$Rd, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]> {
- bits<4> Rd;
- bits<4> Rn;
- bits<4> Rm;
- let Inst{11-4} = 0b00000000;
- let Inst{25} = 0;
- let Inst{3-0} = Rm;
- let Inst{15-12} = Rd;
- let Inst{19-16} = Rn;
-}
-def RSCrs : AsI1<0b0111, (outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
- DPSoRegFrm, IIC_iALUsr, "rsc", "\t$Rd, $Rn, $shift",
- [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>,
- Requires<[IsARM]> {
- bits<4> Rd;
- bits<4> Rn;
- bits<12> shift;
- let Inst{25} = 0;
- let Inst{11-0} = shift;
- let Inst{15-12} = Rd;
- let Inst{19-16} = Rn;
-}
-}
+// FIXME: Eliminate them if we can write def : Pat patterns which defines
+// CPSR and the implicit def of CPSR is not needed.
+defm RSBS : AsI1_rbin_s_is<0b0011, "rsb",
+ IIC_iALUi, IIC_iALUr, IIC_iALUsr,
+ BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1, Uses = [CPSR] in {
-def RSCSri : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_imm:$imm),
- 4, IIC_iALUi,
- [(set GPR:$Rd, (sube_dead_carry so_imm:$imm, GPR:$Rn))]>;
-def RSCSrs : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$Rn, so_reg:$shift),
- 4, IIC_iALUsr,
- [(set GPR:$Rd, (sube_dead_carry so_reg:$shift, GPR:$Rn))]>;
-}
+defm RSC : AI1_rsc_irs<0b0111, "rsc",
+ BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>,
+ "RSC">;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
// assume opposite meanings of the carry flag (i.e., carry == !borrow).
// See the definition of AddWithCarry() in the ARM ARM A2.2.1 for the gory
// details.
-def : ARMPat<(add GPR:$src, so_imm_neg:$imm),
- (SUBri GPR:$src, so_imm_neg:$imm)>;
-def : ARMPat<(addc GPR:$src, so_imm_neg:$imm),
- (SUBSri GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(add GPR:$src, so_imm_neg:$imm),
+ (SUBri GPR:$src, so_imm_neg:$imm)>;
+def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
+ (SUBSri GPR:$src, so_imm_neg:$imm)>;
+
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
// for part of the negation.
-def : ARMPat<(adde_dead_carry GPR:$src, so_imm_not:$imm),
- (SBCri GPR:$src, so_imm_not:$imm)>;
-def : ARMPat<(adde_live_carry GPR:$src, so_imm_not:$imm),
- (SBCSri GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR),
+ (SBCri GPR:$src, so_imm_not:$imm)>;
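
The "bitwise not instead of negation" remark can be checked directly: ARM's SBC computes Rn + NOT(operand) + C, so feeding the complemented immediate to an add-with-carry is exactly a subtract of the original immediate. A small C++ check:

    #include <cassert>
    #include <cstdint>

    static uint32_t adde(uint32_t a, uint32_t b, bool c) { return a + b + c; }

    int main() {
      uint32_t a = 100, imm = 7;
      for (bool c : {false, true})
        // a + ~imm + c == a - imm - (1 - c), i.e. SBC a, imm with carry c.
        assert(adde(a, ~imm, c) == a - imm - (c ? 0u : 1u));
    }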
// Note: These are implemented in C++ code, because they have to generate
// ADD/SUBrs instructions, which use a complex pattern that a xform function
@@ -2427,12 +3138,13 @@ def : ARMPat<(adde_live_carry GPR:$src, so_imm_not:$imm),
// (mul X, 2^n+1) -> (add (X << n), X)
// (mul X, 2^n-1) -> (rsb X, (X << n))
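
The two identities the C++ selection code relies on, verified in plain C++ (unsigned wraparound preserves both):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 12345u;
      for (unsigned n = 1; n < 31; ++n) {
        assert(x * ((1u << n) + 1) == x + (x << n));   // add (X << n), X
        assert(x * ((1u << n) - 1) == (x << n) - x);   // rsb X, (X << n)
      }
    }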
-// ARM Arithmetic Instruction -- for disassembly only
+// ARM Arithmetic Instruction
// GPR:$dst = GPR:$a op GPR:$b
class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
- list<dag> pattern = [/* For disassembly only; pattern left blank */],
- dag iops = (ins GPR:$Rn, GPR:$Rm), string asm = "\t$Rd, $Rn, $Rm">
- : AI<(outs GPR:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
+ list<dag> pattern = [],
+ dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ string asm = "\t$Rd, $Rn, $Rm">
+ : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
bits<4> Rn;
bits<4> Rd;
bits<4> Rm;
@@ -2443,17 +3155,19 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
let Inst{3-0} = Rm;
}
-// Saturating add/subtract -- for disassembly only
+// Saturating add/subtract
def QADD : AAI<0b00010000, 0b00000101, "qadd",
- [(set GPR:$Rd, (int_arm_qadd GPR:$Rm, GPR:$Rn))],
- (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">;
+ [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
+ (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
def QSUB : AAI<0b00010010, 0b00000101, "qsub",
- [(set GPR:$Rd, (int_arm_qsub GPR:$Rm, GPR:$Rn))],
- (ins GPR:$Rm, GPR:$Rn), "\t$Rd, $Rm, $Rn">;
-def QDADD : AAI<0b00010100, 0b00000101, "qdadd", [], (ins GPR:$Rm, GPR:$Rn),
+ [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
+ (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+def QDADD : AAI<0b00010100, 0b00000101, "qdadd", [],
+ (ins GPRnopc:$Rm, GPRnopc:$Rn),
"\t$Rd, $Rm, $Rn">;
-def QDSUB : AAI<0b00010110, 0b00000101, "qdsub", [], (ins GPR:$Rm, GPR:$Rn),
+def QDSUB : AAI<0b00010110, 0b00000101, "qdsub", [],
+ (ins GPRnopc:$Rm, GPRnopc:$Rn),
"\t$Rd, $Rm, $Rn">;
def QADD16 : AAI<0b01100010, 0b11110001, "qadd16">;
@@ -2469,7 +3183,7 @@ def UQSAX : AAI<0b01100110, 0b11110101, "uqsax">;
def UQSUB16 : AAI<0b01100110, 0b11110111, "uqsub16">;
def UQSUB8 : AAI<0b01100110, 0b11111111, "uqsub8">;
-// Signed/Unsigned add/subtract -- for disassembly only
+// Signed/Unsigned add/subtract
def SASX : AAI<0b01100001, 0b11110011, "sasx">;
def SADD16 : AAI<0b01100001, 0b11110001, "sadd16">;
@@ -2484,7 +3198,7 @@ def USAX : AAI<0b01100101, 0b11110101, "usax">;
def USUB16 : AAI<0b01100101, 0b11110111, "usub16">;
def USUB8 : AAI<0b01100101, 0b11111111, "usub8">;
-// Signed/Unsigned halving add/subtract -- for disassembly only
+// Signed/Unsigned halving add/subtract
def SHASX : AAI<0b01100011, 0b11110011, "shasx">;
def SHADD16 : AAI<0b01100011, 0b11110001, "shadd16">;
@@ -2499,7 +3213,7 @@ def UHSAX : AAI<0b01100111, 0b11110101, "uhsax">;
def UHSUB16 : AAI<0b01100111, 0b11110111, "uhsub16">;
def UHSUB8 : AAI<0b01100111, 0b11111111, "uhsub8">;
-// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only
+// Unsigned Sum of Absolute Differences [and Accumulate].
def USAD8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
MulFrm /* for convenience */, NoItinerary, "usad8",
@@ -2531,11 +3245,11 @@ def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
let Inst{3-0} = Rn;
}
-// Signed/Unsigned saturate -- for disassembly only
+// Signed/Unsigned saturate
-def SSAT : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$a, shift_imm:$sh),
- SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $a$sh",
- [/* For disassembly only; pattern left blank */]> {
+def SSAT : AI<(outs GPRnopc:$Rd),
+ (ins imm1_32:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+ SatFrm, NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
bits<4> Rd;
bits<5> sat_imm;
bits<4> Rn;
@@ -2544,14 +3258,14 @@ def SSAT : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$a, shift_imm:$sh),
let Inst{5-4} = 0b01;
let Inst{20-16} = sat_imm;
let Inst{15-12} = Rd;
- let Inst{11-7} = sh{7-3};
- let Inst{6} = sh{0};
+ let Inst{11-7} = sh{4-0};
+ let Inst{6} = sh{5};
let Inst{3-0} = Rn;
}
-def SSAT16 : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$Rn), SatFrm,
- NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn",
- [/* For disassembly only; pattern left blank */]> {
+def SSAT16 : AI<(outs GPRnopc:$Rd),
+ (ins imm1_16:$sat_imm, GPRnopc:$Rn), SatFrm,
+ NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []> {
bits<4> Rd;
bits<4> sat_imm;
bits<4> Rn;
@@ -2562,9 +3276,9 @@ def SSAT16 : AI<(outs GPR:$Rd), (ins ssat_imm:$sat_imm, GPR:$Rn), SatFrm,
let Inst{3-0} = Rn;
}
-def USAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
- SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $a$sh",
- [/* For disassembly only; pattern left blank */]> {
+def USAT : AI<(outs GPRnopc:$Rd),
+ (ins imm0_31:$sat_imm, GPRnopc:$Rn, shift_imm:$sh),
+ SatFrm, NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
bits<4> Rd;
bits<5> sat_imm;
bits<4> Rn;
@@ -2572,15 +3286,15 @@ def USAT : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a, shift_imm:$sh),
let Inst{27-21} = 0b0110111;
let Inst{5-4} = 0b01;
let Inst{15-12} = Rd;
- let Inst{11-7} = sh{7-3};
- let Inst{6} = sh{0};
+ let Inst{11-7} = sh{4-0};
+ let Inst{6} = sh{5};
let Inst{20-16} = sat_imm;
let Inst{3-0} = Rn;
}
-def USAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a), SatFrm,
- NoItinerary, "usat16", "\t$Rd, $sat_imm, $a",
- [/* For disassembly only; pattern left blank */]> {
+def USAT16 : AI<(outs GPRnopc:$Rd),
+ (ins imm0_15:$sat_imm, GPRnopc:$Rn), SatFrm,
+ NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []> {
bits<4> Rd;
bits<4> sat_imm;
bits<4> Rn;
@@ -2591,8 +3305,10 @@ def USAT16 : AI<(outs GPR:$Rd), (ins i32imm:$sat_imm, GPR:$a), SatFrm,
let Inst{3-0} = Rn;
}
-def : ARMV6Pat<(int_arm_ssat GPR:$a, imm:$pos), (SSAT imm:$pos, GPR:$a, 0)>;
-def : ARMV6Pat<(int_arm_usat GPR:$a, imm:$pos), (USAT imm:$pos, GPR:$a, 0)>;
+def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos),
+ (SSAT imm:$pos, GPRnopc:$a, 0)>;
+def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos),
+ (USAT imm:$pos, GPRnopc:$a, 0)>;
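
The new imm1_32/imm0_31 operand ranges match the saturation widths. A sketch of the SSAT/USAT semantics, assuming the straightforward clamp reading of the ARM ARM:

    #include <algorithm>
    #include <cstdint>

    // SSAT n: clamp to the signed n-bit range [-2^(n-1), 2^(n-1)-1], n = 1..32.
    static int32_t ssat(int32_t x, unsigned n) {
      int64_t hi = (1ll << (n - 1)) - 1, lo = -hi - 1;
      return (int32_t)std::min<int64_t>(std::max<int64_t>(x, lo), hi);
    }
    // USAT n: clamp to the unsigned n-bit range [0, 2^n - 1], n = 0..31.
    static uint32_t usat(int32_t x, unsigned n) {
      int64_t hi = (1ll << n) - 1;
      return (uint32_t)std::min<int64_t>(std::max<int64_t>(x, 0), hi);
    }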
//===----------------------------------------------------------------------===//
// Bitwise Instructions.
@@ -2611,6 +3327,10 @@ defm BIC : AsI1_bin_irs<0b1110, "bic",
IIC_iBITi, IIC_iBITr, IIC_iBITsr,
BinOpFrag<(and node:$LHS, (not node:$RHS))>, "BIC">;
+// FIXME: bf_inv_mask_imm should be two operands, the lsb and the msb, just
+// like in the actual instruction encoding. The complexity of mapping the mask
+// to the lsb/msb pair should be handled by ISel, not encapsulated in the
+// instruction description.
def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
"bfc", "\t$Rd, $imm", "$src = $Rd",
@@ -2622,16 +3342,16 @@ def BFC : I<(outs GPR:$Rd), (ins GPR:$src, bf_inv_mask_imm:$imm),
let Inst{6-0} = 0b0011111;
let Inst{15-12} = Rd;
let Inst{11-7} = imm{4-0}; // lsb
- let Inst{20-16} = imm{9-5}; // width
+ let Inst{20-16} = imm{9-5}; // msb
}
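
What the FIXME asks ISel to own, sketched in C++: recovering the lsb/msb pair from an inverted mask whose clear bits form one contiguous run (contiguity is assumed, matching what bf_inv_mask_imm accepts):

    #include <cstdint>

    static void maskToLsbMsb(uint32_t invMask, unsigned &lsb, unsigned &msb) {
      uint32_t run = ~invMask;           // the bits BFC/BFI operate on
      lsb = __builtin_ctz(run);          // index of the lowest set bit
      msb = 31 - __builtin_clz(run);     // index of the highest set bit
    }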
// A8.6.18 BFI - Bitfield insert (Encoding A1)
-def BFI : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
- AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
- "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
- [(set GPR:$Rd, (ARMbfi GPR:$src, GPR:$Rn,
- bf_inv_mask_imm:$imm))]>,
- Requires<[IsARM, HasV6T2]> {
+def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
+ AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
+ "bfi", "\t$Rd, $Rn, $imm", "$src = $Rd",
+ [(set GPRnopc:$Rd, (ARMbfi GPRnopc:$src, GPR:$Rn,
+ bf_inv_mask_imm:$imm))]>,
+ Requires<[IsARM, HasV6T2]> {
bits<4> Rd;
bits<4> Rn;
bits<10> imm;
@@ -2643,25 +3363,6 @@ def BFI : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
let Inst{3-0} = Rn;
}
-// GNU as only supports this form of bfi (w/ 4 arguments)
-let isAsmParserOnly = 1 in
-def BFI4p : I<(outs GPR:$Rd), (ins GPR:$src, GPR:$Rn,
- lsb_pos_imm:$lsb, width_imm:$width),
- AddrMode1, 4, IndexModeNone, DPFrm, IIC_iUNAsi,
- "bfi", "\t$Rd, $Rn, $lsb, $width", "$src = $Rd",
- []>, Requires<[IsARM, HasV6T2]> {
- bits<4> Rd;
- bits<4> Rn;
- bits<5> lsb;
- bits<5> width;
- let Inst{27-21} = 0b0111110;
- let Inst{6-4} = 0b001; // Rn: Inst{3-0} != 15
- let Inst{15-12} = Rd;
- let Inst{11-7} = lsb;
- let Inst{20-16} = width; // Custom encoder => lsb+width-1
- let Inst{3-0} = Rn;
-}
-
def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
"mvn", "\t$Rd, $Rm",
[(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP {
@@ -2673,15 +3374,31 @@ def MVNr : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
let Inst{15-12} = Rd;
let Inst{3-0} = Rm;
}
-def MVNs : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg:$shift), DPSoRegFrm,
- IIC_iMVNsr, "mvn", "\t$Rd, $shift",
- [(set GPR:$Rd, (not so_reg:$shift))]>, UnaryDP {
+def MVNsi : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
+ DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+ [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP {
+ bits<4> Rd;
+ bits<12> shift;
+ let Inst{25} = 0;
+ let Inst{19-16} = 0b0000;
+ let Inst{15-12} = Rd;
+ let Inst{11-5} = shift{11-5};
+ let Inst{4} = 0;
+ let Inst{3-0} = shift{3-0};
+}
+def MVNsr : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
+ DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
+ [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP {
bits<4> Rd;
bits<12> shift;
let Inst{25} = 0;
let Inst{19-16} = 0b0000;
let Inst{15-12} = Rd;
- let Inst{11-0} = shift;
+ let Inst{11-8} = shift{11-8};
+ let Inst{7} = 0;
+ let Inst{6-5} = shift{6-5};
+ let Inst{4} = 1;
+ let Inst{3-0} = shift{3-0};
}
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
def MVNi : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
@@ -2820,8 +3537,8 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
bits<4> RdHi;
bits<4> Rm;
bits<4> Rn;
- let Inst{19-16} = RdLo;
- let Inst{15-12} = RdHi;
+ let Inst{19-16} = RdHi;
+ let Inst{15-12} = RdLo;
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
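
The field swap above matters because UMAAL's two outputs are asymmetric. Its semantics, per the ARM ARM:

    #include <cstdint>

    // UMAAL: RdHi:RdLo = Rn * Rm + RdLo + RdHi (all unsigned); RdHi lives
    // in encoding bits 19-16 and RdLo in bits 15-12, as fixed above.
    static void umaal(uint32_t &RdLo, uint32_t &RdHi, uint32_t Rn, uint32_t Rm) {
      uint64_t r = (uint64_t)Rn * Rm + RdLo + RdHi;
      RdLo = (uint32_t)r;
      RdHi = (uint32_t)(r >> 32);
    }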
@@ -2855,8 +3572,7 @@ def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
}
def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]>,
+ IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
Requires<[IsARM, HasV6]> {
let Inst{15-12} = 0b1111;
}
@@ -2869,8 +3585,7 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra",
- [/* For disassembly only; pattern left blank */]>,
+ IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6]>;
def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
@@ -2881,8 +3596,7 @@ def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
- IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra",
- [/* For disassembly only; pattern left blank */]>,
+ IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsARM, HasV6]>;
multiclass AI_smul<string opc, PatFrag opnode> {
@@ -2925,92 +3639,95 @@ multiclass AI_smul<string opc, PatFrag opnode> {
multiclass AI_smla<string opc, PatFrag opnode> {
- def BB : AMulxyIa<0b0001000, 0b00, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ let DecoderMethod = "DecodeSMLAInstruction" in {
+ def BB : AMulxyIa<0b0001000, 0b00, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (add GPR:$Ra,
- (opnode (sext_inreg GPR:$Rn, i16),
- (sext_inreg GPR:$Rm, i16))))]>,
+ [(set GPRnopc:$Rd, (add GPR:$Ra,
+ (opnode (sext_inreg GPRnopc:$Rn, i16),
+ (sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE]>;
- def BT : AMulxyIa<0b0001000, 0b10, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (add GPR:$Ra, (opnode (sext_inreg GPR:$Rn, i16),
- (sra GPR:$Rm, (i32 16)))))]>,
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
+ (sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]>;
- def TB : AMulxyIa<0b0001000, 0b01, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)),
- (sext_inreg GPR:$Rm, i16))))]>,
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+ (sext_inreg GPRnopc:$Rm, i16))))]>,
Requires<[IsARM, HasV5TE]>;
- def TT : AMulxyIa<0b0001000, 0b11, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (add GPR:$Ra, (opnode (sra GPR:$Rn, (i32 16)),
- (sra GPR:$Rm, (i32 16)))))]>,
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
+ (sra GPRnopc:$Rm, (i32 16)))))]>,
Requires<[IsARM, HasV5TE]>;
- def WB : AMulxyIa<0b0001001, 0b00, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn,
- (sext_inreg GPR:$Rm, i16)), (i32 16))))]>,
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+ (sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>,
Requires<[IsARM, HasV5TE]>;
- def WT : AMulxyIa<0b0001001, 0b10, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
- [(set GPR:$Rd, (add GPR:$Ra, (sra (opnode GPR:$Rn,
- (sra GPR:$Rm, (i32 16))), (i32 16))))]>,
+ [(set GPRnopc:$Rd,
+ (add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
+ (sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>,
Requires<[IsARM, HasV5TE]>;
+ }
}
defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
-// Halfword multiply accumulate long: SMLAL<x><y> -- for disassembly only
-def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm),
- IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]>,
+// Halfword multiply accumulate long: SMLAL<x><y>.
+def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
Requires<[IsARM, HasV5TE]>;
-def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm),
- IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]>,
+def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
Requires<[IsARM, HasV5TE]>;
-def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm),
- IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]>,
+def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
Requires<[IsARM, HasV5TE]>;
-def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm),
- IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm",
- [/* For disassembly only; pattern left blank */]>,
+def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
Requires<[IsARM, HasV5TE]>;
-// Helper class for AI_smld -- for disassembly only
+// Helper class for AI_smld.
class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
InstrItinClass itin, string opc, string asm>
: AI<oops, iops, MulFrm, itin, opc, asm, []>, Requires<[IsARM, HasV6]> {
bits<4> Rn;
bits<4> Rm;
- let Inst{4} = 1;
- let Inst{5} = swap;
- let Inst{6} = sub;
- let Inst{7} = 0;
- let Inst{21-20} = 0b00;
- let Inst{22} = long;
let Inst{27-23} = 0b01110;
+ let Inst{22} = long;
+ let Inst{21-20} = 0b00;
let Inst{11-8} = Rm;
+ let Inst{7} = 0;
+ let Inst{6} = sub;
+ let Inst{5} = swap;
+ let Inst{4} = 1;
let Inst{3-0} = Rn;
}
class AMulDualI<bit long, bit sub, bit swap, dag oops, dag iops,
@@ -3024,6 +3741,8 @@ class AMulDualIa<bit long, bit sub, bit swap, dag oops, dag iops,
InstrItinClass itin, string opc, string asm>
: AMulDualIbase<long, sub, swap, oops, iops, itin, opc, asm> {
bits<4> Ra;
+ bits<4> Rd;
+ let Inst{19-16} = Rd;
let Inst{15-12} = Ra;
}
class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
@@ -3037,18 +3756,20 @@ class AMulDualI64<bit long, bit sub, bit swap, dag oops, dag iops,
multiclass AI_smld<bit sub, string opc> {
- def D : AMulDualIa<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
- def DX: AMulDualIa<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
+ def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
- def LD: AMulDualI64<1, sub, 0, (outs GPR:$RdLo,GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), NoItinerary,
+ def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
!strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
- def LDX : AMulDualI64<1, sub, 1, (outs GPR:$RdLo,GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), NoItinerary,
+ def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
!strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
}
@@ -3058,10 +3779,10 @@ defm SMLS : AI_smld<1, "smls">;
multiclass AI_sdml<bit sub, string opc> {
- def D : AMulDualI<0, sub, 0, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
- def DX : AMulDualI<0, sub, 1, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
- NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+ def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
+ NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+ def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
+ NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
}
defm SMUA : AI_sdml<0, "smua">;
@@ -3100,55 +3821,38 @@ def : ARMV6Pat<(or (sra (shl GPR:$Rm, (i32 24)), (i32 16)),
(and (srl GPR:$Rm, (i32 8)), 0xFF)),
(REVSH GPR:$Rm)>;
-def lsl_shift_imm : SDNodeXForm<imm, [{
- unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::lsl, N->getZExtValue());
- return CurDAG->getTargetConstant(Sh, MVT::i32);
-}]>;
-
-def lsl_amt : ImmLeaf<i32, [{
- return Imm > 0 && Imm < 32;
-}], lsl_shift_imm>;
-
-def PKHBT : APKHI<0b01101000, 0, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh),
+def PKHBT : APKHI<0b01101000, 0, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_lsl_amt:$sh),
IIC_iALUsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
- [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF),
- (and (shl GPR:$Rm, lsl_amt:$sh),
- 0xFFFF0000)))]>,
+ [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF),
+ (and (shl GPRnopc:$Rm, pkh_lsl_amt:$sh),
+ 0xFFFF0000)))]>,
Requires<[IsARM, HasV6]>;
// Alternate cases for PKHBT where identities eliminate some nodes.
-def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (and GPR:$Rm, 0xFFFF0000)),
- (PKHBT GPR:$Rn, GPR:$Rm, 0)>;
-def : ARMV6Pat<(or (and GPR:$Rn, 0xFFFF), (shl GPR:$Rm, imm16_31:$sh)),
- (PKHBT GPR:$Rn, GPR:$Rm, (lsl_shift_imm imm16_31:$sh))>;
-
-def asr_shift_imm : SDNodeXForm<imm, [{
- unsigned Sh = ARM_AM::getSORegOpc(ARM_AM::asr, N->getZExtValue());
- return CurDAG->getTargetConstant(Sh, MVT::i32);
-}]>;
-
-def asr_amt : ImmLeaf<i32, [{
- return Imm > 0 && Imm <= 32;
-}], asr_shift_imm>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (and GPRnopc:$Rm, 0xFFFF0000)),
+ (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(or (and GPRnopc:$Rn, 0xFFFF), (shl GPRnopc:$Rm, imm16_31:$sh)),
+ (PKHBT GPRnopc:$Rn, GPRnopc:$Rm, imm16_31:$sh)>;
// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
// will match the pattern below.
-def PKHTB : APKHI<0b01101000, 1, (outs GPR:$Rd),
- (ins GPR:$Rn, GPR:$Rm, shift_imm:$sh),
+def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
+ (ins GPRnopc:$Rn, GPRnopc:$Rm, pkh_asr_amt:$sh),
IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
- [(set GPR:$Rd, (or (and GPR:$Rn, 0xFFFF0000),
- (and (sra GPR:$Rm, asr_amt:$sh),
- 0xFFFF)))]>,
+ [(set GPRnopc:$Rd, (or (and GPRnopc:$Rn, 0xFFFF0000),
+ (and (sra GPRnopc:$Rm, pkh_asr_amt:$sh),
+ 0xFFFF)))]>,
Requires<[IsARM, HasV6]>;
// Alternate cases for PKHTB where identities eliminate some nodes. Note that
// a shift amount of 0 is *not legal* here, it is PKHBT instead.
-def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, imm16_31:$sh)),
- (PKHTB GPR:$src1, GPR:$src2, (asr_shift_imm imm16_31:$sh))>;
-def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
- (and (srl GPR:$src2, imm1_15:$sh), 0xFFFF)),
- (PKHTB GPR:$src1, GPR:$src2, (asr_shift_imm imm1_15:$sh))>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+ (srl GPRnopc:$src2, imm16_31:$sh)),
+ (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+ (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
+ (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
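
A C++ model of the two pack-halfword forms the patterns above target (the architectural asr #32 case is omitted for simplicity):

    #include <cstdint>

    // PKHBT: bottom half from Rn, top half from (Rm << sh), sh = 0..31.
    static uint32_t pkhbt(uint32_t rn, uint32_t rm, unsigned sh) {
      return (rn & 0x0000FFFFu) | ((rm << sh) & 0xFFFF0000u);
    }
    // PKHTB: top half from Rn, bottom half from (Rm asr sh), sh = 1..31 here;
    // sh == 0 is not a valid PKHTB encoding, it assembles as PKHBT instead.
    static uint32_t pkhtb(uint32_t rn, uint32_t rm, unsigned sh) {
      uint32_t bottom = (uint32_t)((int32_t)rm >> sh) & 0x0000FFFFu;
      return (rn & 0xFFFF0000u) | bottom;
    }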
//===----------------------------------------------------------------------===//
// Comparison Instructions...
@@ -3163,8 +3867,10 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_imm:$imm),
(CMPri GPR:$src, so_imm:$imm)>;
def : ARMPat<(ARMcmpZ GPR:$src, GPR:$rhs),
(CMPrr GPR:$src, GPR:$rhs)>;
-def : ARMPat<(ARMcmpZ GPR:$src, so_reg:$rhs),
- (CMPrs GPR:$src, so_reg:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_imm:$rhs),
+ (CMPrsi GPR:$src, so_reg_imm:$rhs)>;
+def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
+ (CMPrsr GPR:$src, so_reg_reg:$rhs)>;
// FIXME: We have to be careful when using the CMN instruction and comparison
// with 0. One would expect these two pieces of code should give identical
@@ -3250,15 +3956,23 @@ def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p),
4, IIC_iCMOVr,
[/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $Rd">;
-def MOVCCs : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, so_reg:$shift, pred:$p),
+def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, so_reg_imm:$shift, pred:$p),
+ 4, IIC_iCMOVsr,
+ [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift,
+ imm:$cc, CCR:$ccr))*/]>,
+ RegConstraint<"$false = $Rd">;
+def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
+ (ins GPR:$false, so_reg_reg:$shift, pred:$p),
4, IIC_iCMOVsr,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg:$shift, imm:$cc, CCR:$ccr))*/]>,
+ [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+ imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $Rd">;
+
let isMoveImm = 1 in
def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd),
- (ins GPR:$false, i32imm_hilo16:$imm, pred:$p),
+ (ins GPR:$false, imm0_65535_expr:$imm, pred:$p),
4, IIC_iMOVi,
[]>,
RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
@@ -3288,9 +4002,14 @@ def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
// Atomic operations intrinsics
//
+def MemBarrierOptOperand : AsmOperandClass {
+ let Name = "MemBarrierOpt";
+ let ParserMethod = "parseMemBarrierOptOperand";
+}
def memb_opt : Operand<i32> {
let PrintMethod = "printMemBOption";
let ParserMatchClass = MemBarrierOptOperand;
+ let DecoderMethod = "DecodeMemBarrierOption";
}
// memory barriers protect the atomic sequences
@@ -3321,8 +4040,16 @@ def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
let Inst{3-0} = opt;
}
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS.
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def ABS : ARMPseudoInst<
+ (outs GPR:$dst), (ins GPR:$src),
+ 8, NoItinerary, []>;
+}
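
What the custom inserter produces for ABS, modeled in C++; the sign test stands in for the "mi" predicate reading the N flag set by movs:

    #include <cstdint>

    static uint32_t abs_expansion(uint32_t rn) {
      uint32_t rd = rn;            // movs Rd, Rn  (sets N from the result)
      if ((int32_t)rd < 0)         // "mi" condition: N flag set
        rd = 0u - rd;              // rsbmi Rd, Rd, #0
      return rd;                   // note INT_MIN maps to itself, as on hardware
    }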
+
let usesCustomInserter = 1 in {
- let Uses = [CPSR] in {
+ let Defs = [CPSR] in {
def ATOMIC_LOAD_ADD_I8 : PseudoInst<
(outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
[(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
@@ -3437,44 +4164,47 @@ let usesCustomInserter = 1 in {
}
let mayLoad = 1 in {
-def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
+def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary,
"ldrexb", "\t$Rt, $addr", []>;
-def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
- "ldrexh", "\t$Rt, $addr", []>;
-def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addrmode7:$addr), NoItinerary,
- "ldrex", "\t$Rt, $addr", []>;
+def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldrexh", "\t$Rt, $addr", []>;
+def LDREX : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+ NoItinerary, "ldrex", "\t$Rt, $addr", []>;
let hasExtraDefRegAllocReq = 1 in
- def LDREXD : AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode7:$addr),
- NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []>;
+def LDREXD: AIldrex<0b01, (outs GPR:$Rt, GPR:$Rt2),(ins addr_offset_none:$addr),
+ NoItinerary, "ldrexd", "\t$Rt, $Rt2, $addr", []> {
+ let DecoderMethod = "DecodeDoubleRegLoad";
+}
}
let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
-def STREXB : AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
+def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>;
-def STREXH : AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
+def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>;
-def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addrmode7:$addr),
+def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>;
}
let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
def STREXD : AIstrex<0b01, (outs GPR:$Rd),
- (ins GPR:$Rt, GPR:$Rt2, addrmode7:$addr),
- NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []>;
+ (ins GPR:$Rt, GPR:$Rt2, addr_offset_none:$addr),
+ NoItinerary, "strexd", "\t$Rd, $Rt, $Rt2, $addr", []> {
+ let DecoderMethod = "DecodeDoubleRegStore";
+}
-// Clear-Exclusive is for disassembly only.
-def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
- [/* For disassembly only; pattern left blank */]>,
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>,
Requires<[IsARM, HasV7]> {
let Inst{31-0} = 0b11110101011111111111000000011111;
}
-// SWP/SWPB are deprecated in V6/V7 and for disassembly only.
-let mayLoad = 1 in {
-def SWP : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swp",
- [/* For disassembly only; pattern left blank */]>;
-def SWPB : AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, GPR:$Rn), "swpb",
- [/* For disassembly only; pattern left blank */]>;
+// SWP/SWPB are deprecated in V6/V7.
+let mayLoad = 1, mayStore = 1 in {
+def SWP : AIswp<0, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr),
+ "swp", []>;
+def SWPB: AIswp<1, (outs GPR:$Rt), (ins GPR:$Rt2, addr_offset_none:$addr),
+ "swpb", []>;
}
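
SWP's data movement, for reference; on hardware the read and the write are a single atomic operation, which this sketch cannot express:

    #include <cstdint>

    // SWP Rt, Rt2, [addr]: Rt = [addr]; [addr] = Rt2 (atomic on hardware).
    static uint32_t swp(uint32_t *addr, uint32_t rt2) {
      uint32_t old = *addr;
      *addr = rt2;
      return old;   // becomes Rt
    }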
//===----------------------------------------------------------------------===//
@@ -3526,108 +4256,171 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
class ACI<dag oops, dag iops, string opc, string asm,
IndexMode im = IndexModeNone>
+ : I<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
+ opc, asm, "", []> {
+ let Inst{27-25} = 0b110;
+}
+class ACInoP<dag oops, dag iops, string opc, string asm,
+ IndexMode im = IndexModeNone>
: InoP<oops, iops, AddrModeNone, 4, im, BrFrm, NoItinerary,
- opc, asm, "", [/* For disassembly only; pattern left blank */]> {
+ opc, asm, "", []> {
+ let Inst{31-28} = 0b1111;
let Inst{27-25} = 0b110;
}
-
-multiclass LdStCop<bits<4> op31_28, bit load, dag ops, string opc, string cond>{
-
- def _OFFSET : ACI<(outs),
- !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
- !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr"> {
- let Inst{31-28} = op31_28;
+multiclass LdStCop<bit load, bit Dbit, string asm> {
+ def _OFFSET : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+ asm, "\t$cop, $CRd, $addr"> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
- let Inst{22} = 0; // D = 0
let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
}
-
- def _PRE : ACI<(outs),
- !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
- !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr!", IndexModePre> {
- let Inst{31-28} = op31_28;
+ def _PRE : ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+ asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
- let Inst{22} = 0; // D = 0
let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
}
-
- def _POST : ACI<(outs),
- !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
- !strconcat(opc, cond), "\tp$cop, cr$CRd, $addr", IndexModePost> {
- let Inst{31-28} = op31_28;
+ def _POST: ACI<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ postidx_imm8s4:$offset),
+ asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+ bits<9> offset;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 0; // P = 0
+ let Inst{23} = offset{8};
+ let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
- let Inst{22} = 0; // D = 0
let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = offset{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
}
-
def _OPTION : ACI<(outs),
- !con((ins nohash_imm:$cop,nohash_imm:$CRd,GPR:$base, nohash_imm:$option),
- ops),
- !strconcat(opc, cond), "\tp$cop, cr$CRd, [$base], \\{$option\\}"> {
- let Inst{31-28} = op31_28;
+ (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ coproc_option_imm:$option),
+ asm, "\t$cop, $CRd, $addr, $option"> {
+ bits<8> option;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 0; // P = 0
let Inst{23} = 1; // U = 1
+ let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
- let Inst{22} = 0; // D = 0
let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = option;
+ let DecoderMethod = "DecodeCopMemInstruction";
}
-
- def L_OFFSET : ACI<(outs),
- !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
- !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr"> {
- let Inst{31-28} = op31_28;
+}
+multiclass LdSt2Cop<bit load, bit Dbit, string asm> {
+ def _OFFSET : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+ asm, "\t$cop, $CRd, $addr"> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
- let Inst{22} = 1; // D = 1
let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
}
-
- def L_PRE : ACI<(outs),
- !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
- !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr!",
- IndexModePre> {
- let Inst{31-28} = op31_28;
+ def _PRE : ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+ asm, "\t$cop, $CRd, $addr!", IndexModePre> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
- let Inst{22} = 1; // D = 1
let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
}
-
- def L_POST : ACI<(outs),
- !con((ins nohash_imm:$cop, nohash_imm:$CRd, addrmode2:$addr), ops),
- !strconcat(!strconcat(opc, "l"), cond), "\tp$cop, cr$CRd, $addr",
- IndexModePost> {
- let Inst{31-28} = op31_28;
+ def _POST: ACInoP<(outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ postidx_imm8s4:$offset),
+ asm, "\t$cop, $CRd, $addr, $offset", IndexModePost> {
+ bits<9> offset;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 0; // P = 0
+ let Inst{23} = offset{8};
+ let Inst{22} = Dbit;
let Inst{21} = 1; // W = 1
- let Inst{22} = 1; // D = 1
let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = offset{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
}
-
- def L_OPTION : ACI<(outs),
- !con((ins nohash_imm:$cop, nohash_imm:$CRd,GPR:$base,nohash_imm:$option),
- ops),
- !strconcat(!strconcat(opc, "l"), cond),
- "\tp$cop, cr$CRd, [$base], \\{$option\\}"> {
- let Inst{31-28} = op31_28;
+ def _OPTION : ACInoP<(outs),
+ (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ coproc_option_imm:$option),
+ asm, "\t$cop, $CRd, $addr, $option"> {
+ bits<8> option;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
let Inst{24} = 0; // P = 0
let Inst{23} = 1; // U = 1
+ let Inst{22} = Dbit;
let Inst{21} = 0; // W = 0
- let Inst{22} = 1; // D = 1
let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = option;
+ let DecoderMethod = "DecodeCopMemInstruction";
}
}
-defm LDC : LdStCop<{?,?,?,?}, 1, (ins pred:$p), "ldc", "${p}">;
-defm LDC2 : LdStCop<0b1111, 1, (ins), "ldc2", "">;
-defm STC : LdStCop<{?,?,?,?}, 0, (ins pred:$p), "stc", "${p}">;
-defm STC2 : LdStCop<0b1111, 0, (ins), "stc2", "">;
+defm LDC : LdStCop <1, 0, "ldc">;
+defm LDCL : LdStCop <1, 1, "ldcl">;
+defm STC : LdStCop <0, 0, "stc">;
+defm STCL : LdStCop <0, 1, "stcl">;
+defm LDC2 : LdSt2Cop<1, 0, "ldc2">;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l">;
+defm STC2 : LdSt2Cop<0, 0, "stc2">;
+defm STC2L : LdSt2Cop<0, 1, "stc2l">;
//===----------------------------------------------------------------------===//
-// Move between coprocessor and ARM core register -- for disassembly only
+// Move between coprocessor and ARM core register.
//
class MovRCopro<string opc, bit direction, dag oops, dag iops,
@@ -3660,8 +4453,8 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
imm:$CRm, imm:$opc2)]>;
def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
(outs GPR:$Rt),
- (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
- i32imm:$opc2), []>;
+ (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+ imm0_7:$opc2), []>;
def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
(MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
@@ -3697,15 +4490,14 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
imm:$CRm, imm:$opc2)]>;
def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
(outs GPR:$Rt),
- (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm,
- i32imm:$opc2), []>;
+ (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
+ imm0_7:$opc2), []>;
def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
imm:$CRm, imm:$opc2),
(MRC2 imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
-class MovRRCopro<string opc, bit direction,
- list<dag> pattern = [/* For disassembly only */]>
+class MovRRCopro<string opc, bit direction, list<dag> pattern = []>
: ABI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
NoItinerary, opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
@@ -3730,8 +4522,7 @@ def MCRR : MovRRCopro<"mcrr", 0 /* from ARM core register to coprocessor */,
imm:$CRm)]>;
def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
-class MovRRCopro2<string opc, bit direction,
- list<dag> pattern = [/* For disassembly only */]>
+class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
: ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
GPR:$Rt, GPR:$Rt2, c_imm:$CRm), NoItinerary,
!strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
@@ -3758,20 +4549,22 @@ def MCRR2 : MovRRCopro2<"mcrr2", 0 /* from ARM core register to coprocessor */,
def MRRC2 : MovRRCopro2<"mrrc2", 1 /* from coprocessor to ARM core register */>;
//===----------------------------------------------------------------------===//
-// Move between special register and ARM core register -- for disassembly only
+// Move between special register and ARM core register
//
// Move to ARM core register from Special Register
-def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr",
- [/* For disassembly only; pattern left blank */]> {
+def MRS : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,
+ "mrs", "\t$Rd, apsr", []> {
bits<4> Rd;
let Inst{23-16} = 0b00001111;
let Inst{15-12} = Rd;
let Inst{7-4} = 0b0000;
}
-def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,"mrs","\t$Rd, spsr",
- [/* For disassembly only; pattern left blank */]> {
+def : InstAlias<"mrs${p} $Rd, cpsr", (MRS GPR:$Rd, pred:$p)>, Requires<[IsARM]>;
+
+def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,
+ "mrs", "\t$Rd, spsr", []> {
bits<4> Rd;
let Inst{23-16} = 0b01001111;
let Inst{15-12} = Rd;
@@ -3785,8 +4578,7 @@ def MRSsys : ABI<0b0001, (outs GPR:$Rd), (ins), NoItinerary,"mrs","\t$Rd, spsr",
// operand contains the special register (R Bit) in bit 4 and bits 3-0 contain
// the mask with the fields to be accessed in the special register.
def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
- "msr", "\t$mask, $Rn",
- [/* For disassembly only; pattern left blank */]> {
+ "msr", "\t$mask, $Rn", []> {
bits<5> mask;
bits<4> Rn;
@@ -3800,8 +4592,7 @@ def MSR : ABI<0b0001, (outs), (ins msr_mask:$mask, GPR:$Rn), NoItinerary,
}
def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask, so_imm:$a), NoItinerary,
- "msr", "\t$mask, $a",
- [/* For disassembly only; pattern left blank */]> {
+ "msr", "\t$mask, $a", []> {
bits<5> mask;
bits<12> a;
@@ -4030,6 +4821,47 @@ def : ARMV5TEPat<(add GPR:$acc,
def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
Requires<[IsARM, HasV6]>;
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : ARMV6Pat<(and GPR:$Src, 0x000000FF), (UXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x0000FFFF), (UXTH GPR:$Src, 0)>;
+def : ARMV6Pat<(and GPR:$Src, 0x00FF00FF), (UXTB16 GPR:$Src, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0x00FF)),
+ (UXTAB GPR:$Rn, GPR:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (and GPR:$Rm, 0xFFFF)),
+ (UXTAH GPR:$Rn, GPR:$Rm, 0)>;
+}
+
+def : ARMV6Pat<(sext_inreg GPR:$Src, i8), (SXTB GPR:$Src, 0)>;
+def : ARMV6Pat<(sext_inreg GPR:$Src, i16), (SXTH GPR:$Src, 0)>;
+
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i8)),
+ (SXTAB GPR:$Rn, GPRnopc:$Rm, 0)>;
+def : ARMV6Pat<(add GPR:$Rn, (sext_inreg GPRnopc:$Rm, i16)),
+ (SXTAH GPR:$Rn, GPRnopc:$Rm, 0)>;
+
+// Atomic load/store patterns
+def : ARMPat<(atomic_load_8 ldst_so_reg:$src),
+ (LDRBrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_8 addrmode_imm12:$src),
+ (LDRBi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_load_16 addrmode3:$src),
+ (LDRH addrmode3:$src)>;
+def : ARMPat<(atomic_load_32 ldst_so_reg:$src),
+ (LDRrs ldst_so_reg:$src)>;
+def : ARMPat<(atomic_load_32 addrmode_imm12:$src),
+ (LDRi12 addrmode_imm12:$src)>;
+def : ARMPat<(atomic_store_8 ldst_so_reg:$ptr, GPR:$val),
+ (STRBrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_8 addrmode_imm12:$ptr, GPR:$val),
+ (STRBi12 GPR:$val, addrmode_imm12:$ptr)>;
+def : ARMPat<(atomic_store_16 addrmode3:$ptr, GPR:$val),
+ (STRH GPR:$val, addrmode3:$ptr)>;
+def : ARMPat<(atomic_store_32 ldst_so_reg:$ptr, GPR:$val),
+ (STRrs GPR:$val, ldst_so_reg:$ptr)>;
+def : ARMPat<(atomic_store_32 addrmode_imm12:$ptr, GPR:$val),
+ (STRi12 GPR:$val, addrmode_imm12:$ptr)>;
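+// Note: these patterns map unordered/monotonic atomic loads and stores onto
+// the plain load/store instructions; aligned ARM accesses up to 32 bits are
+// single-copy atomic, and any required ordering comes from separate barriers.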
+
//===----------------------------------------------------------------------===//
// Thumb Support
@@ -4070,7 +4902,103 @@ def : MnemonicAlias<"swi", "svc">;
// Load / Store Multiple
def : MnemonicAlias<"ldmfd", "ldm">;
def : MnemonicAlias<"ldmia", "ldm">;
+def : MnemonicAlias<"ldmea", "ldmdb">;
def : MnemonicAlias<"stmfd", "stmdb">;
def : MnemonicAlias<"stmia", "stm">;
def : MnemonicAlias<"stmea", "stm">;
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+ Requires<[IsARM, HasV6]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+ (PKHBT GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, 0, pred:$p)>,
+ Requires<[IsARM, HasV6]>;
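+// Illustrative: with the aliases above, "pkhtb r0, r1, r2" (no shift)
+// assembles to the same instruction as "pkhbt r0, r1, r2".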
+
+// PUSH/POP aliases for STM/LDM
+def : ARMInstAlias<"push${p} $regs", (STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : ARMInstAlias<"pop${p} $regs", (LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// SSAT/USAT optional shift operand.
+def : ARMInstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+ (SSAT GPRnopc:$Rd, imm1_32:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+def : ARMInstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+ (USAT GPRnopc:$Rd, imm0_31:$sat_imm, GPRnopc:$Rn, 0, pred:$p)>;
+
+
+// Extend instruction optional rotate operand.
+def : ARMInstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+ (SXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+ (SXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+ (SXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb${p} $Rd, $Rm",
+ (SXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxtb16${p} $Rd, $Rm",
+ (SXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"sxth${p} $Rd, $Rm",
+ (SXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+def : ARMInstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+ (UXTAB GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+ (UXTAH GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+ (UXTAB16 GPRnopc:$Rd, GPR:$Rn, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb${p} $Rd, $Rm",
+ (UXTB GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxtb16${p} $Rd, $Rm",
+ (UXTB16 GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+def : ARMInstAlias<"uxth${p} $Rd, $Rm",
+ (UXTH GPRnopc:$Rd, GPRnopc:$Rm, 0, pred:$p)>;
+
+
+// RFE aliases
+def : MnemonicAlias<"rfefa", "rfeda">;
+def : MnemonicAlias<"rfeea", "rfedb">;
+def : MnemonicAlias<"rfefd", "rfeia">;
+def : MnemonicAlias<"rfeed", "rfeib">;
+def : MnemonicAlias<"rfe", "rfeia">;
+
+// SRS aliases
+def : MnemonicAlias<"srsfa", "srsda">;
+def : MnemonicAlias<"srsea", "srsdb">;
+def : MnemonicAlias<"srsfd", "srsia">;
+def : MnemonicAlias<"srsed", "srsib">;
+def : MnemonicAlias<"srs", "srsia">;
+
+// QSAX == QSUBADDX
+def : MnemonicAlias<"qsubaddx", "qsax">;
+// SASX == SADDSUBX
+def : MnemonicAlias<"saddsubx", "sasx">;
+// SHASX == SHADDSUBX
+def : MnemonicAlias<"shaddsubx", "shasx">;
+// SHSAX == SHSUBADDX
+def : MnemonicAlias<"shsubaddx", "shsax">;
+// SSAX == SSUBADDX
+def : MnemonicAlias<"ssubaddx", "ssax">;
+// UASX == UADDSUBX
+def : MnemonicAlias<"uaddsubx", "uasx">;
+// UHASX == UHADDSUBX
+def : MnemonicAlias<"uhaddsubx", "uhasx">;
+// UHSAX == UHSUBADDX
+def : MnemonicAlias<"uhsubaddx", "uhsax">;
+// UQASX == UQADDSUBX
+def : MnemonicAlias<"uqaddsubx", "uqasx">;
+// UQSAX == UQSUBADDX
+def : MnemonicAlias<"uqsubaddx", "uqsax">;
+// USAX == USUBADDX
+def : MnemonicAlias<"usubaddx", "usax">;
+
+// LDRSBT/LDRHT/LDRSHT post-index offset is optional.
+// Note that the write-back output register is a dummy operand for MC (it's
+// only meaningful for codegen), so we just pass zero here.
+// FIXME: tblgen not cooperating with argument conversions.
+//def : InstAlias<"ldrsbt${p} $Rt, $addr",
+// (LDRSBTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0, pred:$p)>;
+//def : InstAlias<"ldrht${p} $Rt, $addr",
+// (LDRHTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0, pred:$p)>;
+//def : InstAlias<"ldrsht${p} $Rt, $addr",
+// (LDRSHTi GPR:$Rt, GPR:$Rt, addr_offset_none:$addr, 0, pred:$p)>;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 0df62f4..7aad186 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -11,6 +11,35 @@
//
//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NEON-specific Operands.
+//===----------------------------------------------------------------------===//
+def VectorIndex8Operand : AsmOperandClass { let Name = "VectorIndex8"; }
+def VectorIndex16Operand : AsmOperandClass { let Name = "VectorIndex16"; }
+def VectorIndex32Operand : AsmOperandClass { let Name = "VectorIndex32"; }
+def VectorIndex8 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 8;
+}]> {
+ let ParserMatchClass = VectorIndex8Operand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex16 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 4;
+}]> {
+ let ParserMatchClass = VectorIndex16Operand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i32imm);
+}
+def VectorIndex32 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint64_t)Imm) < 2;
+}]> {
+ let ParserMatchClass = VectorIndex32Operand;
+ let PrintMethod = "printVectorIndex";
+ let MIOperandInfo = (ops i32imm);
+}
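+
+// The index bounds above reflect how many lanes fit in a 64-bit D register:
+// eight i8 lanes, four i16 lanes, or two i32 lanes.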
+
//===----------------------------------------------------------------------===//
// NEON-specific DAG Nodes.
//===----------------------------------------------------------------------===//
@@ -175,7 +204,8 @@ class VLDQQWBPseudo<InstrItinClass itin>
(ins addrmode6:$addr, am6offset:$offset), itin,
"$addr.addr = $wb">;
class VLDQQQQPseudo<InstrItinClass itin>
- : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,"">;
+ : PseudoNLdSt<(outs QQQQPR:$dst), (ins addrmode6:$addr, QQQQPR:$src),itin,
+ "$src = $dst">;
class VLDQQQQWBPseudo<InstrItinClass itin>
: PseudoNLdSt<(outs QQQQPR:$dst, GPR:$wb),
(ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), itin,
@@ -190,6 +220,7 @@ class VLD1D<bits<4> op7_4, string Dt>
"vld1", Dt, "\\{$Vd\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
class VLD1Q<bits<4> op7_4, string Dt>
: NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2),
@@ -197,6 +228,7 @@ class VLD1Q<bits<4> op7_4, string Dt>
"vld1", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD1d8 : VLD1D<{0,0,0,?}, "8">;
@@ -221,6 +253,7 @@ class VLD1DWB<bits<4> op7_4, string Dt>
"vld1", Dt, "\\{$Vd\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
class VLD1QWB<bits<4> op7_4, string Dt>
: NLdSt<0,0b10,0b1010,op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
@@ -228,6 +261,7 @@ class VLD1QWB<bits<4> op7_4, string Dt>
"vld1", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD1d8_UPD : VLD1DWB<{0,0,0,?}, "8">;
@@ -252,12 +286,14 @@ class VLD1D3<bits<4> op7_4, string Dt>
"\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
class VLD1D3WB<bits<4> op7_4, string Dt>
: NLdSt<0,0b10,0b0110,op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
(ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD1x3u, "vld1", Dt,
"\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD1d8T : VLD1D3<{0,0,0,?}, "8">;
@@ -280,6 +316,7 @@ class VLD1D4<bits<4> op7_4, string Dt>
"\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
class VLD1D4WB<bits<4> op7_4, string Dt>
: NLdSt<0,0b10,0b0010,op7_4,
@@ -288,6 +325,7 @@ class VLD1D4WB<bits<4> op7_4, string Dt>
"\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", "$Rn.addr = $wb",
[]> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD1d8Q : VLD1D4<{0,0,?,?}, "8">;
@@ -310,6 +348,7 @@ class VLD2D<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld2", Dt, "\\{$Vd, $dst2\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
class VLD2Q<bits<4> op7_4, string Dt>
: NLdSt<0, 0b10, 0b0011, op7_4,
@@ -318,6 +357,7 @@ class VLD2Q<bits<4> op7_4, string Dt>
"vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD2d8 : VLD2D<0b1000, {0,0,?,?}, "8">;
@@ -343,6 +383,7 @@ class VLD2DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld2", Dt, "\\{$Vd, $dst2\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
class VLD2QWB<bits<4> op7_4, string Dt>
: NLdSt<0, 0b10, 0b0011, op7_4,
@@ -351,6 +392,7 @@ class VLD2QWB<bits<4> op7_4, string Dt>
"vld2", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD2d8_UPD : VLD2DWB<0b1000, {0,0,?,?}, "8">;
@@ -384,6 +426,7 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">;
@@ -402,6 +445,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">;
@@ -441,6 +485,7 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">;
@@ -459,6 +504,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVLDInstruction";
}
def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">;
@@ -530,6 +576,7 @@ class VLD1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
(i32 (LoadOp addrmode6:$Rn)),
imm:$lane))]> {
let Rm = 0b1111;
+ let DecoderMethod = "DecodeVLD1LN";
}
class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
PatFrag LoadOp>
@@ -541,6 +588,7 @@ class VLD1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
(i32 (LoadOp addrmode6oneL32:$Rn)),
imm:$lane))]> {
let Rm = 0b1111;
+ let DecoderMethod = "DecodeVLD1LN";
}
class VLD1QLNPseudo<ValueType Ty, PatFrag LoadOp> : VLDQLNPseudo<IIC_VLD1ln> {
let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src),
@@ -580,7 +628,9 @@ class VLD1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm,
DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt,
"\\{$Vd[$lane]\\}, $Rn$Rm",
- "$src = $Vd, $Rn.addr = $wb", []>;
+ "$src = $Vd, $Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVLD1LN";
+}
def VLD1LNd8_UPD : VLD1LNWB<0b0000, {?,?,?,0}, "8"> {
let Inst{7-5} = lane{2-0};
@@ -607,6 +657,7 @@ class VLD2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
"$src1 = $Vd, $src2 = $dst2", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD2LN";
}
def VLD2LNd8 : VLD2LN<0b0001, {?,?,?,?}, "8"> {
@@ -642,6 +693,7 @@ class VLD2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"\\{$Vd[$lane], $dst2[$lane]\\}, $Rn$Rm",
"$src1 = $Vd, $src2 = $dst2, $Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD2LN";
}
def VLD2LNd8_UPD : VLD2LNWB<0b0001, {?,?,?,?}, "8"> {
@@ -676,6 +728,7 @@ class VLD3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn",
"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> {
let Rm = 0b1111;
+ let DecoderMethod = "DecodeVLD3LN";
}
def VLD3LNd8 : VLD3LN<0b0010, {?,?,?,0}, "8"> {
@@ -712,7 +765,9 @@ class VLD3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
IIC_VLD3lnu, "vld3", Dt,
"\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn$Rm",
"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb",
- []>;
+ []> {
+ let DecoderMethod = "DecodeVLD3LN";
+}
def VLD3LNd8_UPD : VLD3LNWB<0b0010, {?,?,?,0}, "8"> {
let Inst{7-5} = lane{2-0};
@@ -748,6 +803,7 @@ class VLD4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD4LN";
}
def VLD4LNd8 : VLD4LN<0b0011, {?,?,?,?}, "8"> {
@@ -788,6 +844,7 @@ class VLD4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4, $Rn.addr = $wb",
[]> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD4LN" ;
}
def VLD4LNd8_UPD : VLD4LNWB<0b0011, {?,?,?,?}, "8"> {
@@ -825,6 +882,7 @@ class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp>
[(set DPR:$Vd, (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD1DupInstruction";
}
class VLD1QDUPPseudo<ValueType Ty, PatFrag LoadOp> : VLDQPseudo<IIC_VLD1dup> {
let Pattern = [(set QPR:$dst,
@@ -852,6 +910,7 @@ class VLD1QDUP<bits<4> op7_4, string Dt>
"vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD1DupInstruction";
}
def VLD1DUPq8 : VLD1QDUP<{0,0,1,0}, "8">;
@@ -864,12 +923,14 @@ class VLD1DUPWB<bits<4> op7_4, string Dt>
(ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu,
"vld1", Dt, "\\{$Vd[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD1DupInstruction";
}
class VLD1QDUPWB<bits<4> op7_4, string Dt>
: NLdSt<1, 0b10, 0b1100, op7_4, (outs DPR:$Vd, DPR:$dst2, GPR:$wb),
(ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD1dupu,
"vld1", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD1DupInstruction";
}
def VLD1DUPd8_UPD : VLD1DUPWB<{0,0,0,0}, "8">;
@@ -891,6 +952,7 @@ class VLD2DUP<bits<4> op7_4, string Dt>
"vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD2DupInstruction";
}
def VLD2DUPd8 : VLD2DUP<{0,0,0,?}, "8">;
@@ -912,6 +974,7 @@ class VLD2DUPWB<bits<4> op7_4, string Dt>
(ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD2dupu,
"vld2", Dt, "\\{$Vd[], $dst2[]\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD2DupInstruction";
}
def VLD2DUPd8_UPD : VLD2DUPWB<{0,0,0,0}, "8">;
@@ -932,7 +995,8 @@ class VLD3DUP<bits<4> op7_4, string Dt>
(ins addrmode6dup:$Rn), IIC_VLD3dup,
"vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> {
let Rm = 0b1111;
- let Inst{4} = Rn{4};
+ let Inst{4} = 0;
+ let DecoderMethod = "DecodeVLD3DupInstruction";
}
def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">;
@@ -954,7 +1018,8 @@ class VLD3DUPWB<bits<4> op7_4, string Dt>
(ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu,
"vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
- let Inst{4} = Rn{4};
+ let Inst{4} = 0;
+ let DecoderMethod = "DecodeVLD3DupInstruction";
}
def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8">;
@@ -977,6 +1042,7 @@ class VLD4DUP<bits<4> op7_4, string Dt>
"vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD4DupInstruction";
}
def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">;
@@ -1000,6 +1066,7 @@ class VLD4DUPWB<bits<4> op7_4, string Dt>
"vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVLD4DupInstruction";
}
def VLD4DUPd8_UPD : VLD4DUPWB<{0,0,0,0}, "8">;
@@ -1045,6 +1112,7 @@ class VST1D<bits<4> op7_4, string Dt>
IIC_VST1, "vst1", Dt, "\\{$Vd\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
class VST1Q<bits<4> op7_4, string Dt>
: NLdSt<0,0b00,0b1010,op7_4, (outs),
@@ -1052,6 +1120,7 @@ class VST1Q<bits<4> op7_4, string Dt>
"vst1", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST1d8 : VST1D<{0,0,0,?}, "8">;
@@ -1075,6 +1144,7 @@ class VST1DWB<bits<4> op7_4, string Dt>
(ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd), IIC_VST1u,
"vst1", Dt, "\\{$Vd\\}, $Rn$Rm", "$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
class VST1QWB<bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, 0b1010, op7_4, (outs GPR:$wb),
@@ -1082,6 +1152,7 @@ class VST1QWB<bits<4> op7_4, string Dt>
IIC_VST1x2u, "vst1", Dt, "\\{$Vd, $src2\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST1d8_UPD : VST1DWB<{0,0,0,?}, "8">;
@@ -1106,6 +1177,7 @@ class VST1D3<bits<4> op7_4, string Dt>
IIC_VST1x3, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
class VST1D3WB<bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, 0b0110, op7_4, (outs GPR:$wb),
@@ -1114,6 +1186,7 @@ class VST1D3WB<bits<4> op7_4, string Dt>
IIC_VST1x3u, "vst1", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST1d8T : VST1D3<{0,0,0,?}, "8">;
@@ -1137,6 +1210,7 @@ class VST1D4<bits<4> op7_4, string Dt>
[]> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
class VST1D4WB<bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, 0b0010, op7_4, (outs GPR:$wb),
@@ -1145,6 +1219,7 @@ class VST1D4WB<bits<4> op7_4, string Dt>
"vst1", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST1d8Q : VST1D4<{0,0,?,?}, "8">;
@@ -1167,6 +1242,7 @@ class VST2D<bits<4> op11_8, bits<4> op7_4, string Dt>
IIC_VST2, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
class VST2Q<bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, 0b0011, op7_4, (outs),
@@ -1175,6 +1251,7 @@ class VST2Q<bits<4> op7_4, string Dt>
"", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST2d8 : VST2D<0b1000, {0,0,?,?}, "8">;
@@ -1200,6 +1277,7 @@ class VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
IIC_VST2u, "vst2", Dt, "\\{$Vd, $src2\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
class VST2QWB<bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
@@ -1208,6 +1286,7 @@ class VST2QWB<bits<4> op7_4, string Dt>
"vst2", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST2d8_UPD : VST2DWB<0b1000, {0,0,?,?}, "8">;
@@ -1241,6 +1320,7 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
"vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">;
@@ -1259,6 +1339,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">;
@@ -1298,6 +1379,7 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
"", []> {
let Rm = 0b1111;
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">;
@@ -1316,6 +1398,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{5-4} = Rn{5-4};
+ let DecoderMethod = "DecodeVSTInstruction";
}
def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">;
@@ -1381,6 +1464,7 @@ class VST1LN<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
[(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6:$Rn)]> {
let Rm = 0b1111;
+ let DecoderMethod = "DecodeVST1LN";
}
class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
PatFrag StoreOp, SDNode ExtractOp>
@@ -1389,6 +1473,7 @@ class VST1LN32<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "",
[(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), addrmode6oneL32:$Rn)]>{
let Rm = 0b1111;
+ let DecoderMethod = "DecodeVST1LN";
}
class VST1QLNPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
: VSTQLNPseudo<IIC_VST1ln> {
@@ -1429,7 +1514,9 @@ class VST1LNWB<bits<4> op11_8, bits<4> op7_4, string Dt, ValueType Ty,
"\\{$Vd[$lane]\\}, $Rn$Rm",
"$Rn.addr = $wb",
[(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane),
- addrmode6:$Rn, am6offset:$Rm))]>;
+ addrmode6:$Rn, am6offset:$Rm))]> {
+ let DecoderMethod = "DecodeVST1LN";
+}
class VST1QLNWBPseudo<ValueType Ty, PatFrag StoreOp, SDNode ExtractOp>
: VSTQLNWBPseudo<IIC_VST1lnu> {
let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane),
@@ -1465,6 +1552,7 @@ class VST2LN<bits<4> op11_8, bits<4> op7_4, string Dt>
"", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVST2LN";
}
def VST2LNd8 : VST2LN<0b0001, {?,?,?,?}, "8"> {
@@ -1502,6 +1590,7 @@ class VST2LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"\\{$src1[$lane], $src2[$lane]\\}, $addr$offset",
"$addr.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVST2LN";
}
def VST2LNd8_UPD : VST2LNWB<0b0001, {?,?,?,?}, "8"> {
@@ -1535,6 +1624,7 @@ class VST3LN<bits<4> op11_8, bits<4> op7_4, string Dt>
nohash_imm:$lane), IIC_VST3ln, "vst3", Dt,
"\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> {
let Rm = 0b1111;
+ let DecoderMethod = "DecodeVST3LN";
}
def VST3LNd8 : VST3LN<0b0010, {?,?,?,0}, "8"> {
@@ -1569,7 +1659,9 @@ class VST3LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane),
IIC_VST3lnu, "vst3", Dt,
"\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn$Rm",
- "$Rn.addr = $wb", []>;
+ "$Rn.addr = $wb", []> {
+ let DecoderMethod = "DecodeVST3LN";
+}
def VST3LNd8_UPD : VST3LNWB<0b0010, {?,?,?,0}, "8"> {
let Inst{7-5} = lane{2-0};
@@ -1604,6 +1696,7 @@ class VST4LN<bits<4> op11_8, bits<4> op7_4, string Dt>
"", []> {
let Rm = 0b1111;
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVST4LN";
}
def VST4LNd8 : VST4LN<0b0011, {?,?,?,?}, "8"> {
@@ -1642,6 +1735,7 @@ class VST4LNWB<bits<4> op11_8, bits<4> op7_4, string Dt>
"\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn$Rm",
"$Rn.addr = $wb", []> {
let Inst{4} = Rn{4};
+ let DecoderMethod = "DecodeVST4LN";
}
def VST4LNd8_UPD : VST4LNWB<0b0011, {?,?,?,?}, "8"> {
@@ -4039,6 +4133,7 @@ class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
: N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
ResTy, OpTy, OpNode> {
let Inst{21-16} = op21_16;
+ let DecoderMethod = "DecodeVSHLMaxInstruction";
}
def VSHLLi8 : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
v8i16, v8i8, NEONvshlli>;
@@ -4219,16 +4314,6 @@ def : InstAlias<"vmov${p} $Vd, $Vm",
def : InstAlias<"vmov${p} $Vd, $Vm",
(VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
-let neverHasSideEffects = 1 in {
-// Pseudo vector move instructions for QQ and QQQQ registers. This should
-// be expanded after register allocation is completed.
-def VMOVQQ : PseudoInst<(outs QQPR:$dst), (ins QQPR:$src),
- NoItinerary, []>;
-
-def VMOVQQQQ : PseudoInst<(outs QQQQPR:$dst), (ins QQQQPR:$src),
- NoItinerary, []>;
-} // neverHasSideEffects
-
// VMOV : Vector Move (Immediate)
let isReMaterializable = 1 in {
@@ -4462,36 +4547,42 @@ def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
// VDUP : Vector Duplicate Lane (from scalar to all elements)
class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
- ValueType Ty>
- : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane),
- IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm[$lane]",
+ ValueType Ty, Operand IdxTy>
+ : NVDupLane<op19_16, 0, (outs DPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+ IIC_VMOVD, OpcodeStr, Dt, "$Vd, $Vm$lane",
[(set DPR:$Vd, (Ty (NEONvduplane (Ty DPR:$Vm), imm:$lane)))]>;
class VDUPLNQ<bits<4> op19_16, string OpcodeStr, string Dt,
- ValueType ResTy, ValueType OpTy>
- : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, nohash_imm:$lane),
- IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm[$lane]",
+ ValueType ResTy, ValueType OpTy, Operand IdxTy>
+ : NVDupLane<op19_16, 1, (outs QPR:$Vd), (ins DPR:$Vm, IdxTy:$lane),
+ IIC_VMOVQ, OpcodeStr, Dt, "$Vd, $Vm$lane",
[(set QPR:$Vd, (ResTy (NEONvduplane (OpTy DPR:$Vm),
- imm:$lane)))]>;
+ VectorIndex32:$lane)))]>;
// Inst{19-16} is partially specified depending on the element size.
-def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8> {
+def VDUPLN8d : VDUPLND<{?,?,?,1}, "vdup", "8", v8i8, VectorIndex8> {
+ bits<3> lane;
let Inst{19-17} = lane{2-0};
}
-def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16> {
+def VDUPLN16d : VDUPLND<{?,?,1,0}, "vdup", "16", v4i16, VectorIndex16> {
+ bits<2> lane;
let Inst{19-18} = lane{1-0};
}
-def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32> {
+def VDUPLN32d : VDUPLND<{?,1,0,0}, "vdup", "32", v2i32, VectorIndex32> {
+ bits<1> lane;
let Inst{19} = lane{0};
}
-def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8> {
+def VDUPLN8q : VDUPLNQ<{?,?,?,1}, "vdup", "8", v16i8, v8i8, VectorIndex8> {
+ bits<3> lane;
let Inst{19-17} = lane{2-0};
}
-def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16> {
+def VDUPLN16q : VDUPLNQ<{?,?,1,0}, "vdup", "16", v8i16, v4i16, VectorIndex16> {
+ bits<2> lane;
let Inst{19-18} = lane{1-0};
}
-def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32> {
+def VDUPLN32q : VDUPLNQ<{?,1,0,0}, "vdup", "32", v4i32, v2i32, VectorIndex32> {
+ bits<1> lane;
let Inst{19} = lane{0};
}
@@ -4753,6 +4844,7 @@ def VZIPq32 : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
// Vector Table Lookup and Table Extension.
// VTBL : Vector Table Lookup
+let DecoderMethod = "DecodeTBLInstruction" in {
def VTBL1
: N3V<1,1,0b11,0b1000,0,0, (outs DPR:$Vd),
(ins DPR:$Vn, DPR:$Vm), NVTBLFrm, IIC_VTB1,
@@ -4815,6 +4907,7 @@ def VTBX3Pseudo
def VTBX4Pseudo
: PseudoNeonI<(outs DPR:$dst), (ins DPR:$orig, QQPR:$tbl, DPR:$src),
IIC_VTBX4, "$orig = $dst", []>;
+} // DecoderMethod = "DecodeTBLInstruction"
//===----------------------------------------------------------------------===//
// NEON instructions for single-precision FP math
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index bfe83ec..cedb547 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -19,6 +19,19 @@ def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
+def imm_sr_XFORM: SDNodeXForm<imm, [{
+ unsigned Imm = N->getZExtValue();
+ return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), MVT::i32);
+}]>;
+def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def imm_sr : Operand<i32>, PatLeaf<(imm), [{
+ uint64_t Imm = N->getZExtValue();
+ return Imm > 0 && Imm <= 32;
+}], imm_sr_XFORM> {
+ let PrintMethod = "printThumbSRImm";
+ let ParserMatchClass = ThumbSRImmAsmOperand;
+}
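+// Illustrative: "asr r0, r1, #32" is valid Thumb syntax; the XFORM above
+// folds the 32 into the zero immediate field used by the encoding.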
+
def imm_neg_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
}]>;
@@ -30,10 +43,6 @@ def imm0_7_neg : PatLeaf<(i32 imm), [{
return (uint32_t)-N->getZExtValue() < 8;
}], imm_neg_XFORM>;
-def imm0_255_asmoperand : AsmOperandClass { let Name = "Imm0_255"; }
-def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
- let ParserMatchClass = imm0_255_asmoperand;
-}
def imm0_255_comp : PatLeaf<(i32 imm), [{
return ~((uint32_t)N->getZExtValue()) < 256;
}]>;
@@ -69,8 +78,17 @@ def t_adrlabel : Operand<i32> {
}
// Scaled 4 immediate.
-def t_imm_s4 : Operand<i32> {
+def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
+def t_imm0_1020s4 : Operand<i32> {
+ let PrintMethod = "printThumbS4ImmOperand";
+ let ParserMatchClass = t_imm0_1020s4_asmoperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
+
+def t_imm0_508s4_asmoperand: AsmOperandClass { let Name = "Imm0_508s4"; }
+def t_imm0_508s4 : Operand<i32> {
let PrintMethod = "printThumbS4ImmOperand";
+ let ParserMatchClass = t_imm0_508s4_asmoperand;
let OperandType = "OPERAND_IMMEDIATE";
}
@@ -79,113 +97,129 @@ def t_imm_s4 : Operand<i32> {
let OperandType = "OPERAND_PCREL" in {
def t_brtarget : Operand<OtherVT> {
let EncoderMethod = "getThumbBRTargetOpValue";
+ let DecoderMethod = "DecodeThumbBROperand";
}
def t_bcctarget : Operand<i32> {
let EncoderMethod = "getThumbBCCTargetOpValue";
+ let DecoderMethod = "DecodeThumbBCCTargetOperand";
}
def t_cbtarget : Operand<i32> {
let EncoderMethod = "getThumbCBTargetOpValue";
+ let DecoderMethod = "DecodeThumbCmpBROperand";
}
def t_bltarget : Operand<i32> {
let EncoderMethod = "getThumbBLTargetOpValue";
+ let DecoderMethod = "DecodeThumbBLTargetOperand";
}
def t_blxtarget : Operand<i32> {
let EncoderMethod = "getThumbBLXTargetOpValue";
+ let DecoderMethod = "DecodeThumbBLXOffset";
}
}
-def MemModeRegThumbAsmOperand : AsmOperandClass {
- let Name = "MemModeRegThumb";
- let SuperClasses = [];
-}
-
-def MemModeImmThumbAsmOperand : AsmOperandClass {
- let Name = "MemModeImmThumb";
- let SuperClasses = [];
-}
-
// t_addrmode_rr := reg + reg
//
+def t_addrmode_rr_asm_operand : AsmOperandClass { let Name = "MemThumbRR"; }
def t_addrmode_rr : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
let PrintMethod = "printThumbAddrModeRROperand";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
}
// t_addrmode_rrs := reg + reg
//
+// We use separate scaled versions because the Select* functions need
+// to explicitly check for a matching constant and return false here so that
+// the reg+imm forms will match instead. This is a horrible way to do that,
+// as it forces tight coupling between the methods, but it's how selectiondag
+// currently works.
def t_addrmode_rrs1 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S1", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
let PrintMethod = "printThumbAddrModeRROperand";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
- let ParserMatchClass = MemModeRegThumbAsmOperand;
}
def t_addrmode_rrs2 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S2", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
let PrintMethod = "printThumbAddrModeRROperand";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
- let ParserMatchClass = MemModeRegThumbAsmOperand;
}
def t_addrmode_rrs4 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeRI5S4", []> {
let EncoderMethod = "getThumbAddrModeRegRegOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeRR";
let PrintMethod = "printThumbAddrModeRROperand";
+ let ParserMatchClass = t_addrmode_rr_asm_operand;
let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
- let ParserMatchClass = MemModeRegThumbAsmOperand;
}
// t_addrmode_is4 := reg + imm5 * 4
//
+def t_addrmode_is4_asm_operand : AsmOperandClass { let Name = "MemThumbRIs4"; }
def t_addrmode_is4 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S4", []> {
let EncoderMethod = "getAddrModeISOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeIS";
let PrintMethod = "printThumbAddrModeImm5S4Operand";
+ let ParserMatchClass = t_addrmode_is4_asm_operand;
let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
- let ParserMatchClass = MemModeImmThumbAsmOperand;
}
// t_addrmode_is2 := reg + imm5 * 2
//
+def t_addrmode_is2_asm_operand : AsmOperandClass { let Name = "MemThumbRIs2"; }
def t_addrmode_is2 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S2", []> {
let EncoderMethod = "getAddrModeISOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeIS";
let PrintMethod = "printThumbAddrModeImm5S2Operand";
+ let ParserMatchClass = t_addrmode_is2_asm_operand;
let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
- let ParserMatchClass = MemModeImmThumbAsmOperand;
}
// t_addrmode_is1 := reg + imm5
//
+def t_addrmode_is1_asm_operand : AsmOperandClass { let Name = "MemThumbRIs1"; }
def t_addrmode_is1 : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeImm5S1", []> {
let EncoderMethod = "getAddrModeISOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeIS";
let PrintMethod = "printThumbAddrModeImm5S1Operand";
+ let ParserMatchClass = t_addrmode_is1_asm_operand;
let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
- let ParserMatchClass = MemModeImmThumbAsmOperand;
}
// t_addrmode_sp := sp + imm8 * 4
//
+// FIXME: This really shouldn't have an explicit SP operand at all. It should
+// be implicit, just like in the instruction encoding itself.
+def t_addrmode_sp_asm_operand : AsmOperandClass { let Name = "MemThumbSPI"; }
def t_addrmode_sp : Operand<i32>,
ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
let EncoderMethod = "getAddrModeThumbSPOpValue";
+ let DecoderMethod = "DecodeThumbAddrModeSP";
let PrintMethod = "printThumbAddrModeSPOperand";
+ let ParserMatchClass = t_addrmode_sp_asm_operand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
- let ParserMatchClass = MemModeImmThumbAsmOperand;
}
// t_addrmode_pc := <label> => pc + imm8 * 4
//
def t_addrmode_pc : Operand<i32> {
let EncoderMethod = "getAddrModePCOpValue";
- let ParserMatchClass = MemModeImmThumbAsmOperand;
+ let DecoderMethod = "DecodeThumbAddrModePC";
}
//===----------------------------------------------------------------------===//
@@ -207,68 +241,52 @@ def tADJCALLSTACKDOWN :
Requires<[IsThumb, IsThumb1Only]>;
}
-// T1Disassembly - A simple class to make encoding some disassembly patterns
-// easier and less verbose.
-class T1Disassembly<bits<2> op1, bits<8> op2>
+class T1SystemEncoding<bits<8> opc>
: T1Encoding<0b101111> {
- let Inst{9-8} = op1;
- let Inst{7-0} = op2;
+ let Inst{9-8} = 0b11;
+ let Inst{7-0} = opc;
}
-def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "",
- [/* For disassembly only; pattern left blank */]>,
- T1Disassembly<0b11, 0x00>; // A8.6.110
+def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>,
+ T1SystemEncoding<0x00>, // A8.6.110
+ Requires<[IsThumb2]>;
-def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "",
- [/* For disassembly only; pattern left blank */]>,
- T1Disassembly<0b11, 0x10>; // A8.6.410
+def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>,
+ T1SystemEncoding<0x10>; // A8.6.410
-def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "",
- [/* For disassembly only; pattern left blank */]>,
- T1Disassembly<0b11, 0x20>; // A8.6.408
+def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>,
+ T1SystemEncoding<0x20>; // A8.6.408
-def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "",
- [/* For disassembly only; pattern left blank */]>,
- T1Disassembly<0b11, 0x30>; // A8.6.409
+def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>,
+ T1SystemEncoding<0x30>; // A8.6.409
-def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "",
- [/* For disassembly only; pattern left blank */]>,
- T1Disassembly<0b11, 0x40>; // A8.6.157
+def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>,
+ T1SystemEncoding<0x40>; // A8.6.157
-// The i32imm operand $val can be used by a debugger to store more information
+// The imm operand $val can be used by a debugger to store more information
// about the breakpoint.
-def tBKPT : T1I<(outs), (ins i32imm:$val), NoItinerary, "bkpt\t$val",
- [/* For disassembly only; pattern left blank */]>,
- T1Disassembly<0b10, {?,?,?,?,?,?,?,?}> {
+def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
+ []>,
+ T1Encoding<0b101111> {
+ let Inst{9-8} = 0b10;
// A8.6.22
bits<8> val;
let Inst{7-0} = val;
}
-def tSETENDBE : T1I<(outs), (ins), NoItinerary, "setend\tbe",
- [/* For disassembly only; pattern left blank */]>,
- T1Encoding<0b101101> {
- // A8.6.156
- let Inst{9-5} = 0b10010;
- let Inst{4} = 1;
- let Inst{3} = 1; // Big-Endian
- let Inst{2-0} = 0b000;
-}
-
-def tSETENDLE : T1I<(outs), (ins), NoItinerary, "setend\tle",
- [/* For disassembly only; pattern left blank */]>,
- T1Encoding<0b101101> {
+def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
+ []>, T1Encoding<0b101101> {
+ bits<1> end;
// A8.6.156
let Inst{9-5} = 0b10010;
let Inst{4} = 1;
- let Inst{3} = 0; // Little-Endian
+ let Inst{3} = end;
let Inst{2-0} = 0b000;
}
// Change Processor State is a system instruction -- for disassembly only.
def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
- NoItinerary, "cps$imod $iflags",
- [/* For disassembly only; pattern left blank */]>,
+ NoItinerary, "cps$imod $iflags", []>,
T1Misc<0b0110011> {
// A8.6.38 & B6.1.1
bit imod;
@@ -277,6 +295,7 @@ def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
let Inst{4} = imod;
let Inst{3} = 0;
let Inst{2-0} = iflags;
+ let DecoderMethod = "DecodeThumbCPS";
}
// For both thumb1 and thumb2.
@@ -290,70 +309,70 @@ def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
let Inst{2-0} = dst;
}
-// PC relative add (ADR).
-def tADDrPCi : T1I<(outs tGPR:$dst), (ins t_imm_s4:$rhs), IIC_iALUi,
- "add\t$dst, pc, $rhs", []>,
- T1Encoding<{1,0,1,0,0,?}> {
- // A6.2 & A8.6.10
- bits<3> dst;
- bits<8> rhs;
- let Inst{10-8} = dst;
- let Inst{7-0} = rhs;
-}
-
// ADD <Rd>, sp, #<imm8>
-// This is rematerializable, which is particularly useful for taking the
-// address of locals.
-let isReMaterializable = 1 in
-def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, t_imm_s4:$rhs), IIC_iALUi,
- "add\t$dst, $sp, $rhs", []>,
+// FIXME: This should not be marked as having side effects, and it should be
+// rematerializable. Clearing the side effect bit causes miscompilations,
+// probably because the instruction can be moved around.
+def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
+ IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
T1Encoding<{1,0,1,0,1,?}> {
// A6.2 & A8.6.8
bits<3> dst;
- bits<8> rhs;
+ bits<8> imm;
let Inst{10-8} = dst;
- let Inst{7-0} = rhs;
+ let Inst{7-0} = imm;
+ let DecoderMethod = "DecodeThumbAddSpecialReg";
}
// ADD sp, sp, #<imm7>
-def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
- "add\t$dst, $rhs", []>,
+def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+ IIC_iALUi, "add", "\t$Rdn, $imm", []>,
T1Misc<{0,0,0,0,0,?,?}> {
// A6.2.5 & A8.6.8
- bits<7> rhs;
- let Inst{6-0} = rhs;
+ bits<7> imm;
+ let Inst{6-0} = imm;
+ let DecoderMethod = "DecodeThumbAddSPImm";
}
// SUB sp, sp, #<imm7>
// FIXME: The encoding and the ASM string don't match up.
-def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
- "sub\t$dst, $rhs", []>,
+def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
+ IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
T1Misc<{0,0,0,0,1,?,?}> {
// A6.2.5 & A8.6.214
- bits<7> rhs;
- let Inst{6-0} = rhs;
+ bits<7> imm;
+ let Inst{6-0} = imm;
+ let DecoderMethod = "DecodeThumbAddSPImm";
}
+// Can optionally specify SP as a three operand instruction.
+def : tInstAlias<"add${p} sp, sp, $imm",
+ (tADDspi SP, t_imm0_508s4:$imm, pred:$p)>;
+def : tInstAlias<"sub${p} sp, sp, $imm",
+ (tSUBspi SP, t_imm0_508s4:$imm, pred:$p)>;
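+// E.g. "add sp, sp, #8" and the two-operand form "add sp, #8" assemble to
+// the same tADDspi encoding.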
+
// ADD <Rm>, sp
-def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
- "add\t$dst, $rhs", []>,
+def tADDrSP : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPRsp:$sp), IIC_iALUr,
+ "add", "\t$Rdn, $sp, $Rn", []>,
T1Special<{0,0,?,?}> {
// A8.6.9 Encoding T1
- bits<4> dst;
- let Inst{7} = dst{3};
+ bits<4> Rdn;
+ let Inst{7} = Rdn{3};
let Inst{6-3} = 0b1101;
- let Inst{2-0} = dst{2-0};
+ let Inst{2-0} = Rdn{2-0};
+ let DecoderMethod = "DecodeThumbAddSPReg";
}
// ADD sp, <Rm>
-def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
- "add\t$dst, $rhs", []>,
+def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
+ "add", "\t$Rdn, $Rm", []>,
T1Special<{0,0,?,?}> {
// A8.6.9 Encoding T2
- bits<4> dst;
+ bits<4> Rm;
let Inst{7} = 1;
- let Inst{6-3} = dst;
+ let Inst{6-3} = Rm;
let Inst{2-0} = 0b101;
+ let DecoderMethod = "DecodeThumbAddSPReg";
}
//===----------------------------------------------------------------------===//
@@ -390,11 +409,12 @@ let isCall = 1,
Uses = [SP] in {
// Also used for Thumb2
def tBL : TIx2<0b11110, 0b11, 1,
- (outs), (ins t_bltarget:$func, variable_ops), IIC_Br,
- "bl\t$func",
+ (outs), (ins pred:$p, t_bltarget:$func, variable_ops), IIC_Br,
+ "bl${p}\t$func",
[(ARMtcall tglobaladdr:$func)]>,
Requires<[IsThumb, IsNotDarwin]> {
- bits<21> func;
+ bits<22> func;
+ let Inst{26} = func{21};
let Inst{25-16} = func{20-11};
let Inst{13} = 1;
let Inst{11} = 1;
@@ -403,8 +423,8 @@ let isCall = 1,
// ARMv5T and above, also used for Thumb2
def tBLXi : TIx2<0b11110, 0b11, 0,
- (outs), (ins t_blxtarget:$func, variable_ops), IIC_Br,
- "blx\t$func",
+ (outs), (ins pred:$p, t_blxtarget:$func, variable_ops), IIC_Br,
+ "blx${p}\t$func",
[(ARMcall tglobaladdr:$func)]>,
Requires<[IsThumb, HasV5T, IsNotDarwin]> {
bits<21> func;
@@ -416,8 +436,8 @@ let isCall = 1,
}
// Also used for Thumb2
- def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br,
- "blx\t$func",
+ def tBLXr : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br,
+ "blx${p}\t$func",
[(ARMtcall GPR:$func)]>,
Requires<[IsThumb, HasV5T, IsNotDarwin]>,
T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24;
@@ -440,43 +460,22 @@ let isCall = 1,
Defs = [R0, R1, R2, R3, R9, R12, LR, QQQQ0, QQQQ2, QQQQ3, CPSR, FPSCR],
Uses = [R7, SP] in {
// Also used for Thumb2
- def tBLr9 : TIx2<0b11110, 0b11, 1,
- (outs), (ins pred:$p, t_bltarget:$func, variable_ops),
- IIC_Br, "bl${p}\t$func",
- [(ARMtcall tglobaladdr:$func)]>,
- Requires<[IsThumb, IsDarwin]> {
- bits<21> func;
- let Inst{25-16} = func{20-11};
- let Inst{13} = 1;
- let Inst{11} = 1;
- let Inst{10-0} = func{10-0};
- }
+ def tBLr9 : tPseudoExpand<(outs), (ins pred:$p, t_bltarget:$func, variable_ops),
+ 4, IIC_Br, [(ARMtcall tglobaladdr:$func)],
+ (tBL pred:$p, t_bltarget:$func)>,
+ Requires<[IsThumb, IsDarwin]>;
// ARMv5T and above, also used for Thumb2
- def tBLXi_r9 : TIx2<0b11110, 0b11, 0,
- (outs), (ins pred:$p, t_blxtarget:$func, variable_ops),
- IIC_Br, "blx${p}\t$func",
- [(ARMcall tglobaladdr:$func)]>,
- Requires<[IsThumb, HasV5T, IsDarwin]> {
- bits<21> func;
- let Inst{25-16} = func{20-11};
- let Inst{13} = 1;
- let Inst{11} = 1;
- let Inst{10-1} = func{10-1};
- let Inst{0} = 0; // func{0} is assumed zero
- }
+ def tBLXi_r9 : tPseudoExpand<(outs), (ins pred:$p, t_blxtarget:$func, variable_ops),
+ 4, IIC_Br, [(ARMcall tglobaladdr:$func)],
+ (tBLXi pred:$p, t_blxtarget:$func)>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
// Also used for Thumb2
- def tBLXr_r9 : TI<(outs), (ins pred:$p, GPR:$func, variable_ops), IIC_Br,
- "blx${p}\t$func",
- [(ARMtcall GPR:$func)]>,
- Requires<[IsThumb, HasV5T, IsDarwin]>,
- T1Special<{1,1,1,?}> {
- // A6.2.3 & A8.6.24
- bits<4> func;
- let Inst{6-3} = func;
- let Inst{2-0} = 0b000;
- }
+ def tBLXr_r9 : tPseudoExpand<(outs), (ins pred:$p, GPR:$func, variable_ops),
+ 2, IIC_Br, [(ARMtcall GPR:$func)],
+ (tBLXr pred:$p, GPR:$func)>,
+ Requires<[IsThumb, HasV5T, IsDarwin]>;
// ARMv4T
def tBXr9_CALL : tPseudoInst<(outs), (ins tGPR:$func, variable_ops),
@@ -487,8 +486,8 @@ let isCall = 1,
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
- def tB : T1I<(outs), (ins t_brtarget:$target), IIC_Br,
- "b\t$target", [(br bb:$target)]>,
+ def tB : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
+ "b", "\t$target", [(br bb:$target)]>,
T1Encoding<{1,1,1,0,0,?}> {
bits<11> target;
let Inst{10-0} = target;
@@ -498,8 +497,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
// Just a pseudo for a tBL instruction. Needed to let regalloc know about
// the clobber of LR.
let Defs = [LR] in
- def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target),
- 4, IIC_Br, [], (tBL t_bltarget:$target)>;
+ def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
+ 4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>;
def tBR_JTr : tPseudoInst<(outs),
(ins tGPR:$target, i32imm:$jt, i32imm:$id),
@@ -522,31 +521,6 @@ let isBranch = 1, isTerminator = 1 in
let Inst{7-0} = target;
}
-// Compare and branch on zero / non-zero
-let isBranch = 1, isTerminator = 1 in {
- def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
- "cbz\t$Rn, $target", []>,
- T1Misc<{0,0,?,1,?,?,?}> {
- // A8.6.27
- bits<6> target;
- bits<3> Rn;
- let Inst{9} = target{5};
- let Inst{7-3} = target{4-0};
- let Inst{2-0} = Rn;
- }
-
- def tCBNZ : T1I<(outs), (ins tGPR:$cmp, t_cbtarget:$target), IIC_Br,
- "cbnz\t$cmp, $target", []>,
- T1Misc<{1,0,?,1,?,?,?}> {
- // A8.6.27
- bits<6> target;
- bits<3> Rn;
- let Inst{9} = target{5};
- let Inst{7-3} = target{4-0};
- let Inst{2-0} = Rn;
- }
-}
-
// Tail calls
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// Darwin versions.
@@ -562,9 +536,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// Non-Darwin versions (the difference is R9).
let Defs = [R0, R1, R2, R3, R12, QQQQ0, QQQQ2, QQQQ3, PC],
Uses = [SP] in {
- def tTAILJMPdND : tPseudoExpand<(outs), (ins t_brtarget:$dst, variable_ops),
+ def tTAILJMPdND : tPseudoExpand<(outs),
+ (ins t_brtarget:$dst, pred:$p, variable_ops),
4, IIC_Br, [],
- (tB t_brtarget:$dst)>,
+ (tB t_brtarget:$dst, pred:$p)>,
Requires<[IsThumb, IsNotDarwin]>;
def tTAILJMPrND : tPseudoExpand<(outs), (ins tcGPR:$dst, variable_ops),
4, IIC_Br, [],
@@ -574,11 +549,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
}
-// A8.6.218 Supervisor Call (Software Interrupt) -- for disassembly only
+// A8.6.218 Supervisor Call (Software Interrupt)
// A8.6.16 B: Encoding T1
// If Inst{11-8} == 0b1111 then SEE SVC
let isCall = 1, Uses = [SP] in
-def tSVC : T1pI<(outs), (ins i32imm:$imm), IIC_Br,
+def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
"svc", "\t$imm", []>, Encoding16 {
bits<8> imm;
let Inst{15-12} = 0b1101;
@@ -653,17 +628,17 @@ defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2,
let AddedComplexity = 10 in
def tLDRSB : // A8.6.80
- T1pILdStEncode<0b011, (outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+ T1pILdStEncode<0b011, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
AddrModeT1_1, IIC_iLoad_bh_r,
- "ldrsb", "\t$dst, $addr",
- [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>;
+ "ldrsb", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (sextloadi8 t_addrmode_rr:$addr))]>;
let AddedComplexity = 10 in
def tLDRSH : // A8.6.84
- T1pILdStEncode<0b111, (outs tGPR:$dst), (ins t_addrmode_rr:$addr),
+ T1pILdStEncode<0b111, (outs tGPR:$Rt), (ins t_addrmode_rr:$addr),
AddrModeT1_2, IIC_iLoad_bh_r,
- "ldrsh", "\t$dst, $addr",
- [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>;
+ "ldrsh", "\t$Rt, $addr",
+ [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>;
let canFoldAsLoad = 1 in
def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
@@ -678,7 +653,7 @@ def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
// Load tconstpool
// FIXME: Use ldr.n to work around a Darwin assembler bug.
-let canFoldAsLoad = 1, isReMaterializable = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
"ldr", ".n\t$Rt, $addr",
[(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
@@ -736,42 +711,53 @@ def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i,
// Load / store multiple Instructions.
//
-multiclass thumb_ldst_mult<string asm, InstrItinClass itin,
- InstrItinClass itin_upd, bits<6> T1Enc,
- bit L_bit> {
- def IA :
- T1I<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
- itin, !strconcat(asm, "ia${p}\t$Rn, $regs"), []>,
- T1Encoding<T1Enc> {
- bits<3> Rn;
- bits<8> regs;
- let Inst{10-8} = Rn;
- let Inst{7-0} = regs;
- }
- def IA_UPD :
- T1It<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
- itin_upd, !strconcat(asm, "ia${p}\t$Rn!, $regs"), "$Rn = $wb", []>,
- T1Encoding<T1Enc> {
- bits<3> Rn;
- bits<8> regs;
- let Inst{10-8} = Rn;
- let Inst{7-0} = regs;
- }
-}
-
// These require base address to be written back or one of the loaded regs.
let neverHasSideEffects = 1 in {
let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
-defm tLDM : thumb_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu,
- {1,1,0,0,1,?}, 1>;
-
+def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ IIC_iLoad_m, "ldm${p}\t$Rn, $regs", []>, T1Encoding<{1,1,0,0,1,?}> {
+ bits<3> Rn;
+ bits<8> regs;
+ let Inst{10-8} = Rn;
+ let Inst{7-0} = regs;
+}
+
+// Writeback version is just a pseudo, as there's no encoding difference.
+// Writeback happens iff the base register is not in the destination register
+// list.
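+// E.g. "ldm r0, {r0, r2}" leaves the base register alone (r0 is reloaded),
+// while "ldm r0!, {r1, r2}" writes the updated address back to r0.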
+def tLDMIA_UPD :
+ InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain,
+ "$Rn = $wb", IIC_iLoad_mu>,
+ PseudoInstExpansion<(tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)> {
+ let Size = 2;
+ let OutOperandList = (outs GPR:$wb);
+ let InOperandList = (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops);
+ let Pattern = [];
+ let isCodeGenOnly = 1;
+ let isPseudo = 1;
+ list<Predicate> Predicates = [IsThumb];
+}
+
+// There is no non-writeback version of STM for Thumb.
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
-defm tSTM : thumb_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu,
- {1,1,0,0,0,?}, 0>;
+def tSTMIA_UPD : Thumb1I<(outs GPR:$wb),
+ (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ AddrModeNone, 2, IIC_iStore_mu,
+ "stm${p}\t$Rn!, $regs", "$Rn = $wb", []>,
+ T1Encoding<{1,1,0,0,0,?}> {
+ bits<3> Rn;
+ bits<8> regs;
+ let Inst{10-8} = Rn;
+ let Inst{7-0} = regs;
+}
} // neverHasSideEffects
+def : InstAlias<"ldm${p} $Rn!, $regs",
+ (tLDMIA tGPR:$Rn, pred:$p, reglist:$regs)>,
+ Requires<[IsThumb, IsThumb1Only]>;
+
let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
def tPOP : T1I<(outs), (ins pred:$p, reglist:$regs, variable_ops),
IIC_iPop,
@@ -876,7 +862,7 @@ def tADC : // A8.6.2
// Add immediate
def tADDi3 : // A8.6.4 T1
- T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3),
+ T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
IIC_iALUi,
"add", "\t$Rd, $Rm, $imm3",
[(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> {
@@ -885,8 +871,8 @@ def tADDi3 : // A8.6.4 T1
}
def tADDi8 : // A8.6.4 T2
- T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8),
- IIC_iALUi,
+ T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
+ (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
"add", "\t$Rdn, $imm8",
[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>;
@@ -920,10 +906,10 @@ def tAND : // A8.6.12
// ASR immediate
def tASRri : // A8.6.14
- T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+ T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
IIC_iMOVsi,
"asr", "\t$Rd, $Rm, $imm5",
- [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm:$imm5)))]> {
+ [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
@@ -962,7 +948,7 @@ def tCMNz : // A8.6.33
// CMP immediate
let isCompare = 1, Defs = [CPSR] in {
-def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, i32imm:$imm8), IIC_iCMPi,
+def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
"cmp", "\t$Rn, $imm8",
[(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
T1General<{1,0,1,?,?}> {
@@ -1003,7 +989,7 @@ def tEOR : // A8.6.45
// LSL immediate
def tLSLri : // A8.6.88
- T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+ T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
IIC_iMOVsi,
"lsl", "\t$Rd, $Rm, $imm5",
[(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> {
@@ -1020,10 +1006,10 @@ def tLSLrr : // A8.6.89
// LSR immediate
def tLSRri : // A8.6.90
- T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm5),
+ T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
IIC_iMOVsi,
"lsr", "\t$Rd, $Rm, $imm5",
- [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm:$imm5)))]> {
+ [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
bits<5> imm5;
let Inst{10-6} = imm5;
}
@@ -1047,6 +1033,10 @@ def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
let Inst{10-8} = Rd;
let Inst{7-0} = imm8;
}
+// Because we have an explicit tMOVSr below, we need an alias to handle
+// the immediate "movs" form here. Blech.
+def : tInstAlias <"movs $Rdn, $imm",
+ (tMOVi8 tGPR:$Rdn, CPSR, imm0_255:$imm, 14, 0)>;
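+// E.g. "movs r0, #4" matches the alias above and becomes a tMOVi8 with CPSR
+// as its cc_out operand.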
// A7-73: MOV(2) - mov setting flag.
@@ -1077,10 +1067,19 @@ def tMOVSr : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
// Multiply register
let isCommutable = 1 in
def tMUL : // A8.6.105 T1
- T1sItDPEncode<0b1101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
- IIC_iMUL32,
- "mul", "\t$Rdn, $Rm, $Rdn",
- [(set tGPR:$Rdn, (mul tGPR:$Rn, tGPR:$Rm))]>;
+ Thumb1sI<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm), AddrModeNone, 2,
+ IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm", "$Rm = $Rd",
+ [(set tGPR:$Rd, (mul tGPR:$Rn, tGPR:$Rm))]>,
+ T1DataProcessing<0b1101> {
+ bits<3> Rd;
+ bits<3> Rn;
+ let Inst{5-3} = Rn;
+ let Inst{2-0} = Rd;
+ let AsmMatchConverter = "cvtThumbMultiply";
+}
+
+def : tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
+ pred:$p)>;
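+// e.g. the two-operand "muls r0, r1" is equivalent to "muls r0, r1, r0".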
// Move inverse register
def tMVN : // A8.6.107
@@ -1132,6 +1131,9 @@ def tRSB : // A8.6.141
"rsb", "\t$Rd, $Rn, #0",
[(set tGPR:$Rd, (ineg tGPR:$Rn))]>;
+def : tInstAlias<"neg${s}${p} $Rd, $Rm",
+ (tRSB tGPR:$Rd, s_cc_out:$s, tGPR:$Rm, pred:$p)>;
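+// e.g. "negs r2, r3" assembles as "rsbs r2, r3, #0".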
+
// Subtract with carry register
let Uses = [CPSR] in
def tSBC : // A8.6.151
@@ -1142,7 +1144,7 @@ def tSBC : // A8.6.151
// Subtract immediate
def tSUBi3 : // A8.6.210 T1
- T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, i32imm:$imm3),
+ T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
IIC_iALUi,
"sub", "\t$Rd, $Rm, $imm3",
[(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> {
@@ -1151,8 +1153,8 @@ def tSUBi3 : // A8.6.210 T1
}
def tSUBi8 : // A8.6.210 T2
- T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn), (ins tGPR:$Rn, i32imm:$imm8),
- IIC_iALUi,
+ T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
+ (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
"sub", "\t$Rdn, $imm8",
[(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>;
@@ -1163,8 +1165,6 @@ def tSUBrr : // A8.6.212
"sub", "\t$Rd, $Rn, $Rm",
[(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>;
-// TODO: A7-96: STMIA - store multiple.
-
// Sign-extend byte
def tSXTB : // A8.6.222
T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
@@ -1216,12 +1216,13 @@ let usesCustomInserter = 1 in // Expanded after instruction selection.
// assembler.
def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
- IIC_iALUi, "adr{$p}\t$Rd, #$addr", []>,
+ IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
T1Encoding<{1,0,1,0,0,?}> {
bits<3> Rd;
bits<8> addr;
let Inst{10-8} = Rd;
let Inst{7-0} = addr;
+ let DecoderMethod = "DecodeThumbAddSpecialReg";
}
let neverHasSideEffects = 1, isReMaterializable = 1 in
@@ -1361,6 +1362,31 @@ def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr),
def : T1Pat<(sextloadi16 t_addrmode_is2:$addr),
(tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>;
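+
+// Aligned Thumb1 loads and stores are single-copy atomic, so the atomic
+// nodes below map directly onto the ordinary instructions; any ordering
+// requirement is handled by separate barrier instructions.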
+def : T1Pat<(atomic_load_8 t_addrmode_is1:$src),
+ (tLDRBi t_addrmode_is1:$src)>;
+def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src),
+ (tLDRBr t_addrmode_rrs1:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_is2:$src),
+ (tLDRHi t_addrmode_is2:$src)>;
+def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src),
+ (tLDRHr t_addrmode_rrs2:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_is4:$src),
+ (tLDRi t_addrmode_is4:$src)>;
+def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src),
+ (tLDRr t_addrmode_rrs4:$src)>;
+def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val),
+ (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>;
+def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val),
+ (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val),
+ (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>;
+def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val),
+ (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val),
+ (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>;
+def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val),
+ (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>;
+
// Large immediate handling.
// Two piece imms.
@@ -1395,3 +1421,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
2, IIC_Br, [(brind GPR:$Rm)],
(tMOVr PC, GPR:$Rm, pred:$p)>;
}
+
+
+// In Thumb1, "nop" is encoded as a "mov r8, r8". Technically, the bf00
+// encoding is available on ARMv6K, but we don't differentiate that finely.
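+// ("mov r8, r8" is the canonical Thumb1 NOP, encoding 0x46c0.)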
+def : InstAlias<"nop", (tMOVr R8, R8, 14, 0)>,Requires<[IsThumb, IsThumb1Only]>;
+
+
+// For round-trip assembly/disassembly, we have to handle a CPS instruction
+// without any iflags. That's not, strictly speaking, valid syntax, but it's
+// a useful extension and assembles to defined behaviour (the insn does
+// nothing).
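+// For example, a bare "cpsie" (no iflags) is accepted and encodes an
+// all-zero iflags field.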
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
+def : tInstAlias<"cps$imod", (tCPS imod_op:$imod, 0)>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index c2c6cbc..471ec29 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -12,13 +12,32 @@
//===----------------------------------------------------------------------===//
// IT block predicate field
+def it_pred_asmoperand : AsmOperandClass {
+ let Name = "ITCondCode";
+ let ParserMethod = "parseITCondCode";
+}
def it_pred : Operand<i32> {
let PrintMethod = "printMandatoryPredicateOperand";
+ let ParserMatchClass = it_pred_asmoperand;
}
// IT block condition mask
+def it_mask_asmoperand : AsmOperandClass { let Name = "ITMask"; }
def it_mask : Operand<i32> {
let PrintMethod = "printThumbITMask";
+ let ParserMatchClass = it_mask_asmoperand;
+}
+
+// t2_shift_imm: An integer that encodes a shift amount and the type of shift
+// (asr or lsl). The 6-bit immediate encodes as:
+//    {5}     0 ==> lsl
+//            1     asr
+//    {4-0}   imm5  shift amount.
+// asr #32 not allowed
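+// For example, "lsl #4" encodes as 0b000100 and "asr #4" as 0b100100.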
+def t2_shift_imm : Operand<i32> {
+ let PrintMethod = "printShiftImmOperand";
+ let ParserMatchClass = ShifterImmAsmOperand;
+ let DecoderMethod = "DecodeT2ShifterImmOperand";
}
// Shifted operands. No register controlled shifts for Thumb2.
@@ -28,6 +47,8 @@ def t2_so_reg : Operand<i32>, // reg imm
[shl,srl,sra,rotr]> {
let EncoderMethod = "getT2SORegOpValue";
let PrintMethod = "printT2SOOperand";
+ let DecoderMethod = "DecodeSORegImmOperand";
+ let ParserMatchClass = ShiftedImmAsmOperand;
let MIOperandInfo = (ops rGPR, i32imm);
}
@@ -50,6 +71,7 @@ def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
}]> {
let ParserMatchClass = t2_so_imm_asmoperand;
let EncoderMethod = "getT2SOImmOpValue";
+ let DecoderMethod = "DecodeT2SOImm";
}
// t2_so_imm_not - Match an immediate that is a complement
@@ -65,11 +87,6 @@ def t2_so_imm_neg : Operand<i32>,
return ARM_AM::getT2SOImmVal(-((uint32_t)N->getZExtValue())) != -1;
}], t2_so_imm_neg_XFORM>;
-/// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
-def imm1_31 : ImmLeaf<i32, [{
- return (int32_t)Imm >= 1 && (int32_t)Imm < 32;
-}]>;
-
/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
def imm0_4095 : Operand<i32>,
ImmLeaf<i32, [{
@@ -96,17 +113,20 @@ def lo5AllOne : PatLeaf<(i32 imm), [{
// Define Thumb2 specific addressing modes.
// t2addrmode_imm12 := reg + imm12
+def t2addrmode_imm12_asmoperand : AsmOperandClass {let Name="MemUImm12Offset";}
def t2addrmode_imm12 : Operand<i32>,
ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
let PrintMethod = "printAddrModeImm12Operand";
let EncoderMethod = "getAddrModeImm12OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm12";
+ let ParserMatchClass = t2addrmode_imm12_asmoperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
- let ParserMatchClass = MemMode5AsmOperand;
}
// t2ldrlabel := imm12
def t2ldrlabel : Operand<i32> {
let EncoderMethod = "getAddrModeImm12OpValue";
+ let PrintMethod = "printT2LdrLabelOperand";
}
@@ -116,13 +136,36 @@ def t2adrlabel : Operand<i32> {
}
+// t2addrmode_posimm8 := reg + imm8
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def t2addrmode_posimm8 : Operand<i32> {
+ let PrintMethod = "printT2AddrModeImm8Operand";
+ let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8";
+ let ParserMatchClass = MemPosImm8OffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_negimm8 := reg - imm8
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def t2addrmode_negimm8 : Operand<i32>,
+ ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+ let PrintMethod = "printT2AddrModeImm8Operand";
+ let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8";
+ let ParserMatchClass = MemNegImm8OffsetAsmOperand;
+ let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
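+// The positive/negative split lets the assembler route "[Rn, #-imm]" to the
+// negimm8 encodings, while positive-only users (e.g. the unprivileged
+// LDRT/STRT family) take t2addrmode_posimm8.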
+
// t2addrmode_imm8 := reg +/- imm8
+def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
def t2addrmode_imm8 : Operand<i32>,
ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
let PrintMethod = "printT2AddrModeImm8Operand";
let EncoderMethod = "getT2AddrModeImm8OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8";
+ let ParserMatchClass = MemImm8OffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
- let ParserMatchClass = MemMode5AsmOperand;
}
def t2am_imm8_offset : Operand<i32>,
@@ -130,38 +173,61 @@ def t2am_imm8_offset : Operand<i32>,
[], [SDNPWantRoot]> {
let PrintMethod = "printT2AddrModeImm8OffsetOperand";
let EncoderMethod = "getT2AddrModeImm8OffsetOpValue";
- let ParserMatchClass = MemMode5AsmOperand;
+ let DecoderMethod = "DecodeT2Imm8";
}
// t2addrmode_imm8s4 := reg +/- (imm8 << 2)
+def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
def t2addrmode_imm8s4 : Operand<i32> {
let PrintMethod = "printT2AddrModeImm8s4Operand";
let EncoderMethod = "getT2AddrModeImm8s4OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm8s4";
+ let ParserMatchClass = MemImm8s4OffsetAsmOperand;
let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
- let ParserMatchClass = MemMode5AsmOperand;
}
+def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
def t2am_imm8s4_offset : Operand<i32> {
let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
+ let EncoderMethod = "getT2Imm8s4OpValue";
+ let DecoderMethod = "DecodeT2Imm8S4";
+}
+
+// t2addrmode_imm0_1020s4 := reg + (imm8 << 2)
+def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
+ let Name = "MemImm0_1020s4Offset";
+}
+def t2addrmode_imm0_1020s4 : Operand<i32> {
+ let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
+ let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
+ let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
+ let ParserMatchClass = MemImm0_1020s4OffsetAsmOperand;
+ let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
}
// t2addrmode_so_reg := reg + (reg << imm2)
+def t2addrmode_so_reg_asmoperand : AsmOperandClass {let Name="T2MemRegOffset";}
def t2addrmode_so_reg : Operand<i32>,
ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
let PrintMethod = "printT2AddrModeSoRegOperand";
let EncoderMethod = "getT2AddrModeSORegOpValue";
+ let DecoderMethod = "DecodeT2AddrModeSOReg";
+ let ParserMatchClass = t2addrmode_so_reg_asmoperand;
let MIOperandInfo = (ops GPR:$base, rGPR:$offsreg, i32imm:$offsimm);
- let ParserMatchClass = MemMode5AsmOperand;
}
-// t2addrmode_reg := reg
-// Used by load/store exclusive instructions. Useful to enable right assembly
-// parsing and printing. Not used for any codegen matching.
-//
-def t2addrmode_reg : Operand<i32> {
- let PrintMethod = "printAddrMode7Operand";
- let MIOperandInfo = (ops GPR);
- let ParserMatchClass = MemMode7AsmOperand;
+// Addresses for the TBB/TBH instructions.
+def addrmode_tbb_asmoperand : AsmOperandClass { let Name = "MemTBB"; }
+def addrmode_tbb : Operand<i32> {
+ let PrintMethod = "printAddrModeTBB";
+ let ParserMatchClass = addrmode_tbb_asmoperand;
+ let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
+}
+def addrmode_tbh_asmoperand : AsmOperandClass { let Name = "MemTBH"; }
+def addrmode_tbh : Operand<i32> {
+ let PrintMethod = "printAddrModeTBH";
+ let ParserMatchClass = addrmode_tbh_asmoperand;
+ let MIOperandInfo = (ops GPR:$Rn, rGPR:$Rm);
}
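+// These give the usual UAL syntax, e.g. "tbb [r0, r1]" and
+// "tbh [r0, r1, lsl #1]".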
//===----------------------------------------------------------------------===//
@@ -419,47 +485,6 @@ class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
}
-/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
-/// unary operation that produces a value. These are predicable and can be
-/// changed to modify CPSR.
-multiclass T2I_un_irs<bits<4> opcod, string opc,
- InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode, bit Cheap = 0, bit ReMat = 0> {
- // shifted imm
- def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
- opc, "\t$Rd, $imm",
- [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
- let isAsCheapAsAMove = Cheap;
- let isReMaterializable = ReMat;
- let Inst{31-27} = 0b11110;
- let Inst{25} = 0;
- let Inst{24-21} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15} = 0;
- }
- // register
- def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
- opc, ".w\t$Rd, $Rm",
- [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
- let Inst{31-27} = 0b11101;
- let Inst{26-25} = 0b01;
- let Inst{24-21} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{14-12} = 0b000; // imm3
- let Inst{7-6} = 0b00; // imm2
- let Inst{5-4} = 0b00; // type
- }
- // shifted register
- def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
- opc, ".w\t$Rd, $ShiftedRm",
- [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
- let Inst{31-27} = 0b11101;
- let Inst{26-25} = 0b01;
- let Inst{24-21} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- }
-}
-
/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
/// binary operation that produces a value. These are predicable and can be
/// changed to modify CPSR.
@@ -500,21 +525,18 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
}
// Assembly aliases for optional destination operand when it's the same
// as the source operand.
- def : InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
+ def : t2InstAlias<!strconcat(opc, "${s}${p} $Rdn, $imm"),
(!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
t2_so_imm:$imm, pred:$p,
- cc_out:$s)>,
- Requires<[IsThumb2]>;
- def : InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $Rm"),
(!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
rGPR:$Rm, pred:$p,
- cc_out:$s)>,
- Requires<[IsThumb2]>;
- def : InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", wide, " $Rdn, $shift"),
(!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn,
t2_so_reg:$shift, pred:$p,
- cc_out:$s)>,
- Requires<[IsThumb2]>;
+ cc_out:$s)>;
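+ // e.g. "ands r1, #3" is thus shorthand for "ands r1, r1, #3".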
}
/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
@@ -522,7 +544,27 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
multiclass T2I_bin_w_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
PatFrag opnode, string baseOpc, bit Commutable = 0> :
- T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, baseOpc, Commutable, ".w">;
+ T2I_bin_irs<opcod, opc, iii, iir, iis, opnode, baseOpc, Commutable, ".w"> {
+ // Assembler aliases w/o the ".w" suffix.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn,
+ rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rd, rGPR:$Rn,
+ t2_so_reg:$shift, pred:$p,
+ cc_out:$s)>;
+
+ // and with the optional destination operand, too.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
+ rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rs")) rGPR:$Rdn, rGPR:$Rdn,
+ t2_so_reg:$shift, pred:$p,
+ cc_out:$s)>;
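+
+ // e.g. plain "and r0, r1, r2" can now match the wide encoding without
+ // an explicit ".w".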
+}
/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands is
/// reversed. The 'rr' form is only defined for the disassembler; for codegen
@@ -563,45 +605,28 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
/// instruction modifies the CPSR register.
-let isCodeGenOnly = 1, Defs = [CPSR] in {
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
multiclass T2I_bin_s_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
PatFrag opnode, bit Commutable = 0> {
// shifted imm
- def ri : T2TwoRegImm<
+ def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), iii,
- !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm",
- [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> {
- let Inst{31-27} = 0b11110;
- let Inst{25} = 0;
- let Inst{24-21} = opcod;
- let Inst{20} = 1; // The S bit.
- let Inst{15} = 0;
- }
+ opc, ".w\t$Rd, $Rn, $imm",
+ [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, t2_so_imm:$imm))]>;
// register
- def rr : T2ThreeReg<
+ def rr : T2sThreeReg<
(outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), iir,
- !strconcat(opc, "s"), ".w\t$Rd, $Rn, $Rm",
- [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> {
- let isCommutable = Commutable;
- let Inst{31-27} = 0b11101;
- let Inst{26-25} = 0b01;
- let Inst{24-21} = opcod;
- let Inst{20} = 1; // The S bit.
- let Inst{14-12} = 0b000; // imm3
- let Inst{7-6} = 0b00; // imm2
- let Inst{5-4} = 0b00; // type
- }
+ opc, ".w\t$Rd, $Rn, $Rm",
+ [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, rGPR:$Rm))]>;
// shifted register
- def rs : T2TwoRegShiftedReg<
+ def rs : T2sTwoRegShiftedReg<
(outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis,
- !strconcat(opc, "s"), ".w\t$Rd, $Rn, $ShiftedRm",
- [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> {
- let Inst{31-27} = 0b11101;
- let Inst{26-25} = 0b01;
- let Inst{24-21} = opcod;
- let Inst{20} = 1; // The S bit.
- }
+ opc, ".w\t$Rd, $Rn, $ShiftedRm",
+ [(set rGPR:$Rd, CPSR, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]>;
}
}
@@ -614,9 +639,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
// in particular for taking the address of a local.
let isReMaterializable = 1 in {
def ri : T2sTwoRegImm<
- (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
- opc, ".w\t$Rd, $Rn, $imm",
- [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_imm:$imm))]> {
+ (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
+ opc, ".w\t$Rd, $Rn, $imm",
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24} = 1;
@@ -626,9 +651,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
}
// 12-bit imm
def ri12 : T2I<
- (outs rGPR:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
+ (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
!strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
- [(set rGPR:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
+ [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
bits<4> Rd;
bits<4> Rn;
bits<12> imm;
@@ -644,9 +669,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
let Inst{7-0} = imm{7-0};
}
// register
- def rr : T2sThreeReg<(outs rGPR:$Rd), (ins GPR:$Rn, rGPR:$Rm), IIC_iALUr,
- opc, ".w\t$Rd, $Rn, $Rm",
- [(set rGPR:$Rd, (opnode GPR:$Rn, rGPR:$Rm))]> {
+ def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
+ IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]> {
let isCommutable = Commutable;
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -658,9 +683,9 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
}
// shifted register
def rs : T2sTwoRegShiftedReg<
- (outs rGPR:$Rd), (ins GPR:$Rn, t2_so_reg:$ShiftedRm),
+ (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
- [(set rGPR:$Rd, (opnode GPR:$Rn, t2_so_reg:$ShiftedRm))]> {
+ [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24} = 1;
@@ -671,13 +696,13 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
/// for a binary operation that produces a value and use the carry
/// bit. It's not predicable.
-let Uses = [CPSR] in {
+let Defs = [CPSR], Uses = [CPSR] in {
multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
bit Commutable = 0> {
// shifted imm
def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
+ [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
Requires<[IsThumb2]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
@@ -687,7 +712,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
// register
def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
opc, ".w\t$Rd, $Rn, $Rm",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+ [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
Requires<[IsThumb2]> {
let isCommutable = Commutable;
let Inst{31-27} = 0b11101;
@@ -701,7 +726,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
def rs : T2sTwoRegShiftedReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
+ [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
Requires<[IsThumb2]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -710,64 +735,35 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
}
}
-// Carry setting variants
-// NOTE: CPSR def omitted because it will be handled by the custom inserter.
-let usesCustomInserter = 1 in {
-multiclass T2I_adde_sube_s_irs<PatFrag opnode, bit Commutable = 0> {
- // shifted imm
- def ri : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
- 4, IIC_iALUi,
- [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>;
- // register
- def rr : t2PseudoInst<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
- 4, IIC_iALUr,
- [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
- let isCommutable = Commutable;
- }
- // shifted register
- def rs : t2PseudoInst<
- (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
- 4, IIC_iALUsi,
- [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>;
-}
-}
-
/// T2I_rbin_s_is - Same as T2I_rbin_irs except sets 's' bit and the register
/// version is not needed since this is only for codegen.
-let isCodeGenOnly = 1, Defs = [CPSR] in {
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, isCodeGenOnly = 1, isPseudo = 1, Defs = [CPSR] in {
multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
// shifted imm
- def ri : T2TwoRegImm<
+ def ri : T2sTwoRegImm<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
- !strconcat(opc, "s"), ".w\t$Rd, $Rn, $imm",
- [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> {
- let Inst{31-27} = 0b11110;
- let Inst{25} = 0;
- let Inst{24-21} = opcod;
- let Inst{20} = 1; // The S bit.
- let Inst{15} = 0;
- }
+ opc, ".w\t$Rd, $Rn, $imm",
+ [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm, rGPR:$Rn))]>;
// shifted register
- def rs : T2TwoRegShiftedReg<
+ def rs : T2sTwoRegShiftedReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
- IIC_iALUsi, !strconcat(opc, "s"), "\t$Rd, $Rn, $ShiftedRm",
- [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> {
- let Inst{31-27} = 0b11101;
- let Inst{26-25} = 0b01;
- let Inst{24-21} = opcod;
- let Inst{20} = 1; // The S bit.
- }
+ IIC_iALUsi, opc, "\t$Rd, $Rn, $ShiftedRm",
+ [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>;
}
}
/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
// rotate operation that produces a value.
-multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
+multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode,
+ string baseOpc> {
// 5-bit imm
def ri : T2sTwoRegShiftImm<
- (outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$imm), IIC_iMOVsi,
+ (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
opc, ".w\t$Rd, $Rm, $imm",
- [(set rGPR:$Rd, (opnode rGPR:$Rm, imm1_31:$imm))]> {
+ [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]> {
let Inst{31-27} = 0b11101;
let Inst{26-21} = 0b010010;
let Inst{19-16} = 0b1111; // Rn
@@ -784,20 +780,50 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
let Inst{15-12} = 0b1111;
let Inst{7-4} = 0b0000;
}
+
+ // Optional destination register
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $imm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
+ ty:$imm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", ".w $Rdn, $Rm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
+ rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+
+ // Assembler aliases w/o the ".w" suffix.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $imm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rd, rGPR:$Rn,
+ ty:$imm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rd, $Rn, $Rm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rd, rGPR:$Rn,
+ rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
+
+ // and with the optional destination operand, too.
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $imm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "ri")) rGPR:$Rdn, rGPR:$Rdn,
+ ty:$imm, pred:$p,
+ cc_out:$s)>;
+ def : t2InstAlias<!strconcat(opc, "${s}${p}", " $Rdn, $Rm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rr")) rGPR:$Rdn, rGPR:$Rdn,
+ rGPR:$Rm, pred:$p,
+ cc_out:$s)>;
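+
+ // e.g. "lsls r0, #2" is accepted as shorthand for "lsls.w r0, r0, #2".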
}
/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
/// patterns. Similar to T2I_bin_irs except the instruction does not produce
/// an explicit result, only implicitly sets CPSR.
-let isCompare = 1, Defs = [CPSR] in {
multiclass T2I_cmp_irs<bits<4> opcod, string opc,
InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
- PatFrag opnode> {
+ PatFrag opnode, string baseOpc> {
+let isCompare = 1, Defs = [CPSR] in {
// shifted imm
def ri : T2OneRegCmpImm<
- (outs), (ins GPR:$Rn, t2_so_imm:$imm), iii,
+ (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
opc, ".w\t$Rn, $imm",
- [(opnode GPR:$Rn, t2_so_imm:$imm)]> {
+ [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]> {
let Inst{31-27} = 0b11110;
let Inst{25} = 0;
let Inst{24-21} = opcod;
@@ -807,9 +833,9 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc,
}
// register
def rr : T2TwoRegCmp<
- (outs), (ins GPR:$lhs, rGPR:$rhs), iir,
- opc, ".w\t$lhs, $rhs",
- [(opnode GPR:$lhs, rGPR:$rhs)]> {
+ (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
+ opc, ".w\t$Rn, $Rm",
+ [(opnode GPRnopc:$Rn, rGPR:$Rm)]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -821,9 +847,9 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc,
}
// shifted register
def rs : T2OneRegCmpShiftedReg<
- (outs), (ins GPR:$Rn, t2_so_reg:$ShiftedRm), iis,
+ (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
opc, ".w\t$Rn, $ShiftedRm",
- [(opnode GPR:$Rn, t2_so_reg:$ShiftedRm)]> {
+ [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
let Inst{24-21} = opcod;
@@ -831,55 +857,60 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc,
let Inst{11-8} = 0b1111; // Rd
}
}
+
+ // Assembler aliases w/o the ".w" suffix.
+ // No alias here for 'rr' version as not all instantiations of this
+ // multiclass want one (CMP, in particular, does not).
+ def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $imm"),
+ (!cast<Instruction>(!strconcat(baseOpc, "ri")) GPRnopc:$Rn,
+ t2_so_imm:$imm, pred:$p)>;
+ def : t2InstAlias<!strconcat(opc, "${p}", " $Rn, $shift"),
+ (!cast<Instruction>(!strconcat(baseOpc, "rs")) GPRnopc:$Rn,
+ t2_so_reg:$shift,
+ pred:$p)>;
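+ // e.g. "cmp r0, r1, lsl #2" is accepted without an explicit ".w".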
}
/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
- InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
- def i12 : T2Ii12<(outs GPR:$Rt), (ins t2addrmode_imm12:$addr), iii,
+ InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+ PatFrag opnode> {
+ def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(set GPR:$Rt, (opnode t2addrmode_imm12:$addr))]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-25} = 0b00;
+ [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+ bits<4> Rt;
+ bits<17> addr;
+ let Inst{31-25} = 0b1111100;
let Inst{24} = signed;
let Inst{23} = 1;
let Inst{22-21} = opcod;
let Inst{20} = 1; // load
-
- bits<4> Rt;
- let Inst{15-12} = Rt;
-
- bits<17> addr;
- let addr{12} = 1; // add = TRUE
let Inst{19-16} = addr{16-13}; // Rn
- let Inst{23} = addr{12}; // U
+ let Inst{15-12} = Rt;
let Inst{11-0} = addr{11-0}; // imm
}
- def i8 : T2Ii8 <(outs GPR:$Rt), (ins t2addrmode_imm8:$addr), iii,
+ def i8 : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
opc, "\t$Rt, $addr",
- [(set GPR:$Rt, (opnode t2addrmode_imm8:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+ bits<4> Rt;
+ bits<13> addr;
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = signed;
let Inst{23} = 0;
let Inst{22-21} = opcod;
let Inst{20} = 1; // load
+ let Inst{19-16} = addr{12-9}; // Rn
+ let Inst{15-12} = Rt;
let Inst{11} = 1;
// Offset: index==TRUE, wback==FALSE
let Inst{10} = 1; // The P bit.
- let Inst{8} = 0; // The W bit.
-
- bits<4> Rt;
- let Inst{15-12} = Rt;
-
- bits<13> addr;
- let Inst{19-16} = addr{12-9}; // Rn
let Inst{9} = addr{8}; // U
+ let Inst{8} = 0; // The W bit.
let Inst{7-0} = addr{7-0}; // imm
}
- def s : T2Iso <(outs GPR:$Rt), (ins t2addrmode_so_reg:$addr), iis,
+ def s : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
opc, ".w\t$Rt, $addr",
- [(set GPR:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+ [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = signed;
@@ -895,12 +926,14 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
let Inst{19-16} = addr{9-6}; // Rn
let Inst{3-0} = addr{5-2}; // Rm
let Inst{5-4} = addr{1-0}; // imm
+
+ let DecoderMethod = "DecodeT2LoadShift";
}
// FIXME: Is the pci variant actually needed?
- def pci : T2Ipc <(outs GPR:$Rt), (ins t2ldrlabel:$addr), iii,
+ def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(set GPR:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+ [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
let isReMaterializable = 1;
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
@@ -918,10 +951,11 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
multiclass T2I_st<bits<2> opcod, string opc,
- InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
- def i12 : T2Ii12<(outs), (ins GPR:$Rt, t2addrmode_imm12:$addr), iii,
+ InstrItinClass iii, InstrItinClass iis, RegisterClass target,
+ PatFrag opnode> {
+ def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
opc, ".w\t$Rt, $addr",
- [(opnode GPR:$Rt, t2addrmode_imm12:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0001;
let Inst{22-21} = opcod;
@@ -936,9 +970,9 @@ multiclass T2I_st<bits<2> opcod, string opc,
let Inst{23} = addr{12}; // U
let Inst{11-0} = addr{11-0}; // imm
}
- def i8 : T2Ii8 <(outs), (ins GPR:$Rt, t2addrmode_imm8:$addr), iii,
+ def i8 : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
opc, "\t$Rt, $addr",
- [(opnode GPR:$Rt, t2addrmode_imm8:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0000;
let Inst{22-21} = opcod;
@@ -956,9 +990,9 @@ multiclass T2I_st<bits<2> opcod, string opc,
let Inst{9} = addr{8}; // U
let Inst{7-0} = addr{7-0}; // imm
}
- def s : T2Iso <(outs), (ins GPR:$Rt, t2addrmode_so_reg:$addr), iis,
+ def s : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
opc, ".w\t$Rt, $addr",
- [(opnode GPR:$Rt, t2addrmode_so_reg:$addr)]> {
+ [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0000;
let Inst{22-21} = opcod;
@@ -977,146 +1011,81 @@ multiclass T2I_st<bits<2> opcod, string opc,
/// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
-multiclass T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode> {
- def r : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
- opc, ".w\t$Rd, $Rm",
- [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = 0b00; // rotate
- }
- def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
- opc, ".w\t$Rd, $Rm, ror $rot",
- [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
-
- bits<2> rot;
- let Inst{5-4} = rot{1-0}; // rotate
- }
+class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
+ : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+ opc, ".w\t$Rd, $Rm$rot",
+ [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+ Requires<[IsThumb2]> {
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+
+ bits<2> rot;
+ let Inst{5-4} = rot{1-0}; // rotate
}
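+// e.g. "sxtb r0, r1, ror #16" encodes rot{1-0} = 0b10 into Inst{5-4}.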
// UXTB16 - Requires T2ExtractPack, does not need the .w qualifier.
-multiclass T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode> {
- def r : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
- opc, "\t$Rd, $Rm",
- [(set rGPR:$Rd, (opnode rGPR:$Rm))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = 0b00; // rotate
- }
- def r_rot : T2TwoReg<(outs rGPR:$dst), (ins rGPR:$Rm, rot_imm:$rot),
- IIC_iEXTr, opc, "\t$dst, $Rm, ror $rot",
- [(set rGPR:$dst, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
-
- bits<2> rot;
- let Inst{5-4} = rot{1-0}; // rotate
- }
+class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
+ : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
+ IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
+ [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
+ Requires<[HasT2ExtractPack, IsThumb2]> {
+ bits<2> rot;
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
}
// SXTB16 - Requires T2ExtractPack, does not need the .w qualifier, no pattern
// supported yet.
-multiclass T2I_ext_rrot_sxtb16<bits<3> opcod, string opc> {
- def r : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iEXTr,
- opc, "\t$Rd, $Rm", []>,
+class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
+ : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
+ opc, "\t$Rd, $Rm$rot", []>,
Requires<[IsThumb2, HasT2ExtractPack]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = 0b00; // rotate
- }
- def r_rot : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, i32imm:$rot), IIC_iEXTr,
- opc, "\t$Rd, $Rm, ror $rot", []>,
- Requires<[IsThumb2, HasT2ExtractPack]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{19-16} = 0b1111; // Rn
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
-
- bits<2> rot;
- let Inst{5-4} = rot{1-0}; // rotate
- }
+ bits<2> rot;
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
}
/// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
-multiclass T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode> {
- def rr : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr,
- opc, "\t$Rd, $Rn, $Rm",
- [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = 0b00; // rotate
- }
- def rr_rot : T2ThreeReg<(outs rGPR:$Rd),
- (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
- IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot",
- [(set rGPR:$Rd, (opnode rGPR:$Rn,
- (rotr rGPR:$Rm, rot_imm:$rot)))]>,
- Requires<[HasT2ExtractPack, IsThumb2]> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
-
- bits<2> rot;
- let Inst{5-4} = rot{1-0}; // rotate
- }
+class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+ : T2ThreeReg<(outs rGPR:$Rd),
+ (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
+ IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
+ [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
+ Requires<[HasT2ExtractPack, IsThumb2]> {
+ bits<2> rot;
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
}
-// DO variant - disassembly only, no pattern
-
-multiclass T2I_exta_rrot_DO<bits<3> opcod, string opc> {
- def rr : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iEXTAr,
- opc, "\t$Rd, $Rn, $Rm", []> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
- let Inst{5-4} = 0b00; // rotate
- }
- def rr_rot :T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, i32imm:$rot),
- IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm, ror $rot", []> {
- let Inst{31-27} = 0b11111;
- let Inst{26-23} = 0b0100;
- let Inst{22-20} = opcod;
- let Inst{15-12} = 0b1111;
- let Inst{7} = 1;
-
- bits<2> rot;
- let Inst{5-4} = rot{1-0}; // rotate
- }
+class T2I_exta_rrot_np<bits<3> opcod, string opc>
+ : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
+ IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []> {
+ bits<2> rot;
+ let Inst{31-27} = 0b11111;
+ let Inst{26-23} = 0b0100;
+ let Inst{22-20} = opcod;
+ let Inst{15-12} = 0b1111;
+ let Inst{7} = 1;
+ let Inst{5-4} = rot;
}
//===----------------------------------------------------------------------===//
@@ -1143,7 +1112,7 @@ class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
// assembler.
def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
(ins t2adrlabel:$addr, pred:$p),
- IIC_iALUi, "adr{$p}.w\t$Rd, #$addr", []> {
+ IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []> {
let Inst{31-27} = 0b11110;
let Inst{25-24} = 0b10;
// Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
@@ -1160,6 +1129,8 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
let Inst{26} = addr{11};
let Inst{14-12} = addr{10-8};
let Inst{7-0} = addr{7-0};
+
+ let DecoderMethod = "DecodeT2Adr";
}
let neverHasSideEffects = 1, isReMaterializable = 1 in
@@ -1177,33 +1148,33 @@ def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
// Load
let canFoldAsLoad = 1, isReMaterializable = 1 in
-defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si,
+defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
UnOpFrag<(load node:$Src)>>;
// Loads with zero extension
defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- UnOpFrag<(zextloadi16 node:$Src)>>;
+ rGPR, UnOpFrag<(zextloadi16 node:$Src)>>;
defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- UnOpFrag<(zextloadi8 node:$Src)>>;
+ rGPR, UnOpFrag<(zextloadi8 node:$Src)>>;
// Loads with sign extension
defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- UnOpFrag<(sextloadi16 node:$Src)>>;
+ rGPR, UnOpFrag<(sextloadi16 node:$Src)>>;
defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
- UnOpFrag<(sextloadi8 node:$Src)>>;
+ rGPR, UnOpFrag<(sextloadi8 node:$Src)>>;
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def t2LDRDi8 : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
(ins t2addrmode_imm8s4:$addr),
- IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", []>;
+ IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
} // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
// zextload i1 -> zextload i8
def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr),
(t2LDRBi12 t2addrmode_imm12:$addr)>;
-def : T2Pat<(zextloadi1 t2addrmode_imm8:$addr),
- (t2LDRBi8 t2addrmode_imm8:$addr)>;
+def : T2Pat<(zextloadi1 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr),
(t2LDRBs t2addrmode_so_reg:$addr)>;
def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
@@ -1214,8 +1185,8 @@ def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
// earlier?
def : T2Pat<(extloadi1 t2addrmode_imm12:$addr),
(t2LDRBi12 t2addrmode_imm12:$addr)>;
-def : T2Pat<(extloadi1 t2addrmode_imm8:$addr),
- (t2LDRBi8 t2addrmode_imm8:$addr)>;
+def : T2Pat<(extloadi1 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
def : T2Pat<(extloadi1 t2addrmode_so_reg:$addr),
(t2LDRBs t2addrmode_so_reg:$addr)>;
def : T2Pat<(extloadi1 (ARMWrapper tconstpool:$addr)),
@@ -1223,8 +1194,8 @@ def : T2Pat<(extloadi1 (ARMWrapper tconstpool:$addr)),
def : T2Pat<(extloadi8 t2addrmode_imm12:$addr),
(t2LDRBi12 t2addrmode_imm12:$addr)>;
-def : T2Pat<(extloadi8 t2addrmode_imm8:$addr),
- (t2LDRBi8 t2addrmode_imm8:$addr)>;
+def : T2Pat<(extloadi8 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
def : T2Pat<(extloadi8 t2addrmode_so_reg:$addr),
(t2LDRBs t2addrmode_so_reg:$addr)>;
def : T2Pat<(extloadi8 (ARMWrapper tconstpool:$addr)),
@@ -1232,8 +1203,8 @@ def : T2Pat<(extloadi8 (ARMWrapper tconstpool:$addr)),
def : T2Pat<(extloadi16 t2addrmode_imm12:$addr),
(t2LDRHi12 t2addrmode_imm12:$addr)>;
-def : T2Pat<(extloadi16 t2addrmode_imm8:$addr),
- (t2LDRHi8 t2addrmode_imm8:$addr)>;
+def : T2Pat<(extloadi16 t2addrmode_negimm8:$addr),
+ (t2LDRHi8 t2addrmode_negimm8:$addr)>;
def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr),
(t2LDRHs t2addrmode_so_reg:$addr)>;
def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
@@ -1247,83 +1218,86 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
// Indexed loads
let mayLoad = 1, neverHasSideEffects = 1 in {
-def t2LDR_PRE : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+def t2LDR_PRE : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
- "ldr", "\t$Rt, $addr!", "$addr.base = $Rn",
- []>;
+ "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+ []> {
+ let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
-def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn),
- (ins GPR:$base, t2am_imm8_offset:$addr),
- AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
- "ldr", "\t$Rt, [$Rn], $addr", "$base = $Rn",
- []>;
+def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
+ "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
-def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn",
- []>;
-def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn),
- (ins GPR:$base, t2am_imm8_offset:$addr),
- AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrb", "\t$Rt, [$Rn], $addr", "$base = $Rn",
- []>;
-
-def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+ "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+ []> {
+ let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn",
- []>;
-def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn),
- (ins GPR:$base, t2am_imm8_offset:$addr),
- AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrh", "\t$Rt, [$Rn], $addr", "$base = $Rn",
- []>;
-
-def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+ "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+ []> {
+ let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn",
- []>;
-def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn),
- (ins GPR:$base, t2am_imm8_offset:$addr),
- AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrsb", "\t$Rt, [$Rn], $addr", "$base = $Rn",
- []>;
-
-def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn),
+ "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+ []> {
+ let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+
+def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
(ins t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
- "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn",
- []>;
-def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$Rn),
- (ins GPR:$base, t2am_imm8_offset:$addr),
- AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
- "ldrsh", "\t$dst, [$Rn], $addr", "$base = $Rn",
- []>;
+ "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
+ []> {
+ let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
+}
+def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
+ (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
+ "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
} // mayLoad = 1, neverHasSideEffects = 1
-// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110) and are
-// for disassembly only.
+// LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
// Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
- : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
+ : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
"\t$Rt, $addr", []> {
+ bits<4> Rt;
+ bits<13> addr;
let Inst{31-27} = 0b11111;
let Inst{26-25} = 0b00;
let Inst{24} = signed;
let Inst{23} = 0;
let Inst{22-21} = type;
let Inst{20} = 1; // load
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = Rt;
let Inst{11} = 1;
let Inst{10-8} = 0b110; // PUW.
-
- bits<4> Rt;
- bits<13> addr;
- let Inst{15-12} = Rt;
- let Inst{19-16} = addr{12-9};
- let Inst{7-0} = addr{7-0};
+ let Inst{7-0} = addr{7-0};
}
def t2LDRT : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
@@ -1333,67 +1307,97 @@ def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
// Store
-defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si,
+defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
BinOpFrag<(store node:$LHS, node:$RHS)>>;
defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
- BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
+ rGPR, BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
- BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+ rGPR, BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
// Store doubleword
let mayLoad = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
(ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr),
- IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", []>;
+ IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
// Indexed stores
-def t2STR_PRE : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb),
- (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+def t2STR_PRE : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
- "str", "\t$Rt, [$Rn, $addr]!",
- "$Rn = $base_wb,@earlyclobber $base_wb",
- [(set GPR:$base_wb,
- (pre_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
-
-def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb),
- (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
- AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
- "str", "\t$Rt, [$Rn], $addr",
- "$Rn = $base_wb,@earlyclobber $base_wb",
- [(set GPR:$base_wb,
- (post_store GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
-
-def t2STRH_PRE : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb),
- (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+ "str", "\t$Rt, $addr!",
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+ let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
+def t2STRH_PRE : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
- "strh", "\t$Rt, [$Rn, $addr]!",
- "$Rn = $base_wb,@earlyclobber $base_wb",
- [(set GPR:$base_wb,
- (pre_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
-
-def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb),
- (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
- AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
- "strh", "\t$Rt, [$Rn], $addr",
- "$Rn = $base_wb,@earlyclobber $base_wb",
- [(set GPR:$base_wb,
- (post_truncsti16 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
+ "strh", "\t$Rt, $addr!",
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+ let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
-def t2STRB_PRE : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb),
- (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+def t2STRB_PRE : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, t2addrmode_imm8:$addr),
AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
- "strb", "\t$Rt, [$Rn, $addr]!",
- "$Rn = $base_wb,@earlyclobber $base_wb",
- [(set GPR:$base_wb,
- (pre_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
+ "strb", "\t$Rt, $addr!",
+ "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
+ let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
+}
-def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb),
- (ins GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr),
+def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iStore_iu,
+ "str", "\t$Rt, $Rn$offset",
+ "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+ [(set GPRnopc:$Rn_wb,
+ (post_store rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset))]>;
+
+def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset),
AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
- "strb", "\t$Rt, [$Rn], $addr",
- "$Rn = $base_wb,@earlyclobber $base_wb",
- [(set GPR:$base_wb,
- (post_truncsti8 GPR:$Rt, GPR:$Rn, t2am_imm8_offset:$addr))]>;
+ "strh", "\t$Rt, $Rn$offset",
+ "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+ [(set GPRnopc:$Rn_wb,
+ (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset))]>;
+
+def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset),
+ AddrModeT2_i8, IndexModePost, IIC_iStore_bh_iu,
+ "strb", "\t$Rt, $Rn$offset",
+ "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
+ [(set GPRnopc:$Rn_wb,
+ (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
+ t2am_imm8_offset:$offset))]>;
+
+// Pseudo-instructions for pattern matching the pre-indexed stores. We can't
+// put the patterns on the instruction definitions directly as ISel wants
+// the address base and offset to be separate operands, not a single
+// complex operand the way we represent the instructions themselves. The
+// pseudos map between the two.
+let usesCustomInserter = 1,
+ Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in {
+def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPRnopc:$Rn_wb,
+ (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPRnopc:$Rn_wb,
+ (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
+ (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
+ 4, IIC_iStore_ru,
+ [(set GPRnopc:$Rn_wb,
+ (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
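+ // e.g. a pre_store of r1 into [r0, #4] ("str r1, [r0, #4]!") selects
+ // t2STR_preidx; the custom inserter then rewrites it to t2STR_PRE.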
+}
+
// STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
// only.
@@ -1424,21 +1428,31 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
// ldrd / strd pre / post variants
// For disassembly only.
-def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2),
- (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
- "ldrd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
+def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+ (ins t2addrmode_imm8s4:$addr), IIC_iLoad_d_ru,
+ "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+ let AsmMatchConverter = "cvtT2LdrdPre";
+ let DecoderMethod = "DecodeT2LDRDPreInstruction";
+}
-def t2LDRD_POST : T2Ii8s4<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2),
- (ins GPR:$base, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru,
- "ldrd", "\t$Rt, $Rt2, [$base], $imm", []>;
+def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
+ (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
+ IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
+ "$addr.base = $wb", []>;
-def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs),
- (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
- IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base, $imm]!", []>;
+def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
+ (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+ IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
+ "$addr.base = $wb", []> {
+ let AsmMatchConverter = "cvtT2StrdPre";
+ let DecoderMethod = "DecodeT2STRDPreInstruction";
+}
-def t2STRD_POST : T2Ii8s4<0, 1, 0, (outs),
- (ins rGPR:$Rt, rGPR:$Rt2, GPR:$base, t2am_imm8s4_offset:$imm),
- IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, [$base], $imm", []>;
+def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
+ (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
+ t2am_imm8s4_offset:$imm),
+ IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
+ "$addr.base = $wb", []>;
// T2Ipl (Preload Data/Instruction) signals the memory system of possible future
// data/instruction access. These are for disassembly only.
@@ -1463,9 +1477,9 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
let Inst{11-0} = addr{11-0}; // imm12
}
- def i8 : T2Ii8<(outs), (ins t2addrmode_imm8:$addr), IIC_Preload, opc,
+ def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
"\t$addr",
- [(ARMPreload t2addrmode_imm8:$addr, (i32 write), (i32 instr))]> {
+ [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]> {
let Inst{31-25} = 0b1111100;
let Inst{24} = instr;
let Inst{23} = 0; // U = 0
@@ -1496,6 +1510,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
let Inst{19-16} = addr{9-6}; // Rn
let Inst{3-0} = addr{5-2}; // Rm
let Inst{5-4} = addr{1-0}; // imm2
+
+ let DecoderMethod = "DecodeT2LoadShift";
}
}
@@ -1507,11 +1523,11 @@ defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>;
// Load / store multiple Instructions.
//
-multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
+multiclass thumb2_ld_mult<string asm, InstrItinClass itin,
InstrItinClass itin_upd, bit L_bit> {
def IA :
T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
- itin, !strconcat(asm, "ia${p}.w\t$Rn, $regs"), []> {
+ itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
bits<4> Rn;
bits<16> regs;
@@ -1522,11 +1538,12 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
let Inst{21} = 0; // No writeback
let Inst{20} = L_bit;
let Inst{19-16} = Rn;
- let Inst{15-0} = regs;
+ let Inst{15} = 0;
+ let Inst{14-0} = regs{14-0};
}
def IA_UPD :
T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
- itin_upd, !strconcat(asm, "ia${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+ itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
bits<4> Rn;
bits<16> regs;
@@ -1537,11 +1554,12 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
let Inst{19-16} = Rn;
- let Inst{15-0} = regs;
+ let Inst{15} = 0;
+ let Inst{14-0} = regs{14-0};
}
def DB :
T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
- itin, !strconcat(asm, "db${p}.w\t$Rn, $regs"), []> {
+ itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
bits<4> Rn;
bits<16> regs;
@@ -1552,11 +1570,12 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
let Inst{21} = 0; // No writeback
let Inst{20} = L_bit;
let Inst{19-16} = Rn;
- let Inst{15-0} = regs;
+ let Inst{15} = 0;
+ let Inst{14-0} = regs{14-0};
}
def DB_UPD :
T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
- itin_upd, !strconcat(asm, "db${p}.w\t$Rn, $regs"), "$Rn = $wb", []> {
+ itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
bits<4> Rn;
bits<16> regs;
@@ -1567,17 +1586,95 @@ multiclass thumb2_ldst_mult<string asm, InstrItinClass itin,
let Inst{21} = 1; // Writeback
let Inst{20} = L_bit;
let Inst{19-16} = Rn;
- let Inst{15-0} = regs;
+ let Inst{15} = 0;
+ let Inst{14-0} = regs{14-0};
}
}
let neverHasSideEffects = 1 in {
let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
-defm t2LDM : thumb2_ldst_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+defm t2LDM : thumb2_ld_mult<"ldm", IIC_iLoad_m, IIC_iLoad_mu, 1>;
+
+multiclass thumb2_st_mult<string asm, InstrItinClass itin,
+ InstrItinClass itin_upd, bit L_bit> {
+ def IA :
+ T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin, !strconcat(asm, "${p}.w\t$Rn, $regs"), []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{22} = 0;
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+ def IA_UPD :
+ T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin_upd, !strconcat(asm, "${p}.w\t$Rn!, $regs"), "$Rn = $wb", []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b01; // Increment After
+ let Inst{22} = 0;
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+ def DB :
+ T2XI<(outs), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin, !strconcat(asm, "db${p}\t$Rn, $regs"), []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{22} = 0;
+ let Inst{21} = 0; // No writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+ def DB_UPD :
+ T2XIt<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, reglist:$regs, variable_ops),
+ itin_upd, !strconcat(asm, "db${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+ bits<4> Rn;
+ bits<16> regs;
+
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b00;
+ let Inst{24-23} = 0b10; // Decrement Before
+ let Inst{22} = 0;
+ let Inst{21} = 1; // Writeback
+ let Inst{20} = L_bit;
+ let Inst{19-16} = Rn;
+ let Inst{15} = 0;
+ let Inst{14} = regs{14};
+ let Inst{13} = 0;
+ let Inst{12-0} = regs{12-0};
+ }
+}
+
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
-defm t2STM : thumb2_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
+defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
} // neverHasSideEffects
@@ -1587,7 +1684,7 @@ defm t2STM : thumb2_ldst_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
//
let neverHasSideEffects = 1 in
-def t2MOVr : T2sTwoReg<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
"mov", ".w\t$Rd, $Rm", []> {
let Inst{31-27} = 0b11101;
let Inst{26-25} = 0b01;
@@ -1596,6 +1693,10 @@ def t2MOVr : T2sTwoReg<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVr,
let Inst{14-12} = 0b000;
let Inst{7-4} = 0b0000;
}
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+ pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+ pred:$p, CPSR)>;
// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
@@ -1610,12 +1711,20 @@ def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
let Inst{15} = 0;
}
-def : InstAlias<"mov${s}${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
- pred:$p, cc_out:$s)>,
- Requires<[IsThumb2]>;
+// cc_out is handled as part of the explicit mnemonic in the parser for 'mov'.
+// Use aliases to get that to play nice here.
+def : t2InstAlias<"movs${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, CPSR)>;
+def : t2InstAlias<"movs${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, CPSR)>;
+
+def : t2InstAlias<"mov${p}.w $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, zero_reg)>;
+def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
+ pred:$p, zero_reg)>;
let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
-def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins i32imm_hilo16:$imm), IIC_iMOVi,
+def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
"movw", "\t$Rd, $imm",
[(set rGPR:$Rd, imm0_65535:$imm)]> {
let Inst{31-27} = 0b11110;
@@ -1632,6 +1741,7 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins i32imm_hilo16:$imm), IIC_iMOVi,
let Inst{26} = imm{11};
let Inst{14-12} = imm{10-8};
let Inst{7-0} = imm{7-0};
+ let DecoderMethod = "DecodeT2MOVTWInstruction";
}
def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
@@ -1639,7 +1749,7 @@ def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
let Constraints = "$src = $Rd" in {
def t2MOVTi16 : T2I<(outs rGPR:$Rd),
- (ins rGPR:$src, i32imm_hilo16:$imm), IIC_iMOVi,
+ (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
"movt", "\t$Rd, $imm",
[(set rGPR:$Rd,
(or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> {
@@ -1657,6 +1767,7 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
let Inst{26} = imm{11};
let Inst{14-12} = imm{10-8};
let Inst{7-0} = imm{7-0};
+ let DecoderMethod = "DecodeT2MOVTWInstruction";
}
def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
@@ -1671,28 +1782,26 @@ def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
// Sign extenders
-defm t2SXTB : T2I_ext_rrot<0b100, "sxtb",
+def t2SXTB : T2I_ext_rrot<0b100, "sxtb",
UnOpFrag<(sext_inreg node:$Src, i8)>>;
-defm t2SXTH : T2I_ext_rrot<0b000, "sxth",
+def t2SXTH : T2I_ext_rrot<0b000, "sxth",
UnOpFrag<(sext_inreg node:$Src, i16)>>;
-defm t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
-defm t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-defm t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
-defm t2SXTAB16 : T2I_exta_rrot_DO<0b010, "sxtab16">;
-
-// TODO: SXT(A){B|H}16 - done for disassembly only
+def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
// Zero extenders
let AddedComplexity = 16 in {
-defm t2UXTB : T2I_ext_rrot<0b101, "uxtb",
+def t2UXTB : T2I_ext_rrot<0b101, "uxtb",
UnOpFrag<(and node:$Src, 0x000000FF)>>;
-defm t2UXTH : T2I_ext_rrot<0b001, "uxth",
+def t2UXTH : T2I_ext_rrot<0b001, "uxth",
UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-defm t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
+def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
// FIXME: This pattern incorrectly assumes the shl operator is a rotate.
@@ -1700,17 +1809,17 @@ defm t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
// instead so we can include a check for masking back in the upper
// eight bits of the source into the lower eight bits of the result.
//def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
-// (t2UXTB16r_rot rGPR:$Src, 24)>,
+// (t2UXTB16 rGPR:$Src, 3)>,
// Requires<[HasT2ExtractPack, IsThumb2]>;
def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
- (t2UXTB16r_rot rGPR:$Src, 8)>,
+ (t2UXTB16 rGPR:$Src, 1)>,
Requires<[HasT2ExtractPack, IsThumb2]>;
-defm t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-defm t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
-defm t2UXTAB16 : T2I_exta_rrot_DO<0b011, "uxtab16">;
+def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
}
//===----------------------------------------------------------------------===//
@@ -1723,27 +1832,37 @@ defm t2SUB : T2I_bin_ii12rs<0b101, "sub",
BinOpFrag<(sub node:$LHS, node:$RHS)>>;
// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
+//
+// Currently, t2ADDS/t2SUBS are pseudo opcodes that exist only in the
+// selection DAG. They are "lowered" to real t2ADD/t2SUB opcodes by
+// AdjustInstrPostInstrSelection where we determine whether or not to
+// set the "s" bit based on CPSR liveness.
+//
+// FIXME: Eliminate t2ADDS/t2SUBS pseudo opcodes after adding tablegen
+// support for an optional CPSR definition that corresponds to the DAG
+// node's second value. We can then eliminate the implicit def of CPSR.
defm t2ADDS : T2I_bin_s_irs <0b1000, "add",
IIC_iALUi, IIC_iALUr, IIC_iALUsi,
- BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
+ BinOpFrag<(ARMaddc node:$LHS, node:$RHS)>, 1>;
defm t2SUBS : T2I_bin_s_irs <0b1101, "sub",
IIC_iALUi, IIC_iALUr, IIC_iALUsi,
- BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+ BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
+let hasPostISelHook = 1 in {
defm t2ADC : T2I_adde_sube_irs<0b1010, "adc",
- BinOpFrag<(adde_dead_carry node:$LHS, node:$RHS)>, 1>;
+ BinOpWithFlagFrag<(ARMadde node:$LHS, node:$RHS, node:$FLAG)>, 1>;
defm t2SBC : T2I_adde_sube_irs<0b1011, "sbc",
- BinOpFrag<(sube_dead_carry node:$LHS, node:$RHS)>>;
-defm t2ADCS : T2I_adde_sube_s_irs<BinOpFrag<(adde_live_carry node:$LHS,
- node:$RHS)>, 1>;
-defm t2SBCS : T2I_adde_sube_s_irs<BinOpFrag<(sube_live_carry node:$LHS,
- node:$RHS)>>;
+ BinOpWithFlagFrag<(ARMsube node:$LHS, node:$RHS, node:$FLAG)>>;
+}
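
For reference, the CPSR-liveness decision described above can be modeled in a
few lines of C++. This is an illustrative toy, not the actual LLVM hook; the
Inst type, the opcode strings, and the flat block representation are all
invented for the sketch.

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

// Toy model: a pseudo t2ADDS at position i is lowered to a plain t2ADD when
// CPSR is dead, i.e. no later instruction reads the flags before they are
// redefined. This mirrors the decision described above, not LLVM's API.
struct Inst {
  std::string opcode;
  bool definesFlags; // would write CPSR (the "s" bit)
  bool readsFlags;   // consumes CPSR (e.g. adc/sbc, predicated code)
};

void adjustPostISel(std::vector<Inst> &block, std::size_t i) {
  bool cpsrLive = false;
  for (std::size_t j = i + 1; j < block.size(); ++j) {
    if (block[j].readsFlags) { cpsrLive = true; break; }
    if (block[j].definesFlags) break; // clobbered before any use
  }
  if (!cpsrLive && block[i].opcode == "t2ADDSri") {
    block[i].opcode = "t2ADDri"; // lower the pseudo to the real opcode
    block[i].definesFlags = false;
  }
}

int main() {
  std::vector<Inst> b = {{"t2ADDSri", true, false}, {"t2MOVr", false, false}};
  adjustPostISel(b, 0);
  assert(b[0].opcode == "t2ADDri"); // flags were dead, so "s" is dropped
  return 0;
}
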
// RSB
defm t2RSB : T2I_rbin_irs <0b1110, "rsb",
BinOpFrag<(sub node:$LHS, node:$RHS)>>;
+
+// FIXME: Eliminate these if we can write def : Pat patterns that define
+// CPSR, so that the implicit def of CPSR is no longer needed.
defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb",
- BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+ BinOpFrag<(ARMsubc node:$LHS, node:$RHS)>>;
// (sub X, imm) gets canonicalized to (add X, -imm). Match this form.
// The assume-no-carry-in form uses the negation of the input since add/sub
@@ -1760,23 +1879,18 @@ def : T2Pat<(add GPR:$src, t2_so_imm_neg:$imm),
def : T2Pat<(add GPR:$src, imm0_4095_neg:$imm),
(t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>;
let AddedComplexity = 1 in
-def : T2Pat<(addc rGPR:$src, imm0_255_neg:$imm),
+def : T2Pat<(ARMaddc rGPR:$src, imm0_255_neg:$imm),
(t2SUBSri rGPR:$src, imm0_255_neg:$imm)>;
-def : T2Pat<(addc rGPR:$src, t2_so_imm_neg:$imm),
+def : T2Pat<(ARMaddc rGPR:$src, t2_so_imm_neg:$imm),
(t2SUBSri rGPR:$src, t2_so_imm_neg:$imm)>;
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
// for part of the negation.
let AddedComplexity = 1 in
-def : T2Pat<(adde_dead_carry rGPR:$src, imm0_255_not:$imm),
+def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR),
(t2SBCri rGPR:$src, imm0_255_not:$imm)>;
-def : T2Pat<(adde_dead_carry rGPR:$src, t2_so_imm_not:$imm),
+def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR),
(t2SBCri rGPR:$src, t2_so_imm_not:$imm)>;
-let AddedComplexity = 1 in
-def : T2Pat<(adde_live_carry rGPR:$src, imm0_255_not:$imm),
- (t2SBCSri rGPR:$src, imm0_255_not:$imm)>;
-def : T2Pat<(adde_live_carry rGPR:$src, t2_so_imm_not:$imm),
- (t2SBCSri rGPR:$src, t2_so_imm_not:$imm)>;
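
The two's-complement identity behind that remark is easy to verify; the
following is a minimal self-contained C++ check, illustrative only.

#include <cassert>
#include <cstdint>

// SBC computes Rn - imm - (1 - C). Since -imm - 1 == ~imm in two's
// complement, that equals Rn + ~imm + C, which is why the ARMadde patterns
// above match the bitwise NOT of the immediate rather than its negation.
int main() {
  uint32_t rn = 1000, imm = 37;
  for (uint32_t c = 0; c <= 1; ++c)
    assert(rn + ~imm + c == rn - imm - (1 - c));
  return 0;
}
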
// Select Bytes -- for disassembly only
@@ -1893,8 +2007,7 @@ class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops,
let Inst{7-4} = op7_4;
}
-// Unsigned Sum of Absolute Differences [and Accumulate] -- for disassembly only
-
+// Unsigned Sum of Absolute Differences [and Accumulate].
def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm),
NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>,
@@ -1906,8 +2019,7 @@ def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
"usada8", "\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsThumb2, HasThumb2DSP]>;
-// Signed/Unsigned saturate -- for disassembly only
-
+// Signed/Unsigned saturate.
class T2SatI<dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T2I<oops, iops, itin, opc, asm, pattern> {
@@ -1918,26 +2030,26 @@ class T2SatI<dag oops, dag iops, InstrItinClass itin,
let Inst{11-8} = Rd;
let Inst{19-16} = Rn;
- let Inst{4-0} = sat_imm{4-0};
- let Inst{21} = sh{6};
+ let Inst{4-0} = sat_imm;
+ let Inst{21} = sh{5};
let Inst{14-12} = sh{4-2};
let Inst{7-6} = sh{1-0};
}
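
As a reference for the $sh operand wiring above, here is a small
self-contained C++ sketch of where the six operand bits land in the
instruction word; it assumes bit 5 is the LSL/ASR selector and the low five
bits are the shift amount.

#include <cassert>
#include <cstdint>

// Mirror of the let statements above: sh{5} -> Inst{21}, sh{4-2} ->
// Inst{14-12} (imm3), sh{1-0} -> Inst{7-6} (imm2).
static uint32_t placeSatShift(unsigned sh) {
  assert(sh < 64);
  return ((sh >> 5) & 1u) << 21   // shift type -> Inst{21}
       | ((sh >> 2) & 7u) << 12   // sh{4-2}    -> Inst{14-12}
       | (sh & 3u) << 6;          // sh{1-0}    -> Inst{7-6}
}

int main() {
  // asr #3 -> sh = 0b100011: selector bit set, amount 3
  assert(placeSatShift(0x23) == ((1u << 21) | (3u << 6)));
  return 0;
}
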
def t2SSAT: T2SatI<
- (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
- NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh",
- [/* For disassembly only; pattern left blank */]> {
+ (outs rGPR:$Rd),
+ (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1100;
let Inst{20} = 0;
let Inst{15} = 0;
+ let Inst{5} = 0;
}
def t2SSAT16: T2SatI<
- (outs rGPR:$Rd), (ins ssat_imm:$sat_imm, rGPR:$Rn), NoItinerary,
- "ssat16", "\t$Rd, $sat_imm, $Rn",
- [/* For disassembly only; pattern left blank */]>,
+ (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
+ "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
Requires<[IsThumb2, HasThumb2DSP]> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1100;
@@ -1946,30 +2058,30 @@ def t2SSAT16: T2SatI<
let Inst{21} = 1; // sh = '1'
let Inst{14-12} = 0b000; // imm3 = '000'
let Inst{7-6} = 0b00; // imm2 = '00'
+ let Inst{5-4} = 0b00;
}
def t2USAT: T2SatI<
- (outs rGPR:$Rd), (ins i32imm:$sat_imm, rGPR:$Rn, shift_imm:$sh),
- NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh",
- [/* For disassembly only; pattern left blank */]> {
+ (outs rGPR:$Rd),
+ (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+ NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []> {
let Inst{31-27} = 0b11110;
let Inst{25-22} = 0b1110;
let Inst{20} = 0;
let Inst{15} = 0;
}
-def t2USAT16: T2SatI<(outs rGPR:$dst), (ins i32imm:$sat_imm, rGPR:$Rn),
+def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
NoItinerary,
- "usat16", "\t$dst, $sat_imm, $Rn",
- [/* For disassembly only; pattern left blank */]>,
+ "usat16", "\t$Rd, $sat_imm, $Rn", []>,
Requires<[IsThumb2, HasThumb2DSP]> {
- let Inst{31-27} = 0b11110;
- let Inst{25-22} = 0b1110;
+ let Inst{31-22} = 0b1111001110;
let Inst{20} = 0;
let Inst{15} = 0;
let Inst{21} = 1; // sh = '1'
let Inst{14-12} = 0b000; // imm3 = '000'
let Inst{7-6} = 0b00; // imm2 = '00'
+ let Inst{5-4} = 0b00;
}
def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>;
@@ -1979,10 +2091,14 @@ def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>;
// Shift and rotate Instructions.
//
-defm t2LSL : T2I_sh_ir<0b00, "lsl", BinOpFrag<(shl node:$LHS, node:$RHS)>>;
-defm t2LSR : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl node:$LHS, node:$RHS)>>;
-defm t2ASR : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra node:$LHS, node:$RHS)>>;
-defm t2ROR : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>;
+defm t2LSL : T2I_sh_ir<0b00, "lsl", imm0_31,
+ BinOpFrag<(shl node:$LHS, node:$RHS)>, "t2LSL">;
+defm t2LSR : T2I_sh_ir<0b01, "lsr", imm_sr,
+ BinOpFrag<(srl node:$LHS, node:$RHS)>, "t2LSR">;
+defm t2ASR : T2I_sh_ir<0b10, "asr", imm_sr,
+ BinOpFrag<(sra node:$LHS, node:$RHS)>, "t2ASR">;
+defm t2ROR : T2I_sh_ir<0b11, "ror", imm0_31,
+ BinOpFrag<(rotr node:$LHS, node:$RHS)>, "t2ROR">;
// (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
def : Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
@@ -2090,7 +2206,7 @@ def t2BFC : T2BitFI<(outs rGPR:$Rd), (ins rGPR:$src, bf_inv_mask_imm:$imm),
}
def t2SBFX: T2TwoRegBitFI<
- (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb),
+ (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
IIC_iUNAsi, "sbfx", "\t$Rd, $Rn, $lsb, $msb", []> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
@@ -2099,7 +2215,7 @@ def t2SBFX: T2TwoRegBitFI<
}
def t2UBFX: T2TwoRegBitFI<
- (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm0_31_m1:$msb),
+ (outs rGPR:$Rd), (ins rGPR:$Rn, imm0_31:$lsb, imm1_32:$msb),
IIC_iUNAsi, "ubfx", "\t$Rd, $Rn, $lsb, $msb", []> {
let Inst{31-27} = 0b11110;
let Inst{25} = 1;
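
The operand change here reflects that the value written for $msb in assembly
is really a field width in the range 1-32 (the encoding stores width minus
one, which is what imm1_32 models). A self-contained C++ model of the
ubfx/sbfx semantics, for illustration only:

#include <cassert>
#include <cstdint>

// UBFX Rd, Rn, #lsb, #width extracts 'width' bits starting at 'lsb' and
// zero-extends the field; SBFX sign-extends it instead.
static uint32_t ubfx(uint32_t rn, unsigned lsb, unsigned width) {
  assert(width >= 1 && width <= 32 && lsb + width <= 32);
  uint64_t mask = (width == 32) ? 0xFFFFFFFFull : ((1ull << width) - 1);
  return (uint32_t)((rn >> lsb) & mask);
}

static int32_t sbfx(uint32_t rn, unsigned lsb, unsigned width) {
  uint32_t field = ubfx(rn, lsb, width);
  uint32_t sign = 1u << (width - 1);
  return (int32_t)((field ^ sign) - sign); // sign-extend 'width' bits
}

int main() {
  assert(ubfx(0xABCD1234u, 8, 8) == 0x12);
  assert(sbfx(0x0000FF00u, 8, 8) == -1);
  return 0;
}
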
@@ -2125,26 +2241,6 @@ let Constraints = "$src = $Rd" in {
let msb{4-0} = imm{9-5};
let lsb{4-0} = imm{4-0};
}
-
- // GNU as only supports this form of bfi (w/ 4 arguments)
- let isAsmParserOnly = 1 in
- def t2BFI4p : T2TwoRegBitFI<(outs rGPR:$Rd),
- (ins rGPR:$src, rGPR:$Rn, lsb_pos_imm:$lsbit,
- width_imm:$width),
- IIC_iBITi, "bfi", "\t$Rd, $Rn, $lsbit, $width",
- []> {
- let Inst{31-27} = 0b11110;
- let Inst{26} = 0; // should be 0.
- let Inst{25} = 1;
- let Inst{24-20} = 0b10110;
- let Inst{15} = 0;
- let Inst{5} = 0; // should be 0.
-
- bits<5> lsbit;
- bits<5> width;
- let msb{4-0} = width; // Custom encoder => lsb+width-1
- let lsb{4-0} = lsbit;
- }
}
defm t2ORN : T2I_bin_irs<0b0011, "orn",
@@ -2152,13 +2248,53 @@ defm t2ORN : T2I_bin_irs<0b0011, "orn",
BinOpFrag<(or node:$LHS, (not node:$RHS))>,
"t2ORN", 0, "">;
+/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// unary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+multiclass T2I_un_irs<bits<4> opcod, string opc,
+ InstrItinClass iii, InstrItinClass iir, InstrItinClass iis,
+ PatFrag opnode, bit Cheap = 0, bit ReMat = 0> {
+ // shifted imm
+ def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
+ opc, "\t$Rd, $imm",
+ [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
+ let isAsCheapAsAMove = Cheap;
+ let isReMaterializable = ReMat;
+ let Inst{31-27} = 0b11110;
+ let Inst{25} = 0;
+ let Inst{24-21} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{15} = 0;
+ }
+ // register
+ def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
+ opc, ".w\t$Rd, $Rm",
+ [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ let Inst{14-12} = 0b000; // imm3
+ let Inst{7-6} = 0b00; // imm2
+ let Inst{5-4} = 0b00; // type
+ }
+ // shifted register
+ def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
+ opc, ".w\t$Rd, $ShiftedRm",
+ [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
+ let Inst{31-27} = 0b11101;
+ let Inst{26-25} = 0b01;
+ let Inst{24-21} = opcod;
+ let Inst{19-16} = 0b1111; // Rn
+ }
+}
+
// Prefer t2MVN over t2EORri ra, rb, -1 because mvn has a 16-bit version
let AddedComplexity = 1 in
defm t2MVN : T2I_un_irs <0b0011, "mvn",
IIC_iMVNi, IIC_iMVNr, IIC_iMVNsi,
UnOpFrag<(not node:$Src)>, 1, 1>;
-
let AddedComplexity = 1 in
def : T2Pat<(and rGPR:$src, t2_so_imm_not:$imm),
(t2BICri rGPR:$src, t2_so_imm_not:$imm)>;
@@ -2209,9 +2345,9 @@ def t2MLS: T2FourReg<
let neverHasSideEffects = 1 in {
let isCommutable = 1 in {
def t2SMULL : T2MulLong<0b000, 0b0000,
- (outs rGPR:$Rd, rGPR:$Ra),
+ (outs rGPR:$RdLo, rGPR:$RdHi),
(ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
- "smull", "\t$Rd, $Ra, $Rn, $Rm", []>;
+ "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
def t2UMULL : T2MulLong<0b010, 0b0000,
(outs rGPR:$RdLo, rGPR:$RdHi),
@@ -2468,7 +2604,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
-// Halfword multiple accumulate long: SMLAL<x><y> -- for disassembly only
+// Halfword multiply accumulate long: SMLAL<x><y>
def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm",
[/* For disassembly only; pattern left blank */]>,
@@ -2487,8 +2623,6 @@ def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd),
Requires<[IsThumb2, HasThumb2DSP]>;
// Dual halfword multiply: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
-// These are for disassembly only.
-
def t2SMUAD: T2ThreeReg_mac<
0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm),
IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>,
@@ -2513,7 +2647,7 @@ def t2SMUSDX:T2ThreeReg_mac<
Requires<[IsThumb2, HasThumb2DSP]> {
let Inst{15-12} = 0b1111;
}
-def t2SMLAD : T2ThreeReg_mac<
+def t2SMLAD : T2FourReg_mac<
0, 0b010, 0b0000, (outs rGPR:$Rd),
(ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad",
"\t$Rd, $Rn, $Rm, $Ra", []>,
@@ -2532,20 +2666,20 @@ def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd),
"\t$Rd, $Rn, $Rm, $Ra", []>,
Requires<[IsThumb2, HasThumb2DSP]>;
def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
- (ins rGPR:$Rm, rGPR:$Rn), IIC_iMAC64, "smlald",
- "\t$Ra, $Rd, $Rm, $Rn", []>,
+ (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald",
+ "\t$Ra, $Rd, $Rn, $Rm", []>,
Requires<[IsThumb2, HasThumb2DSP]>;
def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
- (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlaldx",
- "\t$Ra, $Rd, $Rm, $Rn", []>,
+ (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx",
+ "\t$Ra, $Rd, $Rn, $Rm", []>,
Requires<[IsThumb2, HasThumb2DSP]>;
def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd),
- (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsld",
- "\t$Ra, $Rd, $Rm, $Rn", []>,
+ (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld",
+ "\t$Ra, $Rd, $Rn, $Rm", []>,
Requires<[IsThumb2, HasThumb2DSP]>;
def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
(ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx",
- "\t$Ra, $Rd, $Rm, $Rn", []>,
+ "\t$Ra, $Rd, $Rn, $Rm", []>,
Requires<[IsThumb2, HasThumb2DSP]>;
//===----------------------------------------------------------------------===//
@@ -2613,10 +2747,10 @@ def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
(t2REVSH rGPR:$Rm)>;
def t2PKHBT : T2ThreeReg<
- (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh),
+ (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_lsl_amt:$sh),
IIC_iBITsi, "pkhbt", "\t$Rd, $Rn, $Rm$sh",
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
- (and (shl rGPR:$Rm, lsl_amt:$sh),
+ (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
0xFFFF0000)))]>,
Requires<[HasT2ExtractPack, IsThumb2]> {
let Inst{31-27} = 0b11101;
@@ -2625,9 +2759,9 @@ def t2PKHBT : T2ThreeReg<
let Inst{5} = 0; // BT form
let Inst{4} = 0;
- bits<8> sh;
- let Inst{14-12} = sh{7-5};
- let Inst{7-6} = sh{4-3};
+ bits<5> sh;
+ let Inst{14-12} = sh{4-2};
+ let Inst{7-6} = sh{1-0};
}
// Alternate cases for PKHBT where identities eliminate some nodes.
@@ -2635,16 +2769,16 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
(t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
- (t2PKHBT rGPR:$src1, rGPR:$src2, (lsl_shift_imm imm16_31:$sh))>,
+ (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
Requires<[HasT2ExtractPack, IsThumb2]>;
// Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
// will match the pattern below.
def t2PKHTB : T2ThreeReg<
- (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, shift_imm:$sh),
+ (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, pkh_asr_amt:$sh),
IIC_iBITsi, "pkhtb", "\t$Rd, $Rn, $Rm$sh",
[(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
- (and (sra rGPR:$Rm, asr_amt:$sh),
+ (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
0xFFFF)))]>,
Requires<[HasT2ExtractPack, IsThumb2]> {
let Inst{31-27} = 0b11101;
@@ -2653,19 +2787,19 @@ def t2PKHTB : T2ThreeReg<
let Inst{5} = 1; // TB form
let Inst{4} = 0;
- bits<8> sh;
- let Inst{14-12} = sh{7-5};
- let Inst{7-6} = sh{4-3};
+ bits<5> sh;
+ let Inst{14-12} = sh{4-2};
+ let Inst{7-6} = sh{1-0};
}
// Alternate cases for PKHTB where identities eliminate some nodes. Note that
// a shift amount of 0 is *not legal* here; that form is PKHBT instead.
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)),
- (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm16_31:$sh))>,
+ (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
Requires<[HasT2ExtractPack, IsThumb2]>;
def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
(and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
- (t2PKHTB rGPR:$src1, rGPR:$src2, (asr_shift_imm imm1_15:$sh))>,
+ (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
Requires<[HasT2ExtractPack, IsThumb2]>;
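
For reference, a behavioral C++ sketch of the pack-halfword operations these
patterns target; it is illustrative only, and the PKHTB shift amount is
assumed to be in 1..31 since an amount of 0 denotes PKHBT, as noted above.

#include <cassert>
#include <cstdint>

// PKHBT keeps the bottom half of Rn and the (optionally left-shifted) top
// half of Rm; PKHTB keeps the top half of Rn and the (arithmetically
// right-shifted) bottom half of Rm.
static uint32_t pkhbt(uint32_t rn, uint32_t rm, unsigned lsl) {
  return (rn & 0x0000FFFFu) | ((rm << lsl) & 0xFFFF0000u);
}
static uint32_t pkhtb(uint32_t rn, uint32_t rm, unsigned asr) {
  return (rn & 0xFFFF0000u) | ((uint32_t)((int32_t)rm >> asr) & 0x0000FFFFu);
}

int main() {
  assert(pkhbt(0x1111AAAAu, 0x00002222u, 16) == 0x2222AAAAu);
  assert(pkhtb(0xAAAA1111u, 0xBBBB2222u, 16) == 0xAAAABBBBu);
  return 0;
}
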
//===----------------------------------------------------------------------===//
@@ -2673,14 +2807,14 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
//
defm t2CMP : T2I_cmp_irs<0b1101, "cmp",
IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
- BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+ BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>, "t2CMP">;
-def : T2Pat<(ARMcmpZ GPR:$lhs, t2_so_imm:$imm),
- (t2CMPri GPR:$lhs, t2_so_imm:$imm)>;
-def : T2Pat<(ARMcmpZ GPR:$lhs, rGPR:$rhs),
- (t2CMPrr GPR:$lhs, rGPR:$rhs)>;
-def : T2Pat<(ARMcmpZ GPR:$lhs, t2_so_reg:$rhs),
- (t2CMPrs GPR:$lhs, t2_so_reg:$rhs)>;
+def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_imm:$imm),
+ (t2CMPri GPRnopc:$lhs, t2_so_imm:$imm)>;
+def : T2Pat<(ARMcmpZ GPRnopc:$lhs, rGPR:$rhs),
+ (t2CMPrr GPRnopc:$lhs, rGPR:$rhs)>;
+def : T2Pat<(ARMcmpZ GPRnopc:$lhs, t2_so_reg:$rhs),
+ (t2CMPrs GPRnopc:$lhs, t2_so_reg:$rhs)>;
//FIXME: Disable CMN, as CCodes are backwards from compare expectations
// Compare-to-zero still works out, just not the relationals
@@ -2688,20 +2822,23 @@ def : T2Pat<(ARMcmpZ GPR:$lhs, t2_so_reg:$rhs),
// BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
defm t2CMNz : T2I_cmp_irs<0b1000, "cmn",
IIC_iCMPi, IIC_iCMPr, IIC_iCMPsi,
- BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
+ BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>,
+ "t2CMNz">;
//def : T2Pat<(ARMcmp GPR:$src, t2_so_imm_neg:$imm),
// (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
-def : T2Pat<(ARMcmpZ GPR:$src, t2_so_imm_neg:$imm),
- (t2CMNzri GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(ARMcmpZ GPRnopc:$src, t2_so_imm_neg:$imm),
+ (t2CMNzri GPRnopc:$src, t2_so_imm_neg:$imm)>;
defm t2TST : T2I_cmp_irs<0b0000, "tst",
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
- BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>>;
+ BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>,
+ "t2TST">;
defm t2TEQ : T2I_cmp_irs<0b0100, "teq",
IIC_iTSTi, IIC_iTSTr, IIC_iTSTsi,
- BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
+ BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>,
+ "t2TEQ">;
// Conditional moves
// FIXME: should be able to write a pattern for ARMcmov, but can't use
@@ -2723,7 +2860,7 @@ def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd),
// FIXME: Pseudo-ize these. For now, just mark codegen only.
let isCodeGenOnly = 1 in {
let isMoveImm = 1 in
-def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, i32imm_hilo16:$imm),
+def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm),
IIC_iCMOVi,
"movw", "\t$Rd, $imm", []>,
RegConstraint<"$false = $Rd"> {
@@ -2807,20 +2944,19 @@ def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
}
def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
- "dsb", "\t$opt",
- [/* For disassembly only; pattern left blank */]>,
+ "dsb", "\t$opt", []>,
Requires<[IsThumb, HasDB]> {
bits<4> opt;
let Inst{31-4} = 0xf3bf8f4;
let Inst{3-0} = opt;
}
-// ISB has only full system option -- for disassembly only
-def t2ISB : AInoP<(outs), (ins), ThumbFrm, NoItinerary, "isb", "",
- [/* For disassembly only; pattern left blank */]>,
- Requires<[IsThumb2, HasV7]> {
+def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
+ "isb", "\t$opt",
+ []>, Requires<[IsThumb2, HasDB]> {
+ bits<4> opt;
let Inst{31-4} = 0xf3bf8f6;
- let Inst{3-0} = 0b1111;
+ let Inst{3-0} = opt;
}
class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
@@ -2858,28 +2994,27 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
}
let mayLoad = 1 in {
-def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexb", "\t$Rt, $addr", "", []>;
-def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexh", "\t$Rt, $addr", "", []>;
-def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_reg:$addr),
+def t2LDREX : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
AddrModeNone, 4, NoItinerary,
"ldrex", "\t$Rt, $addr", "", []> {
+ bits<4> Rt;
+ bits<12> addr;
let Inst{31-27} = 0b11101;
let Inst{26-20} = 0b0000101;
- let Inst{11-8} = 0b1111;
- let Inst{7-0} = 0b00000000; // imm8 = 0
-
- bits<4> Rt;
- bits<4> addr;
- let Inst{19-16} = addr;
+ let Inst{19-16} = addr{11-8};
let Inst{15-12} = Rt;
+ let Inst{11-8} = 0b1111;
+ let Inst{7-0} = addr{7-0};
}
let hasExtraDefRegAllocReq = 1 in
def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
- (ins t2addrmode_reg:$addr),
+ (ins addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"ldrexd", "\t$Rt, $Rt2, $addr", "",
[], {?, ?, ?, ?}> {
@@ -2890,33 +3025,33 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
- (ins rGPR:$Rt, t2addrmode_reg:$addr),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexb", "\t$Rd, $Rt, $addr", "", []>;
def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
- (ins rGPR:$Rt, t2addrmode_reg:$addr),
+ (ins rGPR:$Rt, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexh", "\t$Rd, $Rt, $addr", "", []>;
-def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt, t2addrmode_reg:$addr),
+def t2STREX : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+ t2addrmode_imm0_1020s4:$addr),
AddrModeNone, 4, NoItinerary,
"strex", "\t$Rd, $Rt, $addr", "",
[]> {
- let Inst{31-27} = 0b11101;
- let Inst{26-20} = 0b0000100;
- let Inst{7-0} = 0b00000000; // imm8 = 0
-
bits<4> Rd;
- bits<4> addr;
bits<4> Rt;
- let Inst{11-8} = Rd;
- let Inst{19-16} = addr;
+ bits<12> addr;
+ let Inst{31-27} = 0b11101;
+ let Inst{26-20} = 0b0000100;
+ let Inst{19-16} = addr{11-8};
let Inst{15-12} = Rt;
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = addr{7-0};
}
}
let hasExtraSrcRegAllocReq = 1, Constraints = "@earlyclobber $Rd" in
def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
- (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_reg:$addr),
+ (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
AddrModeNone, 4, NoItinerary,
"strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
{?, ?, ?, ?}> {
@@ -2924,9 +3059,7 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
let Inst{11-8} = Rt2;
}
-// Clear-Exclusive is for disassembly only.
-def t2CLREX : T2XI<(outs), (ins), NoItinerary, "clrex",
- [/* For disassembly only; pattern left blank */]>,
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>,
Requires<[IsThumb2, HasV7]> {
let Inst{31-16} = 0xf3bf;
let Inst{15-14} = 0b10;
@@ -2986,8 +3119,8 @@ def t2LDMIA_RET: t2PseudoExpand<(outs GPR:$wb), (ins GPR:$Rn, pred:$p,
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
let isPredicable = 1 in
-def t2B : T2XI<(outs), (ins uncondbrtarget:$target), IIC_Br,
- "b.w\t$target",
+def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
+ "b", ".w\t$target",
[(br bb:$target)]> {
let Inst{31-27} = 0b11110;
let Inst{15-14} = 0b10;
@@ -3009,15 +3142,13 @@ def t2BR_JT : t2PseudoInst<(outs),
// FIXME: Add a non-pc based case that can be predicated.
def t2TBB_JT : t2PseudoInst<(outs),
- (ins GPR:$index, i32imm:$jt, i32imm:$id),
- 0, IIC_Br, []>;
+ (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
def t2TBH_JT : t2PseudoInst<(outs),
- (ins GPR:$index, i32imm:$jt, i32imm:$id),
- 0, IIC_Br, []>;
+ (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
-def t2TBB : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
- "tbb", "\t[$Rn, $Rm]", []> {
+def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
+ "tbb", "\t$addr", []> {
bits<4> Rn;
bits<4> Rm;
let Inst{31-20} = 0b111010001101;
@@ -3025,10 +3156,12 @@ def t2TBB : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
let Inst{15-5} = 0b11110000000;
let Inst{4} = 0; // B form
let Inst{3-0} = Rm;
+
+ let DecoderMethod = "DecodeThumbTableBranch";
}
-def t2TBH : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
- "tbh", "\t[$Rn, $Rm, lsl #1]", []> {
+def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
+ "tbh", "\t$addr", []> {
bits<4> Rn;
bits<4> Rm;
let Inst{31-20} = 0b111010001101;
@@ -3036,13 +3169,15 @@ def t2TBH : T2I<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_Br,
let Inst{15-5} = 0b11110000000;
let Inst{4} = 1; // H form
let Inst{3-0} = Rm;
+
+ let DecoderMethod = "DecodeThumbTableBranch";
}
} // isNotDuplicable, isIndirectBranch
} // isBranch, isTerminator, isBarrier
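
For readers unfamiliar with the table-branch instructions behind the new
addrmode_tbb/addrmode_tbh operands, a simplified behavioral C++ sketch
follows; "pc" stands for the value the PC reads as at the branch, and
alignment and ISA-state details are ignored.

#include <cassert>
#include <cstdint>

// TBB loads a byte from the table at Rn indexed by Rm and branches forward
// by twice that value; TBH does the same with halfword entries indexed by
// 2*Rm. With Rn = PC the table sits right after the instruction, which is
// how the t2TBB_JT/t2TBH_JT pseudos above use it for jump tables.
static uint32_t tbb(uint32_t pc, const uint8_t *table, uint32_t rm) {
  return pc + 2 * table[rm];
}
static uint32_t tbh(uint32_t pc, const uint16_t *table, uint32_t rm) {
  return pc + 2 * table[rm];
}

int main() {
  const uint8_t  bt[] = {0, 4, 9};
  const uint16_t ht[] = {0, 300};
  assert(tbb(0x8000, bt, 2) == 0x8012);
  assert(tbh(0x8000, ht, 1) == 0x8258);
  return 0;
}
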
// FIXME: should be able to write a pattern for ARMBrcond, but can't use
-// a two-value operand where a dag node expects two operands. :(
+// a two-value operand where a dag node expects two operands. :(
let isBranch = 1, isTerminator = 1 in
def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
"b", ".w\t$target",
@@ -3060,6 +3195,8 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
let Inst{13} = target{18};
let Inst{21-16} = target{17-12};
let Inst{10-0} = target{11-1};
+
+ let DecoderMethod = "DecodeThumb2BCCInstruction";
}
// Tail calls. The Darwin version of thumb tail calls uses a t2 branch, so
@@ -3068,9 +3205,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
// Darwin version.
let Defs = [R0, R1, R2, R3, R9, R12, QQQQ0, QQQQ2, QQQQ3, PC],
Uses = [SP] in
- def tTAILJMPd: tPseudoExpand<(outs), (ins uncondbrtarget:$dst, variable_ops),
+ def tTAILJMPd: tPseudoExpand<(outs),
+ (ins uncondbrtarget:$dst, pred:$p, variable_ops),
4, IIC_Br, [],
- (t2B uncondbrtarget:$dst)>,
+ (t2B uncondbrtarget:$dst, pred:$p)>,
Requires<[IsThumb2, IsDarwin]>;
}
@@ -3087,30 +3225,55 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
bits<4> mask;
let Inst{7-4} = cc;
let Inst{3-0} = mask;
+
+ let DecoderMethod = "DecodeIT";
}
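
The DecodeIT method named above has to unpack the 4-bit mask into a
then/else pattern. Here is a self-contained C++ sketch of that rule, written
from the architectural description of IT rather than from the LLVM decoder.

#include <cassert>
#include <string>

// After the implicit 'T' for the first instruction, each mask bit from bit
// 3 down contributes a 'T' when it equals firstcond<0> and an 'E' otherwise,
// until the terminating 1 bit is reached (so mask 0b1000 is a plain IT).
static std::string decodeITMask(unsigned firstcond, unsigned mask) {
  assert(mask >= 1 && mask <= 15);
  unsigned fc0 = firstcond & 1;
  std::string s = "T";
  unsigned stop = 0;
  while (!((mask >> stop) & 1)) ++stop; // find the terminating 1 bit
  for (int bit = 3; bit > (int)stop; --bit)
    s += (((mask >> bit) & 1) == fc0) ? 'T' : 'E';
  return s;
}

int main() {
  assert(decodeITMask(0x0 /*EQ*/, 0x8) == "T");  // it eq
  assert(decodeITMask(0x0 /*EQ*/, 0x4) == "TT"); // itt eq
  assert(decodeITMask(0x0 /*EQ*/, 0xC) == "TE"); // ite eq
  return 0;
}
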
// Branch and Exchange Jazelle -- for disassembly only
// Rm = Inst{19-16}
-def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func",
- [/* For disassembly only; pattern left blank */]> {
+def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []> {
+ bits<4> func;
let Inst{31-27} = 0b11110;
let Inst{26} = 0;
let Inst{25-20} = 0b111100;
- let Inst{15-14} = 0b10;
- let Inst{12} = 0;
-
- bits<4> func;
let Inst{19-16} = func;
+ let Inst{15-0} = 0b1000111100000000;
+}
+
+// Compare and branch on zero / non-zero
+let isBranch = 1, isTerminator = 1 in {
+ def tCBZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+ "cbz\t$Rn, $target", []>,
+ T1Misc<{0,0,?,1,?,?,?}>,
+ Requires<[IsThumb2]> {
+ // A8.6.27
+ bits<6> target;
+ bits<3> Rn;
+ let Inst{9} = target{5};
+ let Inst{7-3} = target{4-0};
+ let Inst{2-0} = Rn;
+ }
+
+ def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
+ "cbnz\t$Rn, $target", []>,
+ T1Misc<{1,0,?,1,?,?,?}>,
+ Requires<[IsThumb2]> {
+ // A8.6.27
+ bits<6> target;
+ bits<3> Rn;
+ let Inst{9} = target{5};
+ let Inst{7-3} = target{4-0};
+ let Inst{2-0} = Rn;
+ }
}
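
A C++ sketch of the resulting cbz/cbnz encoding, read off the bit
assignments above; the PC-relative semantics are simplified away and the
helper is illustrative only.

#include <cassert>
#include <cstdint>

// The six operand bits are the forward branch offset in halfwords: Inst{9}
// takes target{5} and Inst{7-3} take target{4-0}, for a reach of 0..126
// bytes. Base opcode 0xB100; bit 11 selects cbnz.
static uint16_t encodeCB(unsigned rn, unsigned byteOffset, bool nonzero) {
  assert(rn < 8 && byteOffset <= 126 && (byteOffset & 1) == 0);
  unsigned t = byteOffset >> 1; // the bits<6> target operand
  return (uint16_t)(0xB100 | (nonzero ? 0x0800 : 0) |
                    ((t >> 5) << 9) | ((t & 0x1F) << 3) | rn);
}

int main() {
  assert(encodeCB(2, 40, false) == 0xB1A2); // cbz r2, <label 40 bytes ahead>
  return 0;
}
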
-// Change Processor State is a system instruction -- for disassembly and
-// parsing only.
+
+// Change Processor State is a system instruction.
// FIXME: Since the asm parser has currently no clean way to handle optional
// operands, create 3 versions of the same instruction. Once there's a clean
// framework to represent optional operands, change this behavior.
class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
- !strconcat("cps", asm_op),
- [/* For disassembly only; pattern left blank */]> {
+ !strconcat("cps", asm_op), []> {
bits<2> imod;
bits<3> iflags;
bits<5> mode;
@@ -3126,6 +3289,7 @@ class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
let Inst{8} = M;
let Inst{7-5} = iflags;
let Inst{4-0} = mode;
+ let DecoderMethod = "DecodeT2CPSInstruction";
}
let M = 1 in
@@ -3135,14 +3299,12 @@ let mode = 0, M = 0 in
def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
"$imod.w\t$iflags">;
let imod = 0, iflags = 0, M = 1 in
- def t2CPS1p : t2CPS<(ins i32imm:$mode), "\t$mode">;
+ def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
// A6.3.4 Branches and miscellaneous control
// Table A6-14 Change Processor State, and hint instructions
-// Helper class for disassembly only.
class T2I_hint<bits<8> op7_0, string opc, string asm>
- : T2I<(outs), (ins), NoItinerary, opc, asm,
- [/* For disassembly only; pattern left blank */]> {
+ : T2I<(outs), (ins), NoItinerary, opc, asm, []> {
let Inst{31-20} = 0xf3a;
let Inst{19-16} = 0b1111;
let Inst{15-14} = 0b10;
@@ -3158,20 +3320,17 @@ def t2WFI : T2I_hint<0b00000011, "wfi", ".w">;
def t2SEV : T2I_hint<0b00000100, "sev", ".w">;
def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
- let Inst{31-20} = 0xf3a;
- let Inst{15-14} = 0b10;
- let Inst{12} = 0;
- let Inst{10-8} = 0b000;
- let Inst{7-4} = 0b1111;
-
bits<4> opt;
+ let Inst{31-20} = 0b111100111010;
+ let Inst{19-16} = 0b1111;
+ let Inst{15-8} = 0b10000000;
+ let Inst{7-4} = 0b1111;
let Inst{3-0} = opt;
}
-// Secure Monitor Call is a system instruction -- for disassembly only
+// Secure Monitor Call is a system instruction.
// Option = Inst{19-16}
-def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
- [/* For disassembly only; pattern left blank */]> {
+def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", []> {
let Inst{31-27} = 0b11110;
let Inst{26-20} = 0b1111111;
let Inst{15-12} = 0b1000;
@@ -3180,32 +3339,30 @@ def t2SMC : T2I<(outs), (ins i32imm:$opt), NoItinerary, "smc", "\t$opt",
let Inst{19-16} = opt;
}
-class T2SRS<bits<12> op31_20,
- dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
+class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
: T2I<oops, iops, itin, opc, asm, pattern> {
- let Inst{31-20} = op31_20{11-0};
-
bits<5> mode;
+ let Inst{31-25} = 0b1110100;
+ let Inst{24-23} = Op;
+ let Inst{22} = 0;
+ let Inst{21} = W;
+ let Inst{20-16} = 0b01101;
+ let Inst{15-5} = 0b11000000000;
let Inst{4-0} = mode{4-0};
}
-// Store Return State is a system instruction -- for disassembly only
-def t2SRSDBW : T2SRS<0b111010000010,
- (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp!, $mode",
- [/* For disassembly only; pattern left blank */]>;
-def t2SRSDB : T2SRS<0b111010000000,
- (outs),(ins i32imm:$mode),NoItinerary,"srsdb","\tsp, $mode",
- [/* For disassembly only; pattern left blank */]>;
-def t2SRSIAW : T2SRS<0b111010011010,
- (outs),(ins i32imm:$mode),NoItinerary,"srsia","\tsp!, $mode",
- [/* For disassembly only; pattern left blank */]>;
-def t2SRSIA : T2SRS<0b111010011000,
- (outs), (ins i32imm:$mode),NoItinerary,"srsia","\tsp, $mode",
- [/* For disassembly only; pattern left blank */]>;
-
-// Return From Exception is a system instruction -- for disassembly only
+// Store Return State is a system instruction.
+def t2SRSDB_UPD : T2SRS<0b00, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsdb", "\tsp!, $mode", []>;
+def t2SRSDB : T2SRS<0b00, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsdb","\tsp, $mode", []>;
+def t2SRSIA_UPD : T2SRS<0b11, 1, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsia","\tsp!, $mode", []>;
+def t2SRSIA : T2SRS<0b11, 0, (outs), (ins imm0_31:$mode), NoItinerary,
+ "srsia","\tsp, $mode", []>;
+// Return From Exception is a system instruction.
class T2RFE<bits<12> op31_20, dag oops, dag iops, InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: T2I<oops, iops, itin, opc, asm, pattern> {
@@ -3277,53 +3434,186 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
imm:$cp))]>,
Requires<[IsThumb2]>;
+// Pseudo instruction that combines movs + predicated rsbmi
+// to implement integer ABS
+let usesCustomInserter = 1, Defs = [CPSR] in {
+def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
+ NoItinerary, []>, Requires<[IsThumb2]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Coprocessor load/store -- for disassembly only
+//
+class T2CI<bits<4> op31_28, dag oops, dag iops, string opc, string asm>
+ : T2I<oops, iops, NoItinerary, opc, asm, []> {
+ let Inst{31-28} = op31_28;
+ let Inst{27-25} = 0b110;
+}
+
+multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
+ def _OFFSET : T2CI<op31_28,
+ (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+ asm, "\t$cop, $CRd, $addr"> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _PRE : T2CI<op31_28,
+ (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+ asm, "\t$cop, $CRd, $addr!"> {
+ bits<13> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 1; // P = 1
+ let Inst{23} = addr{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr{12-9};
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = addr{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _POST: T2CI<op31_28,
+ (outs), (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ postidx_imm8s4:$offset),
+ asm, "\t$cop, $CRd, $addr, $offset"> {
+ bits<9> offset;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = offset{8};
+ let Inst{22} = Dbit;
+ let Inst{21} = 1; // W = 1
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = offset{7-0};
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+ def _OPTION : T2CI<op31_28, (outs),
+ (ins p_imm:$cop, c_imm:$CRd, addr_offset_none:$addr,
+ coproc_option_imm:$option),
+ asm, "\t$cop, $CRd, $addr, $option"> {
+ bits<8> option;
+ bits<4> addr;
+ bits<4> cop;
+ bits<4> CRd;
+ let Inst{24} = 0; // P = 0
+ let Inst{23} = 1; // U = 1
+ let Inst{22} = Dbit;
+ let Inst{21} = 0; // W = 0
+ let Inst{20} = load;
+ let Inst{19-16} = addr;
+ let Inst{15-12} = CRd;
+ let Inst{11-8} = cop;
+ let Inst{7-0} = option;
+ let DecoderMethod = "DecodeCopMemInstruction";
+ }
+}
+
+defm t2LDC : t2LdStCop<0b1110, 1, 0, "ldc">;
+defm t2LDCL : t2LdStCop<0b1110, 1, 1, "ldcl">;
+defm t2STC : t2LdStCop<0b1110, 0, 0, "stc">;
+defm t2STCL : t2LdStCop<0b1110, 0, 1, "stcl">;
+defm t2LDC2 : t2LdStCop<0b1111, 1, 0, "ldc2">;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">;
+defm t2STC2 : t2LdStCop<0b1111, 0, 0, "stc2">;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
+
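
For reference, a self-contained C++ sketch of the 13-bit addrmode5 operand
packing used by the _OFFSET and _PRE forms above; the helper is illustrative
only.

#include <cassert>
#include <cstdint>

// Bits 12-9 hold the base register, bit 8 is the U (add) flag that lands in
// Inst{23}, and bits 7-0 hold the byte offset divided by 4.
static uint16_t packAddrMode5(unsigned rn, int byteOffset) {
  assert(rn < 16 && byteOffset % 4 == 0 &&
         byteOffset >= -1020 && byteOffset <= 1020);
  unsigned add = byteOffset >= 0;
  unsigned imm8 = (unsigned)(byteOffset >= 0 ? byteOffset : -byteOffset) / 4;
  return (uint16_t)((rn << 9) | (add << 8) | imm8);
}

int main() {
  // ldc p14, c5, [r3, #-16] -> Rn = 3, U = 0, imm8 = 4
  assert(packAddrMode5(3, -16) == ((3u << 9) | 4u));
  return 0;
}
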
+
//===----------------------------------------------------------------------===//
// Move between special register and ARM core register -- for disassembly only
//
+// Move to ARM core register from Special Register
-class T2SpecialReg<bits<12> op31_20, bits<2> op15_14, bits<1> op12,
- dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
- : T2I<oops, iops, itin, opc, asm, pattern> {
- let Inst{31-20} = op31_20{11-0};
- let Inst{15-14} = op15_14{1-0};
- let Inst{12} = op12{0};
+// A/R class MRS.
+//
+// A/R class can only move from CPSR or SPSR.
+def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr", []>,
+ Requires<[IsThumb2,IsARClass]> {
+ bits<4> Rd;
+ let Inst{31-12} = 0b11110011111011111000;
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = 0b0000;
}
-class T2MRS<bits<12> op31_20, bits<2> op15_14, bits<1> op12,
- dag oops, dag iops, InstrItinClass itin,
- string opc, string asm, list<dag> pattern>
- : T2SpecialReg<op31_20, op15_14, op12, oops, iops, itin, opc, asm, pattern> {
+def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
+
+def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr", []>,
+ Requires<[IsThumb2,IsARClass]> {
bits<4> Rd;
+ let Inst{31-12} = 0b11110011111111111000;
+ let Inst{11-8} = Rd;
+ let Inst{7-0} = 0b0000;
+}
+
+// M class MRS.
+//
+// This MRS has a mask field in bits 7-0 and can take more values than
+// the A/R class (a full msr_mask).
+def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
+ "mrs", "\t$Rd, $mask", []>,
+ Requires<[IsThumb2,IsMClass]> {
+ bits<4> Rd;
+ bits<8> mask;
+ let Inst{31-12} = 0b11110011111011111000;
let Inst{11-8} = Rd;
let Inst{19-16} = 0b1111;
+ let Inst{7-0} = mask;
}
-def t2MRS : T2MRS<0b111100111110, 0b10, 0,
- (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, cpsr",
- [/* For disassembly only; pattern left blank */]>;
-def t2MRSsys : T2MRS<0b111100111111, 0b10, 0,
- (outs rGPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
- [/* For disassembly only; pattern left blank */]>;
// Move from ARM core register to Special Register
//
+// A/R class MSR.
+//
// No need to have both system and application versions; the encodings are the
// same and the assembly parser has no way to distinguish between them. The mask
// operand carries the special register (R bit) in bit 4, and bits 3-0 contain
// the mask of the fields to be accessed in the special register.
-def t2MSR : T2SpecialReg<0b111100111000 /* op31-20 */, 0b10 /* op15-14 */,
- 0 /* op12 */, (outs), (ins msr_mask:$mask, rGPR:$Rn),
- NoItinerary, "msr", "\t$mask, $Rn",
- [/* For disassembly only; pattern left blank */]> {
+def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
+ NoItinerary, "msr", "\t$mask, $Rn", []>,
+ Requires<[IsThumb2,IsARClass]> {
bits<5> mask;
bits<4> Rn;
- let Inst{19-16} = Rn;
+ let Inst{31-21} = 0b11110011100;
let Inst{20} = mask{4}; // R Bit
- let Inst{13} = 0b0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1000;
let Inst{11-8} = mask{3-0};
+ let Inst{7-0} = 0;
}
+// M class MSR.
+//
+// Move from ARM core register to Special Register
+def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
+ NoItinerary, "msr", "\t$SYSm, $Rn", []>,
+ Requires<[IsThumb2,IsMClass]> {
+ bits<8> SYSm;
+ bits<4> Rn;
+ let Inst{31-21} = 0b11110011100;
+ let Inst{20} = 0b0;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = 0b1000;
+ let Inst{7-0} = SYSm;
+}
+
+
//===----------------------------------------------------------------------===//
// Move between coprocessor and ARM core register
//
@@ -3389,13 +3679,12 @@ def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
/* from coprocessor to ARM core register */
def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
- (outs GPR:$Rt),
- (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn, c_imm:$CRm, i32imm:$opc2),
- []>;
+ (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2), []>;
def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
- (outs GPR:$Rt), (ins p_imm:$cop, i32imm:$opc1, c_imm:$CRn,
- c_imm:$CRm, i32imm:$opc2), []>;
+ (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+ c_imm:$CRm, imm0_7:$opc2), []>;
def : T2v6Pat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
(t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
@@ -3465,3 +3754,269 @@ def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
let Inst{19-16} = CRn;
let Inst{23-20} = opc1;
}
+
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// SXT/UXT with no rotate
+let AddedComplexity = 16 in {
+def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
+ (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
+ (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+}
+
+def : T2Pat<(sext_inreg rGPR:$Src, i8), (t2SXTB rGPR:$Src, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
+ Requires<[IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
+ (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
+ (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
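+
A quick self-contained C++ sanity check of why these selections are sound
when the rotate amount is 0, illustrative only:

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0x12AB34CDu;
  // With rotate 0, uxtb16 is exactly an AND with 0x00FF00FF.
  assert((x & 0x00FF00FFu) == 0x00AB00CDu);        // t2UXTB16 x, 0
  // sxtab adds a sign-extended byte: sext_inreg i8 of 0xFF is -1.
  int32_t n = 0x77;
  uint32_t m = 0xFF;
  assert(n + (int32_t)(int8_t)(uint8_t)m == 0x76); // t2SXTAB n, m, 0
  return 0;
}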
+
+// Atomic load/store patterns
+def : T2Pat<(atomic_load_8 t2addrmode_imm12:$addr),
+ (t2LDRBi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_8 t2addrmode_negimm8:$addr),
+ (t2LDRBi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_8 t2addrmode_so_reg:$addr),
+ (t2LDRBs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_16 t2addrmode_imm12:$addr),
+ (t2LDRHi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_16 t2addrmode_negimm8:$addr),
+ (t2LDRHi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_16 t2addrmode_so_reg:$addr),
+ (t2LDRHs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_load_32 t2addrmode_imm12:$addr),
+ (t2LDRi12 t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_load_32 t2addrmode_negimm8:$addr),
+ (t2LDRi8 t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_load_32 t2addrmode_so_reg:$addr),
+ (t2LDRs t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_8 t2addrmode_imm12:$addr, GPR:$val),
+ (t2STRBi12 GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_8 t2addrmode_negimm8:$addr, GPR:$val),
+ (t2STRBi8 GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_8 t2addrmode_so_reg:$addr, GPR:$val),
+ (t2STRBs GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_imm12:$addr, GPR:$val),
+ (t2STRHi12 GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_negimm8:$addr, GPR:$val),
+ (t2STRHi8 GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_16 t2addrmode_so_reg:$addr, GPR:$val),
+ (t2STRHs GPR:$val, t2addrmode_so_reg:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_imm12:$addr, GPR:$val),
+ (t2STRi12 GPR:$val, t2addrmode_imm12:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
+ (t2STRi8 GPR:$val, t2addrmode_negimm8:$addr)>;
+def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
+ (t2STRs GPR:$val, t2addrmode_so_reg:$addr)>;
+
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases
+//
+
+// Aliases for ADC without the ".w" optional width specifier.
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $Rm",
+ (t2ADCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"adc${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2ADCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// Aliases for SBC without the ".w" optional width specifier.
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $Rm",
+ (t2SBCrr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2SBCrs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// Aliases for ADD without the ".w" optional width specifier.
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+ (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+ (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
+ (t2ADDrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"add${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2ADDrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// Aliases for SUB without the ".w" optional width specifier.
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $imm",
+ (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${p} $Rd, $Rn, $imm",
+ (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $Rm",
+ (t2SUBrr GPRnopc:$Rd, GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"sub${s}${p} $Rd, $Rn, $ShiftedRm",
+ (t2SUBrs GPRnopc:$Rd, GPRnopc:$Rn, t2_so_reg:$ShiftedRm,
+ pred:$p, cc_out:$s)>;
+
+// Alias for compares without the ".w" optional width specifier.
+def : t2InstAlias<"cmn${p} $Rn, $Rm",
+ (t2CMNzrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"teq${p} $Rn, $Rm",
+ (t2TEQrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"tst${p} $Rn, $Rm",
+ (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
+
+// Memory barriers
+def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb2, HasDB]>;
+
+// Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+ (t2LDRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+ (t2LDRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+ (t2LDRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+ (t2LDRSBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+ (t2LDRSHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"ldr${p} $Rt, $addr",
+ (t2LDRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrb${p} $Rt, $addr",
+ (t2LDRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrh${p} $Rt, $addr",
+ (t2LDRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsb${p} $Rt, $addr",
+ (t2LDRSBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"ldrsh${p} $Rt, $addr",
+ (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Alias for MVN without the ".w" optional width specifier.
+def : t2InstAlias<"mvn${s}${p} $Rd, $Rm",
+ (t2MVNr rGPR:$Rd, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
+ (t2MVNs rGPR:$Rd, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>;
+
+// PKHBT/PKHTB with default shift amount. PKHTB is equivalent to PKHBT when the
+// shift amount is zero (i.e., unspecified).
+def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
+ (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
+ (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>,
+ Requires<[HasT2ExtractPack, IsThumb2]>;
+
+// PUSH/POP aliases for STM/LDM
+def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"push${p} $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p}.w $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+def : t2InstAlias<"pop${p} $regs", (t2LDMIA_UPD SP, pred:$p, reglist:$regs)>;
+
+// Alias for REV/REV16/REVSH without the ".w" optional width specifier.
+def : t2InstAlias<"rev${p} $Rd, $Rm", (t2REV rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"rev16${p} $Rd, $Rm", (t2REV16 rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+def : t2InstAlias<"revsh${p} $Rd, $Rm", (t2REVSH rGPR:$Rd, rGPR:$Rm, pred:$p)>;
+
+
+// Alias for RSB without the ".w" optional width specifier, and with optional
+// implied destination register.
+def : t2InstAlias<"rsb${s}${p} $Rd, $Rn, $imm",
+ (t2RSBri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $imm",
+ (t2RSBri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $Rm",
+ (t2RSBrr rGPR:$Rdn, rGPR:$Rdn, rGPR:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"rsb${s}${p} $Rdn, $ShiftedRm",
+ (t2RSBrs rGPR:$Rdn, rGPR:$Rdn, t2_so_reg:$ShiftedRm, pred:$p,
+ cc_out:$s)>;
+
+// SSAT/USAT optional shift operand.
+def : t2InstAlias<"ssat${p} $Rd, $sat_imm, $Rn",
+ (t2SSAT rGPR:$Rd, imm1_32:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+def : t2InstAlias<"usat${p} $Rd, $sat_imm, $Rn",
+ (t2USAT rGPR:$Rd, imm0_31:$sat_imm, rGPR:$Rn, 0, pred:$p)>;
+
+// STM w/o the .w suffix.
+def : t2InstAlias<"stm${p} $Rn, $regs",
+ (t2STMIA GPR:$Rn, pred:$p, reglist:$regs)>;
+
+// Alias for STR, STRB, and STRH without the ".w" optional
+// width specifier.
+def : t2InstAlias<"str${p} $Rt, $addr",
+ (t2STRi12 GPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+ (t2STRBi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+ (t2STRHi12 rGPR:$Rt, t2addrmode_imm12:$addr, pred:$p)>;
+
+def : t2InstAlias<"str${p} $Rt, $addr",
+ (t2STRs GPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strb${p} $Rt, $addr",
+ (t2STRBs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+def : t2InstAlias<"strh${p} $Rt, $addr",
+ (t2STRHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
+
+// Extend instruction optional rotate operand.
+def : t2InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
+ (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
+ (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
+ (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm",
+ (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm",
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm",
+ (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxtb${p}.w $Rd, $Rm",
+ (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
+ (t2SXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
+ (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
+ (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
+ (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb${p} $Rd, $Rm",
+ (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm",
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm",
+ (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+def : t2InstAlias<"uxtb${p}.w $Rd, $Rm",
+ (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+def : t2InstAlias<"uxth${p}.w $Rd, $Rm",
+ (t2UXTH rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
+
+// Extend instruction w/o the ".w" optional width specifier.
+def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
+ (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxtb16${p} $Rd, $Rm$rot",
+ (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
+ (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+
+def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
+ (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxtb16${p} $Rd, $Rm$rot",
+ (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
+def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
+ (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index f1f3cb9..e746cf2 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -31,18 +31,34 @@ def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
// Operand Definitions.
//
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+ let Name = "FPImm";
+ let ParserMethod = "parseFPImm";
+}
+
def vfp_f32imm : Operand<f32>,
PatLeaf<(f32 fpimm), [{
- return ARM::getVFPf32Imm(N->getValueAPF()) != -1;
- }]> {
- let PrintMethod = "printVFPf32ImmOperand";
+ return ARM_AM::getFP32Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP32Imm(InVal);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
}
def vfp_f64imm : Operand<f64>,
PatLeaf<(f64 fpimm), [{
- return ARM::getVFPf64Imm(N->getValueAPF()) != -1;
- }]> {
- let PrintMethod = "printVFPf64ImmOperand";
+ return ARM_AM::getFP64Imm(N->getValueAPF()) != -1;
+ }], SDNodeXForm<fpimm, [{
+ APFloat InVal = N->getValueAPF();
+ uint32_t enc = ARM_AM::getFP64Imm(InVal);
+ return CurDAG->getTargetConstant(enc, MVT::i32);
+ }]>> {
+ let PrintMethod = "printFPImmOperand";
+ let ParserMatchClass = FPImmOperand;
}
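
For reference, a sketch (not LLVM code, and covering only the f32 case) of the value the 8-bit encoding produced by `ARM_AM::getFP32Imm` above stands for, assuming the standard VFPv3 modified-immediate reading of the bits "abcdefgh":

```cpp
#include <cmath>
#include <cstdint>

// Minimal sketch: sign a, exponent in [-3,4] from b/c/d, fraction efgh.
static float decodeVFP32Imm(uint8_t Enc) {
  unsigned Sign = (Enc >> 7) & 1;           // a
  unsigned B    = (Enc >> 6) & 1;           // b picks the exponent half-range
  unsigned CD   = (Enc >> 4) & 3;           // c,d: low exponent bits
  unsigned Frac = Enc & 0xf;                // e,f,g,h: top fraction bits
  int Exp = B ? (int)CD - 3 : (int)CD + 1;  // unbiased exponent, -3..4
  float Mant = 1.0f + Frac / 16.0f;         // 1.0 .. 1.9375
  return (Sign ? -1.0f : 1.0f) * std::ldexp(Mant, Exp);
}
// decodeVFP32Imm(0x70) == 1.0f; getFP32Imm returns -1 for any float not of
// this form, which is exactly what the PatLeaf predicates above test.
```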
@@ -385,26 +401,26 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
// Between half-precision and single-precision. For disassembly only.
// FIXME: Verify encoding after integrated assembler is working.
-def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
- /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$dst, $a",
+def VCVTBSH: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>;
def : ARMPat<(f32_to_f16 SPR:$a),
(i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
-def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
- /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$dst, $a",
+def VCVTBHS: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>;
def : ARMPat<(f16_to_f32 GPR:$a),
(VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
-def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
- /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$dst, $a",
+def VCVTTSH: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>;
-def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
- /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$dst, $a",
+def VCVTTHS: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
+ /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>;
def VNEGD : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
@@ -511,14 +527,25 @@ def VMOVRRD : AVConv3I<0b11000101, 0b1011,
}
def VMOVRRS : AVConv3I<0b11000101, 0b1010,
- (outs GPR:$wb, GPR:$dst2), (ins SPR:$src1, SPR:$src2),
- IIC_fpMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2",
+ (outs GPR:$Rt, GPR:$Rt2), (ins SPR:$src1, SPR:$src2),
+ IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $src1, $src2",
[/* For disassembly only; pattern left blank */]> {
+ bits<5> src1;
+ bits<4> Rt;
+ bits<4> Rt2;
+
+ // Encode instruction operands.
+ let Inst{3-0} = src1{3-0};
+ let Inst{5} = src1{4};
+ let Inst{15-12} = Rt;
+ let Inst{19-16} = Rt2;
+
let Inst{7-6} = 0b00;
// Some single precision VFP instructions may be executed on both NEON and VFP
// pipelines.
let D = VFPNeonDomain;
+ let DecoderMethod = "DecodeVMOVRRS";
}
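
The operand-encoding lets added above split the 5-bit S-register number across two instruction fields. A standalone illustration (hypothetical helper, not part of the patch):

```cpp
#include <cstdint>

// S<n>, n in [0,31]: low four bits go to Inst{3-0}, the top bit to Inst{5}.
static uint32_t encodeSm(uint32_t Inst, unsigned SRegNum) {
  Inst |= (SRegNum & 0xf);            // Inst{3-0} = src1{3-0}
  Inst |= ((SRegNum >> 4) & 1) << 5;  // Inst{5}   = src1{4}
  return Inst;
}
```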
} // neverHasSideEffects
@@ -552,11 +579,24 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
(outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
IIC_fpMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
[/* For disassembly only; pattern left blank */]> {
+ // Instruction operands.
+ bits<5> dst1;
+ bits<4> src1;
+ bits<4> src2;
+
+ // Encode instruction operands.
+ let Inst{3-0} = dst1{3-0};
+ let Inst{5} = dst1{4};
+ let Inst{15-12} = src1;
+ let Inst{19-16} = src2;
+
let Inst{7-6} = 0b00;
// Some single precision VFP instructions may be executed on both NEON and VFP
// pipelines.
let D = VFPNeonDomain;
+
+ let DecoderMethod = "DecodeVMOVSRR";
}
// FMRDH: SPR -> GPR
@@ -1084,45 +1124,42 @@ def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
VFPMiscFrm, IIC_fpUNA64,
"vmov", ".f64\t$Dd, $imm",
[(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
- // Instruction operands.
- bits<5> Dd;
- bits<32> imm;
-
- // Encode instruction operands.
- let Inst{15-12} = Dd{3-0};
- let Inst{22} = Dd{4};
- let Inst{19} = imm{31};
- let Inst{18-16} = imm{22-20};
- let Inst{3-0} = imm{19-16};
+ bits<5> Dd;
+ bits<8> imm;
- // Encode remaining instruction bits.
let Inst{27-23} = 0b11101;
+ let Inst{22} = Dd{4};
let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{7-4};
+ let Inst{15-12} = Dd{3-0};
let Inst{11-9} = 0b101;
let Inst{8} = 1; // Double precision.
let Inst{7-4} = 0b0000;
+ let Inst{3-0} = imm{3-0};
}
def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
VFPMiscFrm, IIC_fpUNA32,
"vmov", ".f32\t$Sd, $imm",
[(set SPR:$Sd, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
- // Instruction operands.
- bits<5> Sd;
- bits<32> imm;
-
- // Encode instruction operands.
- let Inst{15-12} = Sd{4-1};
- let Inst{22} = Sd{0};
- let Inst{19} = imm{31}; // The immediate is handled as a double.
- let Inst{18-16} = imm{22-20};
- let Inst{3-0} = imm{19-16};
+ bits<5> Sd;
+ bits<8> imm;
- // Encode remaining instruction bits.
let Inst{27-23} = 0b11101;
+ let Inst{22} = Sd{0};
let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{7-4};
+ let Inst{15-12} = Sd{4-1};
let Inst{11-9} = 0b101;
let Inst{8} = 0; // Single precision.
let Inst{7-4} = 0b0000;
+ let Inst{3-0} = imm{3-0};
}
}
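
With the immediate now carried as 8 bits rather than 32, encoding it reduces to placing two nibbles. A sketch of the packing done by the lets above (illustrative helper, not patch code):

```cpp
#include <cstdint>

// imm{7-4} -> Inst{19-16}, imm{3-0} -> Inst{3-0}.
static uint32_t packFConstImm(uint32_t Inst, unsigned Imm8) {
  Inst |= ((Imm8 >> 4) & 0xf) << 16;  // Inst{19-16} = imm{7-4}
  Inst |= (Imm8 & 0xf);               // Inst{3-0}   = imm{3-0}
  return Inst;
}
```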
+
+//===----------------------------------------------------------------------===//
+// Assembler aliases.
+//
+
+def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
+
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index c6efea1..faa8ba7 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -14,10 +14,10 @@
#define DEBUG_TYPE "arm-ldst-opt"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMRegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -763,9 +764,9 @@ static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc,
ARM_AM::AddrOpc Mode) {
switch (Opc) {
case ARM::LDRi12:
- return ARM::LDR_PRE;
+ return ARM::LDR_PRE_IMM;
case ARM::STRi12:
- return ARM::STR_PRE;
+ return ARM::STR_PRE_IMM;
case ARM::VLDRS:
return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
case ARM::VLDRD:
@@ -789,9 +790,9 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
ARM_AM::AddrOpc Mode) {
switch (Opc) {
case ARM::LDRi12:
- return ARM::LDR_POST;
+ return ARM::LDR_POST_IMM;
case ARM::STRi12:
- return ARM::STR_POST;
+ return ARM::STR_POST_IMM;
case ARM::VLDRS:
return Mode == ARM_AM::add ? ARM::VLDMSIA_UPD : ARM::VLDMSDB_UPD;
case ARM::VLDRD:
@@ -892,12 +893,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
if (!DoMerge)
return false;
- unsigned Offset = 0;
- if (isAM2)
- Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
- else if (!isAM5)
- Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
-
if (isAM5) {
// VLDM[SD]_UPD, VSTM[SD]_UPD
// (There are no base-updating versions of VLDR/VSTR instructions, but the
@@ -911,28 +906,44 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
.addReg(MO.getReg(), (isLd ? getDefRegState(true) :
getKillRegState(MO.isKill())));
} else if (isLd) {
- if (isAM2)
- // LDR_PRE, LDR_POST,
- BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
- .addReg(Base, RegState::Define)
- .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
- else
+ if (isAM2) {
+ // LDR_PRE, LDR_POST
+ if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) {
+ int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ } else {
+ int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
+ BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
+ .addReg(Base, RegState::Define)
+ .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
+ } else {
+ int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2LDR_PRE, t2LDR_POST
BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
.addReg(Base, RegState::Define)
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
} else {
MachineOperand &MO = MI->getOperand(0);
- if (isAM2)
+ // FIXME: post-indexed stores use am2offset_imm, which still encodes
+ // the vestigial zero-reg offset register. When that's fixed, this clause
+ // can be removed entirely.
+ if (isAM2 && NewOpc == ARM::STR_POST_IMM) {
+ int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
// STR_PRE, STR_POST
BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
.addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
- else
+ } else {
+ int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
// t2STR_PRE, t2STR_POST
BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
.addReg(MO.getReg(), getKillRegState(MO.isKill()))
.addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
+ }
}
MBB.erase(MBBI);
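
The rewritten clauses above exist because the merged instructions no longer share one offset encoding. A sketch of the two conventions in play (helper names are invented; `getAM2Opc` is the real API this file already uses):

```cpp
#include "MCTargetDesc/ARMAddressingModes.h"
using namespace llvm;

// *_PRE_IMM/*_POST_IMM and the t2 forms take a plain signed immediate:
static int plainImmOffset(ARM_AM::AddrOpc AddSub, unsigned Bytes) {
  return AddSub == ARM_AM::sub ? -(int)Bytes : (int)Bytes;
}
// The remaining addrmode2 forms pack direction, imm12 and shift together:
static unsigned packedAM2Offset(ARM_AM::AddrOpc AddSub, unsigned Bytes) {
  return ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
}
```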
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index 7411b59..daa126d 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -14,7 +14,7 @@
#include "ARM.h"
#include "ARMAsmPrinter.h"
-#include "ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/Constants.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/MC/MCExpr.h"
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 76eb496..036822d 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -182,8 +182,10 @@ def QQQQ3 : ARMReg<3, "qqqq3", [QQ6, QQ7]>;
// Current Program Status Register.
def CPSR : ARMReg<0, "cpsr">;
-def FPSCR : ARMReg<1, "fpscr">;
-def ITSTATE : ARMReg<2, "itstate">;
+def APSR : ARMReg<1, "apsr">;
+def SPSR : ARMReg<2, "spsr">;
+def FPSCR : ARMReg<3, "fpscr">;
+def ITSTATE : ARMReg<4, "itstate">;
// Special Registers - only available in privileged mode.
def FPSID : ARMReg<0, "fpsid">;
@@ -213,6 +215,23 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
}];
}
+// GPRs without the PC. Some ARM instructions do not allow the PC in
+// certain operand slots, particularly as the destination. Primarily
+// useful for disassembly.
+def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
+ let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+ let AltOrderSelect = [{
+ return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+ }];
+}
+
+// GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
+// implied SP argument list.
+// FIXME: It would be better to not use this at all and refactor the
+// instructions to not have SP as an explicit argument. That makes
+// frame index resolution a bit trickier, though.
+def GPRsp : RegisterClass<"ARM", [i32], 32, (add SP)>;
+
// restricted GPR register class. Many Thumb2 instructions allow the full
// register range for operands, but have undefined behaviours when PC
// or SP (R15 or R13) are used. The ARM ISA refers to these operands
@@ -328,5 +347,6 @@ def QQQQPR : RegisterClass<"ARM", [v8i64], 256, (sequence "QQQQ%u", 0, 3)> {
// Condition code registers.
def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
+ let CopyCost = -1; // Don't allow copying of status registers.
let isAllocatable = 0;
}
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index ef0aaf2..a3a3d58 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -138,13 +138,12 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
// Adjust parameters for memset: EABI uses the format (ptr, size, value),
// while the GNU library uses (ptr, value, size).
// See RTABI section 4.3.4
-SDValue
-ARMSelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
- SDValue Chain, SDValue Dst,
- SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
- MachinePointerInfo DstPtrInfo) const
-{
+SDValue ARMSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst,
+ SDValue Src, SDValue Size,
+ unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
// Use default for non AAPCS subtargets
if (!Subtarget->isAAPCS_ABI())
return SDValue();
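
For reference, the two calling conventions this routine adjusts between, as described by the comment above (declarations only, not code from this file; the `__aeabi_memset` shape follows the cited RTABI section):

```cpp
#include <cstddef>

extern "C" void *memset(void *Dest, int Value, std::size_t N);        // GNU
extern "C" void __aeabi_memset(void *Dest, std::size_t N, int Value); // EABI
```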
@@ -155,7 +154,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
TargetLowering::ArgListEntry Entry;
// First argument: data pointer
- const Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
Entry.Node = Dst;
Entry.Ty = IntPtrTy;
Args.push_back(Entry);
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index ec1bf5c..6419a73 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -14,10 +14,27 @@
#ifndef ARMSELECTIONDAGINFO_H
#define ARMSELECTIONDAGINFO_H
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/Target/TargetSelectionDAGInfo.h"
namespace llvm {
+namespace ARM_AM {
+ static inline ShiftOpc getShiftOpcForNode(unsigned Opcode) {
+ switch (Opcode) {
+ default: return ARM_AM::no_shift;
+ case ISD::SHL: return ARM_AM::lsl;
+ case ISD::SRL: return ARM_AM::lsr;
+ case ISD::SRA: return ARM_AM::asr;
+ case ISD::ROTR: return ARM_AM::ror;
+ //case ISD::ROTL: // Only if imm -> turn into ROTR.
+ // Can't handle RRX here, because it would require folding a flag into
+ // the addressing mode. :( This causes us to miss certain things.
+ //case ARMISD::RRX: return ARM_AM::rrx;
+ }
+ }
+} // end namespace ARM_AM
+
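A quick usage sketch for the helper above (hypothetical caller; the `ISD` opcodes come from llvm/CodeGen/ISDOpcodes.h, already in scope here):

```cpp
// Anything mapping to something other than no_shift can be folded into an
// ARM shifter operand.
static bool isShifterFoldable(unsigned Opcode) {
  return ARM_AM::getShiftOpcForNode(Opcode) != ARM_AM::no_shift;
}
// isShifterFoldable(ISD::SHL) -> true, isShifterFoldable(ISD::ADD) -> false
```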
class ARMSelectionDAGInfo : public TargetSelectionDAGInfo {
/// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
/// make the right decision when generating code for different targets.
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 1cab9e4..247d6be 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -53,11 +53,14 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
, HasVMLxForwarding(false)
, SlowFPBrcc(false)
, InThumbMode(false)
+ , InNaClMode(false)
, HasThumb2(false)
+ , IsMClass(false)
, NoARM(false)
, PostRAScheduler(false)
, IsR9Reserved(ReserveR9)
, UseMovt(false)
+ , SupportsTailCall(false)
, HasFP16(false)
, HasD16(false)
, HasHardwareDivide(false)
@@ -111,6 +114,8 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
else {
IsR9Reserved = ReserveR9 | !HasV6Ops;
UseMovt = DarwinUseMOVT && hasV6T2Ops();
+ const Triple &T = getTargetTriple();
+ SupportsTailCall = T.getOS() == Triple::IOS && !T.isOSVersionLT(5, 0);
}
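
A sketch of how the new gate evaluates, with illustrative triples (assuming the usual "arm*-apple-ios<version>" spelling):

```cpp
#include "llvm/ADT/Triple.h"

static bool osSupportsTailCall(const llvm::Triple &T) {
  return T.getOS() == llvm::Triple::IOS && !T.isOSVersionLT(5, 0);
}
// osSupportsTailCall(llvm::Triple("armv7-apple-ios5.0")) -> true
// osSupportsTailCall(llvm::Triple("armv6-apple-ios4.3")) -> false
```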
if (!isThumb() || hasThumb2())
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index c650872..b63e108 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -70,9 +70,16 @@ protected:
/// InThumbMode - True if compiling for Thumb, false for ARM.
bool InThumbMode;
+ /// InNaClMode - True if targeting Native Client.
+ bool InNaClMode;
+
/// HasThumb2 - True if Thumb2 instructions are supported.
bool HasThumb2;
+ /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs,
+ /// e.g. v6m and v7m.
+ bool IsMClass;
+
/// NoARM - True if subtarget does not support ARM mode execution.
bool NoARM;
@@ -86,6 +93,11 @@ protected:
/// imms (including global addresses).
bool UseMovt;
+ /// SupportsTailCall - True if the OS supports tail calls. The dynamic linker
+ /// must be able to synthesize call stubs for interworking between ARM and
+ /// Thumb.
+ bool SupportsTailCall;
+
/// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF
/// only so far)
bool HasFP16;
@@ -209,6 +221,9 @@ protected:
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetNaCl() const {
+ return TargetTriple.getOS() == Triple::NativeClient;
+ }
bool isTargetELF() const { return !isTargetDarwin(); }
bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
@@ -218,10 +233,13 @@ protected:
bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
bool isThumb2() const { return InThumbMode && HasThumb2; }
bool hasThumb2() const { return HasThumb2; }
+ bool isMClass() const { return IsMClass; }
+ bool isARClass() const { return !IsMClass; }
bool isR9Reserved() const { return IsR9Reserved; }
bool useMovt() const { return UseMovt && hasV6T2Ops(); }
+ bool supportsTailCall() const { return SupportsTailCall; }
bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index f0b176a..96b1e89 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -15,77 +15,50 @@
#include "ARM.h"
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-// This is duplicated code. Refactor this.
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
- MCContext &Ctx, TargetAsmBackend &TAB,
- raw_ostream &OS,
- MCCodeEmitter *Emitter,
- bool RelaxAll,
- bool NoExecStack) {
- Triple TheTriple(TT);
-
- if (TheTriple.isOSDarwin())
- return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
-
- if (TheTriple.isOSWindows()) {
- llvm_unreachable("ARM does not support Windows COFF format");
- return NULL;
- }
-
- return createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
-}
+static cl::opt<bool>
+EnableGlobalMerge("global-merge", cl::Hidden,
+ cl::desc("Enable global merge pass"),
+ cl::init(true));
extern "C" void LLVMInitializeARMTarget() {
// Register the target.
RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
-
- // Register the MC Code Emitter
- TargetRegistry::RegisterCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
- TargetRegistry::RegisterCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
-
- // Register the asm backend.
- TargetRegistry::RegisterAsmBackend(TheARMTarget, createARMAsmBackend);
- TargetRegistry::RegisterAsmBackend(TheThumbTarget, createARMAsmBackend);
-
- // Register the object streamer.
- TargetRegistry::RegisterObjectStreamer(TheARMTarget, createMCStreamer);
- TargetRegistry::RegisterObjectStreamer(TheThumbTarget, createMCStreamer);
-
}
/// TargetMachine ctor - Create an ARM architecture model.
///
-ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T,
- const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : LLVMTargetMachine(T, TT, CPU, FS),
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS),
JITInfo(),
InstrItins(Subtarget.getInstrItineraryData()) {
- DefRelocModel = getRelocationModel();
-
// Default to soft float ABI
if (FloatABIType == FloatABI::Default)
FloatABIType = FloatABI::Soft;
}
-ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : ARMBaseTargetMachine(T, TT, CPU, FS), InstrInfo(Subtarget),
+ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM), InstrInfo(Subtarget),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:64-i64:32:64-"
- "v128:32:128-v64:32:64-n32") :
+ "v128:32:128-v64:32:64-n32-S32") :
+ Subtarget.isAAPCS_ABI() ?
+ std::string("e-p:32:32-f64:64:64-i64:64:64-"
+ "v128:64:128-v64:64:64-n32-S64") :
std::string("e-p:32:32-f64:64:64-i64:64:64-"
- "v128:64:128-v64:64:64-n32")),
+ "v128:64:128-v64:64:64-n32-S32")),
ELFWriterInfo(*this),
TLInfo(*this),
TSInfo(*this),
@@ -95,20 +68,24 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
"support ARM mode execution!");
}
-ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : ARMBaseTargetMachine(T, TT, CPU, FS),
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM),
InstrInfo(Subtarget.hasThumb2()
? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
: ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
DataLayout(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:64-i64:32:64-"
"i16:16:32-i8:8:32-i1:8:32-"
- "v128:32:128-v64:32:64-a:0:32-n32") :
+ "v128:32:128-v64:32:64-a:0:32-n32-S32") :
+ Subtarget.isAAPCS_ABI() ?
+ std::string("e-p:32:32-f64:64:64-i64:64:64-"
+ "i16:16:32-i8:8:32-i1:8:32-"
+ "v128:64:128-v64:64:64-a:0:32-n32-S64") :
std::string("e-p:32:32-f64:64:64-i64:64:64-"
"i16:16:32-i8:8:32-i1:8:32-"
- "v128:64:128-v64:64:64-a:0:32-n32")),
+ "v128:64:128-v64:64:64-a:0:32-n32-S32")),
ELFWriterInfo(*this),
TLInfo(*this),
TSInfo(*this),
@@ -117,10 +94,9 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
: (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
}
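
The only change to the layout strings above is the trailing `S<bits>` token, the natural stack alignment: AAPCS mandates an 8-byte-aligned stack (S64), while APCS and the default keep 4 bytes (S32). A sketch, assuming (as elsewhere in this tree) that TargetData parses the string directly:

```cpp
#include "llvm/Target/TargetData.h"

// AAPCS layout with the new 64-bit natural stack alignment.
llvm::TargetData AAPCSData("e-p:32:32-f64:64:64-i64:64:64-"
                           "v128:64:128-v64:64:64-n32-S64");
```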
-// Pass Pipeline Configuration
bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
- if (OptLevel != CodeGenOpt::None)
+ if (OptLevel != CodeGenOpt::None && EnableGlobalMerge)
PM.add(createARMGlobalMergePass(getTargetLowering()));
return false;
@@ -139,7 +115,6 @@ bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
PM.add(createARMLoadStoreOptimizationPass(true));
if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9())
PM.add(createMLxExpansionPass());
-
return true;
}
@@ -150,7 +125,7 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM,
if (!Subtarget.isThumb1Only())
PM.add(createARMLoadStoreOptimizationPass());
if (Subtarget.hasNEON())
- PM.add(createNEONMoveFixPass());
+ PM.add(createExecutionDependencyFixPass(&ARM::DPRRegClass));
}
// Expand some pseudo instructions into multiple instructions to allow
@@ -179,10 +154,6 @@ bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
JITCodeEmitter &JCE) {
- // FIXME: Move this to TargetJITInfo!
- if (DefRelocModel == Reloc::Default)
- setRelocationModel(Reloc::Static);
-
// Machine code emitter pass for ARM.
PM.add(createARMJITCodeEmitterPass(*this, JCE));
return false;
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index bc3d46a..c8c601c 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -37,11 +37,11 @@ protected:
private:
ARMJITInfo JITInfo;
InstrItineraryData InstrItins;
- Reloc::Model DefRelocModel; // Reloc model before it's overridden.
public:
- ARMBaseTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ ARMBaseTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual ARMJITInfo *getJITInfo() { return &JITInfo; }
virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; }
@@ -69,8 +69,9 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
ARMSelectionDAGInfo TSInfo;
ARMFrameLowering FrameLowering;
public:
- ARMTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ ARMTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const ARMRegisterInfo *getRegisterInfo() const {
return &InstrInfo.getRegisterInfo();
@@ -108,8 +109,9 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
// Either Thumb1FrameLowering or ARMFrameLowering.
OwningPtr<ARMFrameLowering> FrameLowering;
public:
- ThumbTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ ThumbTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
/// returns either Thumb1RegisterInfo or Thumb2RegisterInfo
virtual const ARMBaseRegisterInfo *getRegisterInfo() const {
diff --git a/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp b/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
index d9a5fa2..14d35ba 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmLexer.cpp
@@ -7,16 +7,15 @@
//
//===----------------------------------------------------------------------===//
-#include "ARM.h"
-#include "ARMTargetMachine.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
-#include "llvm/Target/TargetAsmLexer.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallVector.h"
@@ -30,7 +29,7 @@ using namespace llvm;
namespace {
-class ARMBaseAsmLexer : public TargetAsmLexer {
+class ARMBaseAsmLexer : public MCTargetAsmLexer {
const MCAsmInfo &AsmInfo;
const AsmToken &lexDefinite() {
@@ -43,7 +42,7 @@ protected:
rmap_ty RegisterMap;
- void InitRegisterMap(const TargetRegisterInfo *info) {
+ void InitRegisterMap(const MCRegisterInfo *info) {
unsigned numRegs = info->getNumRegs();
for (unsigned i = 0; i < numRegs; ++i) {
@@ -77,33 +76,23 @@ protected:
}
public:
ARMBaseAsmLexer(const Target &T, const MCAsmInfo &MAI)
- : TargetAsmLexer(T), AsmInfo(MAI) {
+ : MCTargetAsmLexer(T), AsmInfo(MAI) {
}
};
class ARMAsmLexer : public ARMBaseAsmLexer {
public:
- ARMAsmLexer(const Target &T, const MCAsmInfo &MAI)
+ ARMAsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI)
: ARMBaseAsmLexer(T, MAI) {
- std::string tripleString("arm-unknown-unknown");
- std::string featureString;
- std::string CPU;
- OwningPtr<const TargetMachine>
- targetMachine(T.createTargetMachine(tripleString, CPU, featureString));
- InitRegisterMap(targetMachine->getRegisterInfo());
+ InitRegisterMap(&MRI);
}
};
class ThumbAsmLexer : public ARMBaseAsmLexer {
public:
- ThumbAsmLexer(const Target &T, const MCAsmInfo &MAI)
+ ThumbAsmLexer(const Target &T, const MCRegisterInfo &MRI,const MCAsmInfo &MAI)
: ARMBaseAsmLexer(T, MAI) {
- std::string tripleString("thumb-unknown-unknown");
- std::string featureString;
- std::string CPU;
- OwningPtr<const TargetMachine>
- targetMachine(T.createTargetMachine(tripleString, CPU, featureString));
- InitRegisterMap(targetMachine->getRegisterInfo());
+ InitRegisterMap(&MRI);
}
};
@@ -149,6 +138,6 @@ AsmToken ARMBaseAsmLexer::LexTokenUAL() {
}
extern "C" void LLVMInitializeARMAsmLexer() {
- RegisterAsmLexer<ARMAsmLexer> X(TheARMTarget);
- RegisterAsmLexer<ThumbAsmLexer> Y(TheThumbTarget);
+ RegisterMCAsmLexer<ARMAsmLexer> X(TheARMTarget);
+ RegisterMCAsmLexer<ThumbAsmLexer> Y(TheThumbTarget);
}
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index a474127..24f15b4 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -7,11 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "ARM.h"
-#include "ARMAddressingModes.h"
-#include "ARMMCExpr.h"
-#include "ARMBaseRegisterInfo.h"
-#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -20,12 +18,17 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
@@ -37,49 +40,65 @@ namespace {
class ARMOperand;
-class ARMAsmParser : public TargetAsmParser {
+class ARMAsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
+ struct {
+ ARMCC::CondCodes Cond; // Condition for IT block.
+ unsigned Mask:4; // Condition mask for instructions.
+ // Starting at first 1 (from lsb).
+ // '1' condition as indicated in IT.
+ // '0' inverse of condition (else).
+ // Count of instructions in IT block is
+ // 4 - trailingzeroes(mask)
+
+ bool FirstCond; // Explicit flag for when we're parsing the
+ // first instruction in the IT block. It's
+ // implied in the mask, so needs special
+ // handling.
+
+ unsigned CurPosition; // Current position in parsing of IT
+ // block. In range [0,3]. Initialized
+ // according to count of instructions in block.
+ // ~0U if no active IT block.
+ } ITState;
+ bool inITBlock() { return ITState.CurPosition != ~0U; }
+ void forwardITPosition() {
+ if (!inITBlock()) return;
+ // Move to the next instruction in the IT block, if there is one. If not,
+ // mark the block as done.
+ unsigned TZ = CountTrailingZeros_32(ITState.Mask);
+ if (++ITState.CurPosition == 5 - TZ)
+ ITState.CurPosition = ~0U; // Done with the IT block after this.
+ }
+
+
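The length rule buried in the comments above ("4 - trailingzeroes(mask)") in standalone form, as a sketch rather than the parser's own code:

```cpp
// The 4-bit IT mask carries a terminating '1' after the condition bits, so
// the block covers 4 - trailing_zeros(mask) instructions.
static unsigned itBlockLength(unsigned Mask4) {
  unsigned TZ = 0;
  while (TZ < 4 && ((Mask4 >> TZ) & 1) == 0)
    ++TZ;                 // Mask4 is nonzero for any well-formed IT
  return 4 - TZ;
}
// itBlockLength(0x8) == 1 (a plain "it"); a mask with bit 0 set covers 4,
// which is why forwardITPosition retires the block at position 5 - TZ.
```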
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
- int TryParseRegister();
- virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
- bool TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &);
- int TryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &);
- bool ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &);
- bool ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &,
- ARMII::AddrMode AddrMode);
- bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic);
- bool ParsePrefix(ARMMCExpr::VariantKind &RefKind);
- const MCExpr *ApplyPrefixToExpr(const MCExpr *E,
- MCSymbolRefExpr::VariantKind Variant);
-
-
- bool ParseMemoryOffsetReg(bool &Negative,
- bool &OffsetRegShifted,
- enum ARM_AM::ShiftOpc &ShiftType,
- const MCExpr *&ShiftAmount,
- const MCExpr *&Offset,
- bool &OffsetIsReg,
- int &OffsetRegNum,
- SMLoc &E);
- bool ParseShift(enum ARM_AM::ShiftOpc &St,
- const MCExpr *&ShiftAmount, SMLoc &E);
- bool ParseDirectiveWord(unsigned Size, SMLoc L);
- bool ParseDirectiveThumb(SMLoc L);
- bool ParseDirectiveThumbFunc(SMLoc L);
- bool ParseDirectiveCode(SMLoc L);
- bool ParseDirectiveSyntax(SMLoc L);
-
- bool MatchAndEmitInstruction(SMLoc IDLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out);
- void GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+ int tryParseRegister();
+ bool tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &);
+ int tryParseShiftRegister(SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &, StringRef Mnemonic);
+ bool parsePrefix(ARMMCExpr::VariantKind &RefKind);
+ bool parseMemRegOffsetShift(ARM_AM::ShiftOpc &ShiftType,
+ unsigned &ShiftAmount);
+ bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDirectiveThumb(SMLoc L);
+ bool parseDirectiveThumbFunc(SMLoc L);
+ bool parseDirectiveCode(SMLoc L);
+ bool parseDirectiveSyntax(SMLoc L);
+
+ StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
+ bool &CarrySetting, unsigned &ProcessorIMod,
+ StringRef &ITMask);
+ void getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
bool &CanAcceptPredicationCode);
bool isThumb() const {
@@ -89,10 +108,22 @@ class ARMAsmParser : public TargetAsmParser {
bool isThumbOne() const {
return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2) == 0;
}
+ bool isThumbTwo() const {
+ return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2);
+ }
+ bool hasV6Ops() const {
+ return STI.getFeatureBits() & ARM::HasV6Ops;
+ }
+ bool hasV7Ops() const {
+ return STI.getFeatureBits() & ARM::HasV7Ops;
+ }
void SwitchMode() {
unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
setAvailableFeatures(FB);
}
+ bool isMClass() const {
+ return STI.getFeatureBits() & ARM::FeatureMClass;
+ }
/// @name Auto-generated Match Functions
/// {
@@ -102,43 +133,108 @@ class ARMAsmParser : public TargetAsmParser {
/// }
- OperandMatchResultTy tryParseCoprocNumOperand(
- SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy tryParseCoprocRegOperand(
+ OperandMatchResultTy parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseCoprocNumOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy tryParseMemBarrierOptOperand(
+ OperandMatchResultTy parseCoprocRegOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy tryParseProcIFlagsOperand(
+ OperandMatchResultTy parseCoprocOptionOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy tryParseMSRMaskOperand(
+ OperandMatchResultTy parseMemBarrierOptOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy tryParseMemMode2Operand(
+ OperandMatchResultTy parseProcIFlagsOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
- OperandMatchResultTy tryParseMemMode3Operand(
+ OperandMatchResultTy parseMSRMaskOperand(
SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &O,
+ StringRef Op, int Low, int High);
+ OperandMatchResultTy parsePKHLSLImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+ return parsePKHImm(O, "lsl", 0, 31);
+ }
+ OperandMatchResultTy parsePKHASRImm(SmallVectorImpl<MCParsedAsmOperand*> &O) {
+ return parsePKHImm(O, "asr", 1, 32);
+ }
+ OperandMatchResultTy parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseRotImm(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseBitfield(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*>&);
+ OperandMatchResultTy parseFPImm(SmallVectorImpl<MCParsedAsmOperand*>&);
// Asm Match Converter Methods
- bool CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+ bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+ bool cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ bool cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ bool cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtLdrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtStrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+ bool cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &);
+
+ bool validateInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+ void processInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
+ bool shouldOmitCCOutOperand(StringRef Mnemonic,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
public:
+ enum ARMMatchResultTy {
+ Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
+ Match_RequiresNotITBlock,
+ Match_RequiresV6,
+ Match_RequiresThumb2
+ };
+
ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
- : TargetAsmParser(), STI(_STI), Parser(_Parser) {
+ : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
MCAsmParserExtension::Initialize(_Parser);
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+
+ // Not in an ITBlock to start with.
+ ITState.CurPosition = ~0U;
}
- virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
- virtual bool ParseDirective(AsmToken DirectiveID);
+ // Implementation of the MCTargetAsmParser interface:
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+ bool ParseInstruction(StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ bool ParseDirective(AsmToken DirectiveID);
+
+ unsigned checkTargetMatchPredicate(MCInst &Inst);
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ MCStreamer &Out);
};
} // end anonymous namespace
@@ -148,22 +244,30 @@ namespace {
/// instruction.
class ARMOperand : public MCParsedAsmOperand {
enum KindTy {
- CondCode,
- CCOut,
- CoprocNum,
- CoprocReg,
- Immediate,
- MemBarrierOpt,
- Memory,
- MSRMask,
- ProcIFlags,
- Register,
- RegisterList,
- DPRRegisterList,
- SPRRegisterList,
- ShiftedRegister,
- Shifter,
- Token
+ k_CondCode,
+ k_CCOut,
+ k_ITCondMask,
+ k_CoprocNum,
+ k_CoprocReg,
+ k_CoprocOption,
+ k_Immediate,
+ k_FPImmediate,
+ k_MemBarrierOpt,
+ k_Memory,
+ k_PostIndexRegister,
+ k_MSRMask,
+ k_ProcIFlags,
+ k_VectorIndex,
+ k_Register,
+ k_RegisterList,
+ k_DPRRegisterList,
+ k_SPRRegisterList,
+ k_ShiftedRegister,
+ k_ShiftedImmediate,
+ k_ShifterImmediate,
+ k_RotateImmediate,
+ k_BitfieldDescriptor,
+ k_Token
} Kind;
SMLoc StartLoc, EndLoc;
@@ -175,12 +279,20 @@ class ARMOperand : public MCParsedAsmOperand {
} CC;
struct {
- ARM_MB::MemBOpt Val;
- } MBOpt;
+ unsigned Val;
+ } Cop;
struct {
unsigned Val;
- } Cop;
+ } CoprocOption;
+
+ struct {
+ unsigned Mask:4;
+ } ITMask;
+
+ struct {
+ ARM_MB::MemBOpt Val;
+ } MBOpt;
struct {
ARM_PROC::IFlags Val;
@@ -200,37 +312,60 @@ class ARMOperand : public MCParsedAsmOperand {
} Reg;
struct {
+ unsigned Val;
+ } VectorIndex;
+
+ struct {
const MCExpr *Val;
} Imm;
+ struct {
+ unsigned Val; // encoded 8-bit representation
+ } FPImm;
+
/// Combined record for all forms of ARM address expressions.
struct {
- ARMII::AddrMode AddrMode;
unsigned BaseRegNum;
- union {
- unsigned RegNum; ///< Offset register num, when OffsetIsReg.
- const MCExpr *Value; ///< Offset value, when !OffsetIsReg.
- } Offset;
- const MCExpr *ShiftAmount; // used when OffsetRegShifted is true
- enum ARM_AM::ShiftOpc ShiftType; // used when OffsetRegShifted is true
- unsigned OffsetRegShifted : 1; // only used when OffsetIsReg is true
- unsigned Preindexed : 1;
- unsigned Postindexed : 1;
- unsigned OffsetIsReg : 1;
- unsigned Negative : 1; // only used when OffsetIsReg is true
- unsigned Writeback : 1;
- } Mem;
+ // Offset is in OffsetReg or OffsetImm. If both are zero, no offset
+ // was specified.
+ const MCConstantExpr *OffsetImm; // Offset immediate value
+ unsigned OffsetRegNum; // Offset register num, when OffsetImm == NULL
+ ARM_AM::ShiftOpc ShiftType; // Shift type for OffsetReg
+ unsigned ShiftImm; // shift for OffsetReg.
+ unsigned Alignment; // 0 = no alignment specified
+ // n = alignment in bytes (8, 16, or 32)
+ unsigned isNegative : 1; // Negated OffsetReg? (~'U' bit)
+ } Memory;
struct {
+ unsigned RegNum;
+ bool isAdd;
ARM_AM::ShiftOpc ShiftTy;
+ unsigned ShiftImm;
+ } PostIdxReg;
+
+ struct {
+ bool isASR;
unsigned Imm;
- } Shift;
+ } ShifterImm;
struct {
ARM_AM::ShiftOpc ShiftTy;
unsigned SrcReg;
unsigned ShiftReg;
unsigned ShiftImm;
- } ShiftedReg;
+ } RegShiftedReg;
+ struct {
+ ARM_AM::ShiftOpc ShiftTy;
+ unsigned SrcReg;
+ unsigned ShiftImm;
+ } RegShiftedImm;
+ struct {
+ unsigned Imm;
+ } RotImm;
+ struct {
+ unsigned LSB;
+ unsigned Width;
+ } Bitfield;
};
ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -240,45 +375,69 @@ public:
StartLoc = o.StartLoc;
EndLoc = o.EndLoc;
switch (Kind) {
- case CondCode:
+ case k_CondCode:
CC = o.CC;
break;
- case Token:
+ case k_ITCondMask:
+ ITMask = o.ITMask;
+ break;
+ case k_Token:
Tok = o.Tok;
break;
- case CCOut:
- case Register:
+ case k_CCOut:
+ case k_Register:
Reg = o.Reg;
break;
- case RegisterList:
- case DPRRegisterList:
- case SPRRegisterList:
+ case k_RegisterList:
+ case k_DPRRegisterList:
+ case k_SPRRegisterList:
Registers = o.Registers;
break;
- case CoprocNum:
- case CoprocReg:
+ case k_CoprocNum:
+ case k_CoprocReg:
Cop = o.Cop;
break;
- case Immediate:
+ case k_CoprocOption:
+ CoprocOption = o.CoprocOption;
+ break;
+ case k_Immediate:
Imm = o.Imm;
break;
- case MemBarrierOpt:
+ case k_FPImmediate:
+ FPImm = o.FPImm;
+ break;
+ case k_MemBarrierOpt:
MBOpt = o.MBOpt;
break;
- case Memory:
- Mem = o.Mem;
+ case k_Memory:
+ Memory = o.Memory;
+ break;
+ case k_PostIndexRegister:
+ PostIdxReg = o.PostIdxReg;
break;
- case MSRMask:
+ case k_MSRMask:
MMask = o.MMask;
break;
- case ProcIFlags:
+ case k_ProcIFlags:
IFlags = o.IFlags;
break;
- case Shifter:
- Shift = o.Shift;
+ case k_ShifterImmediate:
+ ShifterImm = o.ShifterImm;
+ break;
+ case k_ShiftedRegister:
+ RegShiftedReg = o.RegShiftedReg;
+ break;
+ case k_ShiftedImmediate:
+ RegShiftedImm = o.RegShiftedImm;
break;
- case ShiftedRegister:
- ShiftedReg = o.ShiftedReg;
+ case k_RotateImmediate:
+ RotImm = o.RotImm;
+ break;
+ case k_BitfieldDescriptor:
+ Bitfield = o.Bitfield;
+ break;
+ case k_VectorIndex:
+ VectorIndex = o.VectorIndex;
break;
}
}
@@ -289,94 +448,96 @@ public:
SMLoc getEndLoc() const { return EndLoc; }
ARMCC::CondCodes getCondCode() const {
- assert(Kind == CondCode && "Invalid access!");
+ assert(Kind == k_CondCode && "Invalid access!");
return CC.Val;
}
unsigned getCoproc() const {
- assert((Kind == CoprocNum || Kind == CoprocReg) && "Invalid access!");
+ assert((Kind == k_CoprocNum || Kind == k_CoprocReg) && "Invalid access!");
return Cop.Val;
}
StringRef getToken() const {
- assert(Kind == Token && "Invalid access!");
+ assert(Kind == k_Token && "Invalid access!");
return StringRef(Tok.Data, Tok.Length);
}
unsigned getReg() const {
- assert((Kind == Register || Kind == CCOut) && "Invalid access!");
+ assert((Kind == k_Register || Kind == k_CCOut) && "Invalid access!");
return Reg.RegNum;
}
const SmallVectorImpl<unsigned> &getRegList() const {
- assert((Kind == RegisterList || Kind == DPRRegisterList ||
- Kind == SPRRegisterList) && "Invalid access!");
+ assert((Kind == k_RegisterList || Kind == k_DPRRegisterList ||
+ Kind == k_SPRRegisterList) && "Invalid access!");
return Registers;
}
const MCExpr *getImm() const {
- assert(Kind == Immediate && "Invalid access!");
+ assert(Kind == k_Immediate && "Invalid access!");
return Imm.Val;
}
+ unsigned getFPImm() const {
+ assert(Kind == k_FPImmediate && "Invalid access!");
+ return FPImm.Val;
+ }
+
+ unsigned getVectorIndex() const {
+ assert(Kind == k_VectorIndex && "Invalid access!");
+ return VectorIndex.Val;
+ }
+
ARM_MB::MemBOpt getMemBarrierOpt() const {
- assert(Kind == MemBarrierOpt && "Invalid access!");
+ assert(Kind == k_MemBarrierOpt && "Invalid access!");
return MBOpt.Val;
}
ARM_PROC::IFlags getProcIFlags() const {
- assert(Kind == ProcIFlags && "Invalid access!");
+ assert(Kind == k_ProcIFlags && "Invalid access!");
return IFlags.Val;
}
unsigned getMSRMask() const {
- assert(Kind == MSRMask && "Invalid access!");
+ assert(Kind == k_MSRMask && "Invalid access!");
return MMask.Val;
}
- /// @name Memory Operand Accessors
- /// @{
- ARMII::AddrMode getMemAddrMode() const {
- return Mem.AddrMode;
- }
- unsigned getMemBaseRegNum() const {
- return Mem.BaseRegNum;
- }
- unsigned getMemOffsetRegNum() const {
- assert(Mem.OffsetIsReg && "Invalid access!");
- return Mem.Offset.RegNum;
- }
- const MCExpr *getMemOffset() const {
- assert(!Mem.OffsetIsReg && "Invalid access!");
- return Mem.Offset.Value;
- }
- unsigned getMemOffsetRegShifted() const {
- assert(Mem.OffsetIsReg && "Invalid access!");
- return Mem.OffsetRegShifted;
+ bool isCoprocNum() const { return Kind == k_CoprocNum; }
+ bool isCoprocReg() const { return Kind == k_CoprocReg; }
+ bool isCoprocOption() const { return Kind == k_CoprocOption; }
+ bool isCondCode() const { return Kind == k_CondCode; }
+ bool isCCOut() const { return Kind == k_CCOut; }
+ bool isITMask() const { return Kind == k_ITCondMask; }
+ bool isITCondCode() const { return Kind == k_CondCode; }
+ bool isImm() const { return Kind == k_Immediate; }
+ bool isFPImm() const { return Kind == k_FPImmediate; }
+ bool isImm8s4() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
}
- const MCExpr *getMemShiftAmount() const {
- assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!");
- return Mem.ShiftAmount;
+ bool isImm0_1020s4() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
}
- enum ARM_AM::ShiftOpc getMemShiftType() const {
- assert(Mem.OffsetIsReg && Mem.OffsetRegShifted && "Invalid access!");
- return Mem.ShiftType;
+ bool isImm0_508s4() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
}
- bool getMemPreindexed() const { return Mem.Preindexed; }
- bool getMemPostindexed() const { return Mem.Postindexed; }
- bool getMemOffsetIsReg() const { return Mem.OffsetIsReg; }
- bool getMemNegative() const { return Mem.Negative; }
- bool getMemWriteback() const { return Mem.Writeback; }
-
- /// @}
-
- bool isCoprocNum() const { return Kind == CoprocNum; }
- bool isCoprocReg() const { return Kind == CoprocReg; }
- bool isCondCode() const { return Kind == CondCode; }
- bool isCCOut() const { return Kind == CCOut; }
- bool isImm() const { return Kind == Immediate; }
bool isImm0_255() const {
- if (Kind != Immediate)
+ if (Kind != k_Immediate)
return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
@@ -384,7 +545,7 @@ public:
return Value >= 0 && Value < 256;
}
bool isImm0_7() const {
- if (Kind != Immediate)
+ if (Kind != k_Immediate)
return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
@@ -392,130 +553,365 @@ public:
return Value >= 0 && Value < 8;
}
bool isImm0_15() const {
- if (Kind != Immediate)
+ if (Kind != k_Immediate)
return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
return Value >= 0 && Value < 16;
}
+ bool isImm0_31() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value >= 0 && Value < 32;
+ }
+ bool isImm1_16() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value > 0 && Value < 17;
+ }
+ bool isImm1_32() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value > 0 && Value < 33;
+ }
bool isImm0_65535() const {
- if (Kind != Immediate)
+ if (Kind != k_Immediate)
return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
return Value >= 0 && Value < 65536;
}
- bool isT2SOImm() const {
- if (Kind != Immediate)
+ bool isImm0_65535Expr() const {
+ if (Kind != k_Immediate)
return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
+ // If it's not a constant expression, it'll generate a fixup and be
+ // handled later.
+ if (!CE) return true;
int64_t Value = CE->getValue();
- return ARM_AM::getT2SOImmVal(Value) != -1;
+ return Value >= 0 && Value < 65536;
}
- bool isReg() const { return Kind == Register; }
- bool isRegList() const { return Kind == RegisterList; }
- bool isDPRRegList() const { return Kind == DPRRegisterList; }
- bool isSPRRegList() const { return Kind == SPRRegisterList; }
- bool isToken() const { return Kind == Token; }
- bool isMemBarrierOpt() const { return Kind == MemBarrierOpt; }
- bool isMemory() const { return Kind == Memory; }
- bool isShifter() const { return Kind == Shifter; }
- bool isShiftedReg() const { return Kind == ShiftedRegister; }
- bool isMemMode2() const {
- if (getMemAddrMode() != ARMII::AddrMode2)
+ bool isImm24bit() const {
+ if (Kind != k_Immediate)
return false;
-
- if (getMemOffsetIsReg())
- return true;
-
- if (getMemNegative() &&
- !(getMemPostindexed() || getMemPreindexed()))
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value >= 0 && Value <= 0xffffff;
+ }
+ bool isImmThumbSR() const {
+ if (Kind != k_Immediate)
return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
-
- // The offset must be in the range 0-4095 (imm12).
- if (Value > 4095 || Value < -4095)
+ return Value > 0 && Value < 33;
+ }
+ bool isPKHLSLImm() const {
+ if (Kind != k_Immediate)
return false;
-
- return true;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value >= 0 && Value < 32;
}
- bool isMemMode3() const {
- if (getMemAddrMode() != ARMII::AddrMode3)
+ bool isPKHASRImm() const {
+ if (Kind != k_Immediate)
return false;
-
- if (getMemOffsetIsReg()) {
- if (getMemOffsetRegShifted())
- return false; // No shift with offset reg allowed
- return true;
- }
-
- if (getMemNegative() &&
- !(getMemPostindexed() || getMemPreindexed()))
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return Value > 0 && Value <= 32;
+ }
+ bool isARMSOImm() const {
+ if (Kind != k_Immediate)
return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
int64_t Value = CE->getValue();
-
- // The offset must be in the range 0-255 (imm8).
- if (Value > 255 || Value < -255)
+ return ARM_AM::getSOImmVal(Value) != -1;
+ }
+ bool isT2SOImm() const {
+ if (Kind != k_Immediate)
return false;
-
- return true;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Value = CE->getValue();
+ return ARM_AM::getT2SOImmVal(Value) != -1;
}
- bool isMemMode5() const {
- if (!isMemory() || getMemOffsetIsReg() || getMemWriteback() ||
- getMemNegative())
+ bool isSetEndImm() const {
+ if (Kind != k_Immediate)
return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
-
- // The offset must be a multiple of 4 in the range 0-1020.
int64_t Value = CE->getValue();
- return ((Value & 0x3) == 0 && Value <= 1020 && Value >= -1020);
- }
- bool isMemMode7() const {
- if (!isMemory() ||
- getMemPreindexed() ||
- getMemPostindexed() ||
- getMemOffsetIsReg() ||
- getMemNegative() ||
- getMemWriteback())
+ return Value == 1 || Value == 0;
+ }
+ bool isReg() const { return Kind == k_Register; }
+ bool isRegList() const { return Kind == k_RegisterList; }
+ bool isDPRRegList() const { return Kind == k_DPRRegisterList; }
+ bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
+ bool isToken() const { return Kind == k_Token; }
+ bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+ bool isMemory() const { return Kind == k_Memory; }
+ bool isShifterImm() const { return Kind == k_ShifterImmediate; }
+ bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
+ bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
+ bool isRotImm() const { return Kind == k_RotateImmediate; }
+ bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
+ bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
+ bool isPostIdxReg() const {
+ return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy == ARM_AM::no_shift;
+ }
+ bool isMemNoOffset(bool alignOK = false) const {
+ if (!isMemory())
return false;
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+ // No offset of any kind.
+ return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 &&
+ (alignOK || Memory.Alignment == 0);
+ }
+ bool isAlignedMemory() const {
+ return isMemNoOffset(true);
+ }
+ bool isAddrMode2() const {
+ if (!isMemory() || Memory.Alignment != 0) return false;
+ // Check for register offset.
+ if (Memory.OffsetRegNum) return true;
+ // Immediate offset in range [-4095, 4095].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val > -4096 && Val < 4096;
+ }
+ bool isAM2OffsetImm() const {
+ if (Kind != k_Immediate)
+ return false;
+ // Immediate offset in range [-4095, 4095].
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
-
- if (CE->getValue())
+ int64_t Val = CE->getValue();
+ return Val > -4096 && Val < 4096;
+ }
+ bool isAddrMode3() const {
+ if (!isMemory() || Memory.Alignment != 0) return false;
+ // No shifts are legal for AM3.
+ if (Memory.ShiftType != ARM_AM::no_shift) return false;
+ // Check for register offset.
+ if (Memory.OffsetRegNum) return true;
+ // Immediate offset in range [-255, 255].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val > -256 && Val < 256;
+ }
+ bool isAM3Offset() const {
+ if (Kind != k_Immediate && Kind != k_PostIndexRegister)
+ return false;
+ if (Kind == k_PostIndexRegister)
+ return PostIdxReg.ShiftTy == ARM_AM::no_shift;
+ // Immediate offset in range [-255, 255].
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Val = CE->getValue();
+ // Special case: #-0 is represented as INT32_MIN.
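+ // (#-0 and #0 encode differently via the U add/sub bit, so the parser
+ // needs a sentinel that cannot collide with a real offset value.)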
+ return (Val > -256 && Val < 256) || Val == INT32_MIN;
+ }
+ bool isAddrMode5() const {
+ if (!isMemory() || Memory.Alignment != 0) return false;
+ // Check for register offset.
+ if (Memory.OffsetRegNum) return false;
+ // Immediate offset in range [-1020, 1020] and a multiple of 4.
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return (Val >= -1020 && Val <= 1020 && ((Val & 3) == 0)) ||
+ Val == INT32_MIN;
+ }
+ bool isMemTBB() const {
+ if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+ return false;
+ return true;
+ }
+ bool isMemTBH() const {
+ if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
+ Memory.Alignment != 0)
+ return false;
+ return true;
+ }
+ bool isMemRegOffset() const {
+ if (!isMemory() || !Memory.OffsetRegNum || Memory.Alignment != 0)
return false;
-
return true;
}
- bool isMemModeRegThumb() const {
- if (!isMemory() || !getMemOffsetIsReg() || getMemWriteback())
+ bool isT2MemRegOffset() const {
+ if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ Memory.Alignment != 0)
+ return false;
+ // Only lsl #{0, 1, 2, 3} allowed.
+ if (Memory.ShiftType == ARM_AM::no_shift)
+ return true;
+ if (Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm > 3)
return false;
return true;
}
- bool isMemModeImmThumb() const {
- if (!isMemory() || getMemOffsetIsReg() || getMemWriteback())
+ bool isMemThumbRR() const {
+ // Thumb reg+reg addressing is simple. Just two registers, a base and
+ // an offset. No shifts, negations or any other complicating factors.
+ if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
+ return false;
+ return isARMLowRegister(Memory.BaseRegNum) &&
+ (!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
+ }
+ bool isMemThumbRIs4() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+ return false;
+ // Immediate offset, multiple of 4 in range [0, 124].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val >= 0 && Val <= 124 && (Val % 4) == 0;
+ }
+ bool isMemThumbRIs2() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+ return false;
+ // Immediate offset, multiple of 2 in range [0, 62].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val >= 0 && Val <= 62 && (Val % 2) == 0;
+ }
+ bool isMemThumbRIs1() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
+ return false;
+ // Immediate offset in range [0, 31].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val >= 0 && Val <= 31;
+ }
+ bool isMemThumbSPI() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
+ return false;
+ // Immediate offset, multiple of 4 in range [0, 1020].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val >= 0 && Val <= 1020 && (Val % 4) == 0;
+ }
+ bool isMemImm8s4Offset() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ return false;
+ // Immediate offset a multiple of 4 in range [-1020, 1020].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val >= -1020 && Val <= 1020 && (Val & 3) == 0;
+ }
+ bool isMemImm0_1020s4Offset() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ return false;
+ // Immediate offset a multiple of 4 in range [0, 1020].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
+ }
+ bool isMemImm8Offset() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ return false;
+ // Immediate offset in range [-255, 255].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return (Val == INT32_MIN) || (Val > -256 && Val < 256);
+ }
+ bool isMemPosImm8Offset() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ return false;
+ // Immediate offset in range [0, 255].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val >= 0 && Val < 256;
+ }
+ bool isMemNegImm8Offset() const {
+ if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ return false;
+ // Immediate offset in range [-255, -1].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return Val > -256 && Val < 0;
+ }
+ bool isMemUImm12Offset() const {
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
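+ // (For example, a pc-relative "ldr r0, label" typically arrives here as
+ // a bare expression.)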
+ if (Kind == k_Immediate && !isa<MCConstantExpr>(getImm()))
+ return true;
+
+ if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
+ // Immediate offset in range [0, 4095].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return (Val >= 0 && Val < 4096);
+ }
+ bool isMemImm12Offset() const {
+ // If we have an immediate that's not a constant, treat it as a label
+ // reference needing a fixup. If it is a constant, it's something else
+ // and we reject it.
+ if (Kind == k_Immediate && !isa<MCConstantExpr>(getImm()))
+ return true;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
+ if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ return false;
+ // Immediate offset in range [-4095, 4095].
+ if (!Memory.OffsetImm) return true;
+ int64_t Val = Memory.OffsetImm->getValue();
+ return (Val > -4096 && Val < 4096) || (Val == INT32_MIN);
+ }
+ bool isPostIdxImm8() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE) return false;
+ int64_t Val = CE->getValue();
+ return (Val > -256 && Val < 256) || (Val == INT32_MIN);
+ }
+ bool isPostIdxImm8s4() const {
+ if (Kind != k_Immediate)
+ return false;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
+ int64_t Val = CE->getValue();
+ return ((Val & 3) == 0 && Val >= -1020 && Val <= 1020) ||
+ (Val == INT32_MIN);
+ }
- // The offset must be a multiple of 4 in the range 0-124.
- uint64_t Value = CE->getValue();
- return ((Value & 0x3) == 0 && Value <= 124);
+ bool isMSRMask() const { return Kind == k_MSRMask; }
+ bool isProcIFlags() const { return Kind == k_ProcIFlags; }
+
+ bool isVectorIndex8() const {
+ if (Kind != k_VectorIndex) return false;
+ return VectorIndex.Val < 8;
+ }
+ bool isVectorIndex16() const {
+ if (Kind != k_VectorIndex) return false;
+ return VectorIndex.Val < 4;
}
- bool isMSRMask() const { return Kind == MSRMask; }
- bool isProcIFlags() const { return Kind == ProcIFlags; }
+ bool isVectorIndex32() const {
+ if (Kind != k_VectorIndex) return false;
+ return VectorIndex.Val < 2;
+ }
+
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
// Add as immediates when possible. Null MCExpr = 0.
@@ -544,6 +940,21 @@ public:
Inst.addOperand(MCOperand::CreateImm(getCoproc()));
}
+ void addCoprocOptionOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(CoprocOption.Val));
+ }
+
+ void addITMaskOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(ITMask.Mask));
+ }
+
+ void addITCondCodeOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(unsigned(getCondCode())));
+ }
+
void addCCOutOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateReg(getReg()));
@@ -554,22 +965,27 @@ public:
Inst.addOperand(MCOperand::CreateReg(getReg()));
}
- void addShiftedRegOperands(MCInst &Inst, unsigned N) const {
+ void addRegShiftedRegOperands(MCInst &Inst, unsigned N) const {
assert(N == 3 && "Invalid number of operands!");
- assert(isShiftedReg() && "addShiftedRegOperands() on non ShiftedReg!");
- assert((ShiftedReg.ShiftReg == 0 ||
- ARM_AM::getSORegOffset(ShiftedReg.ShiftImm) == 0) &&
- "Invalid shifted register operand!");
- Inst.addOperand(MCOperand::CreateReg(ShiftedReg.SrcReg));
- Inst.addOperand(MCOperand::CreateReg(ShiftedReg.ShiftReg));
+ assert(isRegShiftedReg() && "addRegShiftedRegOperands() on non RegShiftedReg!");
+ Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.SrcReg));
+ Inst.addOperand(MCOperand::CreateReg(RegShiftedReg.ShiftReg));
Inst.addOperand(MCOperand::CreateImm(
- ARM_AM::getSORegOpc(ShiftedReg.ShiftTy, ShiftedReg.ShiftImm)));
+ ARM_AM::getSORegOpc(RegShiftedReg.ShiftTy, RegShiftedReg.ShiftImm)));
}
- void addShifterOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
+ void addRegShiftedImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ assert(isRegShiftedImm() && "addRegShiftedImmOperands() on non RegShiftedImm!");
+ Inst.addOperand(MCOperand::CreateReg(RegShiftedImm.SrcReg));
Inst.addOperand(MCOperand::CreateImm(
- ARM_AM::getSORegOpc(Shift.ShiftTy, 0)));
+ ARM_AM::getSORegOpc(RegShiftedImm.ShiftTy, RegShiftedImm.ShiftImm)));
+ }
+
+ void addShifterImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm((ShifterImm.isASR << 5) |
+ ShifterImm.Imm));
}
void addRegListOperands(MCInst &Inst, unsigned N) const {
@@ -588,11 +1004,57 @@ public:
addRegListOperands(Inst, N);
}
+ void addRotImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // Encoded as val>>3. The printer handles display as 8, 16, 24.
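+ // For example, "ror #16" is stored as 16 and added here as 16 >> 3 == 2.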
+ Inst.addOperand(MCOperand::CreateImm(RotImm.Imm >> 3));
+ }
+
+ void addBitfieldOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // Munge the lsb/width into a bitfield mask.
+ unsigned lsb = Bitfield.LSB;
+ unsigned width = Bitfield.Width;
+ // Make a 32-bit mask w/ the referenced bits clear and all other bits set.
+ uint32_t Mask = ~(((uint32_t)0xffffffff >> lsb) << (32 - width) >>
+ (32 - (lsb + width)));
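+ // For example, lsb = 8 and width = 4 give Mask == 0xfffff0ff: bits
+ // [11:8] clear and all other bits set, which is the BFC/BFI mask form.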
+ Inst.addOperand(MCOperand::CreateImm(Mask));
+ }
+
void addImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
}
+ void addFPImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getFPImm()));
+ }
+
+ void addImm8s4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // FIXME: We really want to scale the value here, but the LDRD/STRD
+ // instructions don't encode operands that way yet.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ }
+
+ void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // The immediate is scaled by four in the encoding and is stored
+ // in the MCInst as such. Lop off the low two bits here.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+ }
+
+ void addImm0_508s4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // The immediate is scaled by four in the encoding and is stored
+ // in the MCInst as such. Lop off the low two bits here.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue() / 4));
+ }
+
void addImm0_255Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
@@ -608,137 +1070,344 @@ public:
addExpr(Inst, getImm());
}
+ void addImm0_31Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // The constant encodes as the immediate-1, and we store in the instruction
+ // the bits as encoded, so subtract off one here.
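+ // For example, #16 is added as 15.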
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+ }
+
+ void addImm1_32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // The constant encodes as the immediate-1, and we store in the instruction
+ // the bits as encoded, so subtract off one here.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+ }
+
void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
}
+ void addImm0_65535ExprOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addImm24bitOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addImmThumbSROperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // The constant encodes as the immediate, except for 32, which encodes as
+ // zero.
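+ // For example, a shift of #32 is added as 0; #5 is added as 5.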
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ unsigned Imm = CE->getValue();
+ Inst.addOperand(MCOperand::CreateImm((Imm == 32 ? 0 : Imm)));
+ }
+
+ void addPKHLSLImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
+ void addPKHASRImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ // An ASR value of 32 encodes as 0, so that's how we want to add it to
+ // the instruction as well.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ int Val = CE->getValue();
+ Inst.addOperand(MCOperand::CreateImm(Val == 32 ? 0 : Val));
+ }
+
+ void addARMSOImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
void addT2SOImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
addExpr(Inst, getImm());
}
+ void addSetEndImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ addExpr(Inst, getImm());
+ }
+
void addMemBarrierOptOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
}
- void addMemMode7Operands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && isMemMode7() && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
-
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
- (void)CE;
- assert((CE || CE->getValue() == 0) &&
- "No offset operand support in mode 7");
+ void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
}
- void addMemMode2Operands(MCInst &Inst, unsigned N) const {
- assert(isMemMode2() && "Invalid mode or number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
- unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1);
+ void addAlignedMemoryOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Memory.Alignment));
+ }
- if (getMemOffsetIsReg()) {
- Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum()));
+ void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ if (!Memory.OffsetRegNum) {
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == INT32_MIN) Val = 0;
+ if (Val < 0) Val = -Val;
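+ // getAM2Opc packs this as imm12 | (isSub << 12) | (shift opc << 13);
+ // see ARM_AM::getAM2Opc in ARMAddressingModes.h.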
+ Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+ } else {
+ // For register offset, we encode the shift type and negation flag
+ // here.
+ Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+ Memory.ShiftImm, Memory.ShiftType);
+ }
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- ARM_AM::AddrOpc AMOpc = getMemNegative() ? ARM_AM::sub : ARM_AM::add;
- ARM_AM::ShiftOpc ShOpc = ARM_AM::no_shift;
- int64_t ShiftAmount = 0;
+ void addAM2OffsetImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert(CE && "non-constant AM2OffsetImm operand!");
+ int32_t Val = CE->getValue();
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == INT32_MIN) Val = 0;
+ if (Val < 0) Val = -Val;
+ Val = ARM_AM::getAM2Opc(AddSub, Val, ARM_AM::no_shift);
+ Inst.addOperand(MCOperand::CreateReg(0));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- if (getMemOffsetRegShifted()) {
- ShOpc = getMemShiftType();
- const MCConstantExpr *CE =
- dyn_cast<MCConstantExpr>(getMemShiftAmount());
- ShiftAmount = CE->getValue();
- }
+ void addAddrMode3Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ if (!Memory.OffsetRegNum) {
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == INT32_MIN) Val = 0;
+ if (Val < 0) Val = -Val;
+ Val = ARM_AM::getAM3Opc(AddSub, Val);
+ } else {
+ // For register offset, we encode the shift type and negation flag
+ // here.
+ Val = ARM_AM::getAM3Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add, 0);
+ }
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(AMOpc, ShiftAmount,
- ShOpc, IdxMode)));
+ void addAM3OffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ if (Kind == k_PostIndexRegister) {
+ int32_t Val =
+ ARM_AM::getAM3Opc(PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub, 0);
+ Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
return;
}
- // Create a operand placeholder to always yield the same number of operands.
+ // Constant offset.
+ const MCConstantExpr *CE = static_cast<const MCConstantExpr*>(getImm());
+ int32_t Val = CE->getValue();
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == INT32_MIN) Val = 0;
+ if (Val < 0) Val = -Val;
+ Val = ARM_AM::getAM3Opc(AddSub, Val);
Inst.addOperand(MCOperand::CreateReg(0));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- // FIXME: #-0 is encoded differently than #0. Does the parser preserve
- // the difference?
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
- assert(CE && "Non-constant mode 2 offset operand!");
- int64_t Offset = CE->getValue();
+ void addAddrMode5Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ // The lower two bits are always zero and as such are not encoded.
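+ // For example, an offset of -8 is emitted as sub with imm8 == 2.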
+ int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+ ARM_AM::AddrOpc AddSub = Val < 0 ? ARM_AM::sub : ARM_AM::add;
+ // Special case for #-0
+ if (Val == INT32_MIN) Val = 0;
+ if (Val < 0) Val = -Val;
+ Val = ARM_AM::getAM5Opc(AddSub, Val);
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- if (Offset >= 0)
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::add,
- Offset, ARM_AM::no_shift, IdxMode)));
- else
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM2Opc(ARM_AM::sub,
- -Offset, ARM_AM::no_shift, IdxMode)));
+ void addMemImm8s4OffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
+
+ void addMemImm0_1020s4OffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ // The lower two bits are always zero and as such are not encoded.
+ int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() / 4 : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
+
+ void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
+
+ void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+ addMemImm8OffsetOperands(Inst, N);
+ }
+
+ void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+ addMemImm8OffsetOperands(Inst, N);
}
- void addMemMode3Operands(MCInst &Inst, unsigned N) const {
- assert(isMemMode3() && "Invalid mode or number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
- unsigned IdxMode = (getMemPreindexed() | getMemPostindexed() << 1);
+ void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ // If this is an immediate, it's a label reference.
+ if (Kind == k_Immediate) {
+ addExpr(Inst, getImm());
+ Inst.addOperand(MCOperand::CreateImm(0));
+ return;
+ }
- if (getMemOffsetIsReg()) {
- Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum()));
+ // Otherwise, it's a normal memory reg+offset.
+ int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- ARM_AM::AddrOpc AMOpc = getMemNegative() ? ARM_AM::sub : ARM_AM::add;
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(AMOpc, 0,
- IdxMode)));
+ void addMemImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ // If this is an immediate, it's a label reference.
+ if (Kind == k_Immediate) {
+ addExpr(Inst, getImm());
+ Inst.addOperand(MCOperand::CreateImm(0));
return;
}
- // Create a operand placeholder to always yield the same number of operands.
- Inst.addOperand(MCOperand::CreateReg(0));
+ // Otherwise, it's a normal memory reg+offset.
+ int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- // FIXME: #-0 is encoded differently than #0. Does the parser preserve
- // the difference?
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
- assert(CE && "Non-constant mode 3 offset operand!");
- int64_t Offset = CE->getValue();
+ void addMemTBBOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+ }
- if (Offset >= 0)
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::add,
- Offset, IdxMode)));
- else
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM3Opc(ARM_AM::sub,
- -Offset, IdxMode)));
+ void addMemTBHOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
}
- void addMemMode5Operands(MCInst &Inst, unsigned N) const {
- assert(N == 2 && isMemMode5() && "Invalid number of operands!");
+ void addMemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ unsigned Val = ARM_AM::getAM2Opc(Memory.isNegative ? ARM_AM::sub : ARM_AM::add,
+ Memory.ShiftImm, Memory.ShiftType);
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
- Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
- assert(!getMemOffsetIsReg() && "Invalid mode 5 operand");
+ void addT2MemRegOffsetOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Memory.ShiftImm));
+ }
- // FIXME: #-0 is encoded differently than #0. Does the parser preserve
- // the difference?
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
- assert(CE && "Non-constant mode 5 offset operand!");
+ void addMemThumbRROperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateReg(Memory.OffsetRegNum));
+ }
- // The MCInst offset operand doesn't include the low two bits (like
- // the instruction encoding).
- int64_t Offset = CE->getValue() / 4;
- if (Offset >= 0)
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add,
- Offset)));
- else
- Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub,
- -Offset)));
+ void addMemThumbRIs4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
}
- void addMemModeRegThumbOperands(MCInst &Inst, unsigned N) const {
- assert(N == 2 && isMemModeRegThumb() && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
- Inst.addOperand(MCOperand::CreateReg(getMemOffsetRegNum()));
+ void addMemThumbRIs2Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 2) : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
}
- void addMemModeImmThumbOperands(MCInst &Inst, unsigned N) const {
- assert(N == 2 && isMemModeImmThumb() && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(getMemBaseRegNum()));
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemOffset());
- assert(CE && "Non-constant mode offset operand!");
- Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ void addMemThumbRIs1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue()) : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
+
+ void addMemThumbSPIOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ int64_t Val = Memory.OffsetImm ? (Memory.OffsetImm->getValue() / 4) : 0;
+ Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ }
+
+ void addPostIdxImm8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert(CE && "non-constant post-idx-imm8 operand!");
+ int Imm = CE->getValue();
+ bool isAdd = Imm >= 0;
+ if (Imm == INT32_MIN) Imm = 0;
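+ // The low 8 bits hold the magnitude; bit 8 is set for add, clear for sub.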
+ Imm = (Imm < 0 ? -Imm : Imm) | (int)isAdd << 8;
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
+ void addPostIdxImm8s4Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ assert(CE && "non-constant post-idx-imm8s4 operand!");
+ int Imm = CE->getValue();
+ bool isAdd = Imm >= 0;
+ if (Imm == INT32_MIN) Imm = 0;
+ // Immediate is scaled by 4.
+ Imm = ((Imm < 0 ? -Imm : Imm) / 4) | (int)isAdd << 8;
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
+ void addPostIdxRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+ Inst.addOperand(MCOperand::CreateImm(PostIdxReg.isAdd));
+ }
+
+ void addPostIdxRegShiftedOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(PostIdxReg.RegNum));
+ // The sign, shift type, and shift amount are encoded in a single operand
+ // using the AM2 encoding helpers.
+ ARM_AM::AddrOpc opc = PostIdxReg.isAdd ? ARM_AM::add : ARM_AM::sub;
+ unsigned Imm = ARM_AM::getAM2Opc(opc, PostIdxReg.ShiftImm,
+ PostIdxReg.ShiftTy);
+ Inst.addOperand(MCOperand::CreateImm(Imm));
}
void addMSRMaskOperands(MCInst &Inst, unsigned N) const {
@@ -751,10 +1420,33 @@ public:
Inst.addOperand(MCOperand::CreateImm(unsigned(getProcIFlags())));
}
+ void addVectorIndex8Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndex16Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
+ void addVectorIndex32Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
+ }
+
virtual void print(raw_ostream &OS) const;
+ static ARMOperand *CreateITMask(unsigned Mask, SMLoc S) {
+ ARMOperand *Op = new ARMOperand(k_ITCondMask);
+ Op->ITMask.Mask = Mask;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
static ARMOperand *CreateCondCode(ARMCC::CondCodes CC, SMLoc S) {
- ARMOperand *Op = new ARMOperand(CondCode);
+ ARMOperand *Op = new ARMOperand(k_CondCode);
Op->CC.Val = CC;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -762,7 +1454,7 @@ public:
}
static ARMOperand *CreateCoprocNum(unsigned CopVal, SMLoc S) {
- ARMOperand *Op = new ARMOperand(CoprocNum);
+ ARMOperand *Op = new ARMOperand(k_CoprocNum);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -770,15 +1462,23 @@ public:
}
static ARMOperand *CreateCoprocReg(unsigned CopVal, SMLoc S) {
- ARMOperand *Op = new ARMOperand(CoprocReg);
+ ARMOperand *Op = new ARMOperand(k_CoprocReg);
Op->Cop.Val = CopVal;
Op->StartLoc = S;
Op->EndLoc = S;
return Op;
}
+ static ARMOperand *CreateCoprocOption(unsigned Val, SMLoc S, SMLoc E) {
+ ARMOperand *Op = new ARMOperand(k_CoprocOption);
+ Op->Cop.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
static ARMOperand *CreateCCOut(unsigned RegNum, SMLoc S) {
- ARMOperand *Op = new ARMOperand(CCOut);
+ ARMOperand *Op = new ARMOperand(k_CCOut);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -786,7 +1486,7 @@ public:
}
static ARMOperand *CreateToken(StringRef Str, SMLoc S) {
- ARMOperand *Op = new ARMOperand(Token);
+ ARMOperand *Op = new ARMOperand(k_Token);
Op->Tok.Data = Str.data();
Op->Tok.Length = Str.size();
Op->StartLoc = S;
@@ -795,7 +1495,7 @@ public:
}
static ARMOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(Register);
+ ARMOperand *Op = new ARMOperand(k_Register);
Op->Reg.RegNum = RegNum;
Op->StartLoc = S;
Op->EndLoc = E;
@@ -807,20 +1507,52 @@ public:
unsigned ShiftReg,
unsigned ShiftImm,
SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(ShiftedRegister);
- Op->ShiftedReg.ShiftTy = ShTy;
- Op->ShiftedReg.SrcReg = SrcReg;
- Op->ShiftedReg.ShiftReg = ShiftReg;
- Op->ShiftedReg.ShiftImm = ShiftImm;
+ ARMOperand *Op = new ARMOperand(k_ShiftedRegister);
+ Op->RegShiftedReg.ShiftTy = ShTy;
+ Op->RegShiftedReg.SrcReg = SrcReg;
+ Op->RegShiftedReg.ShiftReg = ShiftReg;
+ Op->RegShiftedReg.ShiftImm = ShiftImm;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARMOperand *CreateShiftedImmediate(ARM_AM::ShiftOpc ShTy,
+ unsigned SrcReg,
+ unsigned ShiftImm,
+ SMLoc S, SMLoc E) {
+ ARMOperand *Op = new ARMOperand(k_ShiftedImmediate);
+ Op->RegShiftedImm.ShiftTy = ShTy;
+ Op->RegShiftedImm.SrcReg = SrcReg;
+ Op->RegShiftedImm.ShiftImm = ShiftImm;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static ARMOperand *CreateShifter(ARM_AM::ShiftOpc ShTy,
+ static ARMOperand *CreateShifterImm(bool isASR, unsigned Imm,
SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(Shifter);
- Op->Shift.ShiftTy = ShTy;
+ ARMOperand *Op = new ARMOperand(k_ShifterImmediate);
+ Op->ShifterImm.isASR = isASR;
+ Op->ShifterImm.Imm = Imm;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARMOperand *CreateRotImm(unsigned Imm, SMLoc S, SMLoc E) {
+ ARMOperand *Op = new ARMOperand(k_RotateImmediate);
+ Op->RotImm.Imm = Imm;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static ARMOperand *CreateBitfield(unsigned LSB, unsigned Width,
+ SMLoc S, SMLoc E) {
+ ARMOperand *Op = new ARMOperand(k_BitfieldDescriptor);
+ Op->Bitfield.LSB = LSB;
+ Op->Bitfield.Width = Width;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
@@ -829,12 +1561,13 @@ public:
static ARMOperand *
CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
SMLoc StartLoc, SMLoc EndLoc) {
- KindTy Kind = RegisterList;
+ KindTy Kind = k_RegisterList;
- if (ARM::DPRRegClass.contains(Regs.front().first))
- Kind = DPRRegisterList;
- else if (ARM::SPRRegClass.contains(Regs.front().first))
- Kind = SPRRegisterList;
+ if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().first))
+ Kind = k_DPRRegisterList;
+ else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
+ contains(Regs.front().first))
+ Kind = k_SPRRegisterList;
ARMOperand *Op = new ARMOperand(Kind);
for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
@@ -846,55 +1579,68 @@ public:
return Op;
}
+ static ARMOperand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
+ MCContext &Ctx) {
+ ARMOperand *Op = new ARMOperand(k_VectorIndex);
+ Op->VectorIndex.Val = Idx;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
static ARMOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
- ARMOperand *Op = new ARMOperand(Immediate);
+ ARMOperand *Op = new ARMOperand(k_Immediate);
Op->Imm.Val = Val;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
- static ARMOperand *CreateMem(ARMII::AddrMode AddrMode, unsigned BaseRegNum,
- bool OffsetIsReg, const MCExpr *Offset,
- int OffsetRegNum, bool OffsetRegShifted,
- enum ARM_AM::ShiftOpc ShiftType,
- const MCExpr *ShiftAmount, bool Preindexed,
- bool Postindexed, bool Negative, bool Writeback,
+ static ARMOperand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) {
+ ARMOperand *Op = new ARMOperand(k_FPImmediate);
+ Op->FPImm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static ARMOperand *CreateMem(unsigned BaseRegNum,
+ const MCConstantExpr *OffsetImm,
+ unsigned OffsetRegNum,
+ ARM_AM::ShiftOpc ShiftType,
+ unsigned ShiftImm,
+ unsigned Alignment,
+ bool isNegative,
SMLoc S, SMLoc E) {
- assert((OffsetRegNum == -1 || OffsetIsReg) &&
- "OffsetRegNum must imply OffsetIsReg!");
- assert((!OffsetRegShifted || OffsetIsReg) &&
- "OffsetRegShifted must imply OffsetIsReg!");
- assert((Offset || OffsetIsReg) &&
- "Offset must exists unless register offset is used!");
- assert((!ShiftAmount || (OffsetIsReg && OffsetRegShifted)) &&
- "Cannot have shift amount without shifted register offset!");
- assert((!Offset || !OffsetIsReg) &&
- "Cannot have expression offset and register offset!");
-
- ARMOperand *Op = new ARMOperand(Memory);
- Op->Mem.AddrMode = AddrMode;
- Op->Mem.BaseRegNum = BaseRegNum;
- Op->Mem.OffsetIsReg = OffsetIsReg;
- if (OffsetIsReg)
- Op->Mem.Offset.RegNum = OffsetRegNum;
- else
- Op->Mem.Offset.Value = Offset;
- Op->Mem.OffsetRegShifted = OffsetRegShifted;
- Op->Mem.ShiftType = ShiftType;
- Op->Mem.ShiftAmount = ShiftAmount;
- Op->Mem.Preindexed = Preindexed;
- Op->Mem.Postindexed = Postindexed;
- Op->Mem.Negative = Negative;
- Op->Mem.Writeback = Writeback;
+ ARMOperand *Op = new ARMOperand(k_Memory);
+ Op->Memory.BaseRegNum = BaseRegNum;
+ Op->Memory.OffsetImm = OffsetImm;
+ Op->Memory.OffsetRegNum = OffsetRegNum;
+ Op->Memory.ShiftType = ShiftType;
+ Op->Memory.ShiftImm = ShiftImm;
+ Op->Memory.Alignment = Alignment;
+ Op->Memory.isNegative = isNegative;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+ static ARMOperand *CreatePostIdxReg(unsigned RegNum, bool isAdd,
+ ARM_AM::ShiftOpc ShiftTy,
+ unsigned ShiftImm,
+ SMLoc S, SMLoc E) {
+ ARMOperand *Op = new ARMOperand(k_PostIndexRegister);
+ Op->PostIdxReg.RegNum = RegNum;
+ Op->PostIdxReg.isAdd = isAdd;
+ Op->PostIdxReg.ShiftTy = ShiftTy;
+ Op->PostIdxReg.ShiftImm = ShiftImm;
Op->StartLoc = S;
Op->EndLoc = E;
return Op;
}
static ARMOperand *CreateMemBarrierOpt(ARM_MB::MemBOpt Opt, SMLoc S) {
- ARMOperand *Op = new ARMOperand(MemBarrierOpt);
+ ARMOperand *Op = new ARMOperand(k_MemBarrierOpt);
Op->MBOpt.Val = Opt;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -902,7 +1648,7 @@ public:
}
static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) {
- ARMOperand *Op = new ARMOperand(ProcIFlags);
+ ARMOperand *Op = new ARMOperand(k_ProcIFlags);
Op->IFlags.Val = IFlags;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -910,7 +1656,7 @@ public:
}
static ARMOperand *CreateMSRMask(unsigned MMask, SMLoc S) {
- ARMOperand *Op = new ARMOperand(MSRMask);
+ ARMOperand *Op = new ARMOperand(k_MSRMask);
Op->MMask.Val = MMask;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -922,53 +1668,56 @@ public:
void ARMOperand::print(raw_ostream &OS) const {
switch (Kind) {
- case CondCode:
+ case k_FPImmediate:
+ OS << "<fpimm " << getFPImm() << "(" << ARM_AM::getFPImmFloat(getFPImm())
+ << ") >";
+ break;
+ case k_CondCode:
OS << "<ARMCC::" << ARMCondCodeToString(getCondCode()) << ">";
break;
- case CCOut:
+ case k_CCOut:
OS << "<ccout " << getReg() << ">";
break;
- case CoprocNum:
+ case k_ITCondMask: {
+ static char MaskStr[][6] = { "()", "(t)", "(e)", "(tt)", "(et)", "(te)",
+ "(ee)", "(ttt)", "(ett)", "(tet)", "(eet)", "(tte)", "(ete)",
+ "(tee)", "(eee)" };
+ assert((ITMask.Mask & 0xf) == ITMask.Mask);
+ OS << "<it-mask " << MaskStr[ITMask.Mask] << ">";
+ break;
+ }
+ case k_CoprocNum:
OS << "<coprocessor number: " << getCoproc() << ">";
break;
- case CoprocReg:
+ case k_CoprocReg:
OS << "<coprocessor register: " << getCoproc() << ">";
break;
- case MSRMask:
+ case k_CoprocOption:
+ OS << "<coprocessor option: " << CoprocOption.Val << ">";
+ break;
+ case k_MSRMask:
OS << "<mask: " << getMSRMask() << ">";
break;
- case Immediate:
+ case k_Immediate:
getImm()->print(OS);
break;
- case MemBarrierOpt:
+ case k_MemBarrierOpt:
OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
break;
- case Memory:
+ case k_Memory:
OS << "<memory "
- << "am:" << ARMII::AddrModeToString(getMemAddrMode())
- << " base:" << getMemBaseRegNum();
- if (getMemOffsetIsReg()) {
- OS << " offset:<register " << getMemOffsetRegNum();
- if (getMemOffsetRegShifted()) {
- OS << " offset-shift-type:" << getMemShiftType();
- OS << " offset-shift-amount:" << *getMemShiftAmount();
- }
- } else {
- OS << " offset:" << *getMemOffset();
- }
- if (getMemOffsetIsReg())
- OS << " (offset-is-reg)";
- if (getMemPreindexed())
- OS << " (pre-indexed)";
- if (getMemPostindexed())
- OS << " (post-indexed)";
- if (getMemNegative())
- OS << " (negative)";
- if (getMemWriteback())
- OS << " (writeback)";
+ << " base:" << Memory.BaseRegNum;
+ OS << ">";
+ break;
+ case k_PostIndexRegister:
+ OS << "post-idx register " << (PostIdxReg.isAdd ? "" : "-")
+ << PostIdxReg.RegNum;
+ if (PostIdxReg.ShiftTy != ARM_AM::no_shift)
+ OS << ARM_AM::getShiftOpcStr(PostIdxReg.ShiftTy) << " "
+ << PostIdxReg.ShiftImm;
OS << ">";
break;
- case ProcIFlags: {
+ case k_ProcIFlags: {
OS << "<ARM_PROC::";
unsigned IFlags = getProcIFlags();
for (int i=2; i >= 0; --i)
@@ -977,23 +1726,38 @@ void ARMOperand::print(raw_ostream &OS) const {
OS << ">";
break;
}
- case Register:
+ case k_Register:
OS << "<register " << getReg() << ">";
break;
- case Shifter:
- OS << "<shifter " << ARM_AM::getShiftOpcStr(Shift.ShiftTy) << ">";
+ case k_ShifterImmediate:
+ OS << "<shift " << (ShifterImm.isASR ? "asr" : "lsl")
+ << " #" << ShifterImm.Imm << ">";
+ break;
+ case k_ShiftedRegister:
+ OS << "<so_reg_reg "
+ << RegShiftedReg.SrcReg
+ << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedReg.ShiftImm))
+ << ", " << RegShiftedReg.ShiftReg << ", "
+ << ARM_AM::getSORegOffset(RegShiftedReg.ShiftImm)
+ << ">";
break;
- case ShiftedRegister:
- OS << "<so_reg"
- << ShiftedReg.SrcReg
- << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(ShiftedReg.ShiftImm))
- << ", " << ShiftedReg.ShiftReg << ", "
- << ARM_AM::getSORegOffset(ShiftedReg.ShiftImm)
+ case k_ShiftedImmediate:
+ OS << "<so_reg_imm "
+ << RegShiftedImm.SrcReg
+ << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedImm.ShiftImm))
+ << ", " << ARM_AM::getSORegOffset(RegShiftedImm.ShiftImm)
<< ">";
break;
- case RegisterList:
- case DPRRegisterList:
- case SPRRegisterList: {
+ case k_RotateImmediate:
+ OS << "<ror " << " #" << (RotImm.Imm * 8) << ">";
+ break;
+ case k_BitfieldDescriptor:
+ OS << "<bitfield " << "lsb: " << Bitfield.LSB
+ << ", width: " << Bitfield.Width << ">";
+ break;
+ case k_RegisterList:
+ case k_DPRRegisterList:
+ case k_SPRRegisterList: {
OS << "<register_list ";
const SmallVectorImpl<unsigned> &RegList = getRegList();
@@ -1006,9 +1770,12 @@ void ARMOperand::print(raw_ostream &OS) const {
OS << ">";
break;
}
- case Token:
+ case k_Token:
OS << "'" << getToken() << "'";
break;
+ case k_VectorIndex:
+ OS << "<vectorindex " << getVectorIndex() << ">";
+ break;
}
}
@@ -1021,7 +1788,7 @@ static unsigned MatchRegisterName(StringRef Name);
bool ARMAsmParser::ParseRegister(unsigned &RegNo,
SMLoc &StartLoc, SMLoc &EndLoc) {
- RegNo = TryParseRegister();
+ RegNo = tryParseRegister();
return (RegNo == (unsigned)-1);
}
@@ -1030,9 +1797,9 @@ bool ARMAsmParser::ParseRegister(unsigned &RegNo,
/// and if it is a register name the token is eaten and the register number is
/// returned. Otherwise return -1.
///
-int ARMAsmParser::TryParseRegister() {
+int ARMAsmParser::tryParseRegister() {
const AsmToken &Tok = Parser.getTok();
- assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+ if (Tok.isNot(AsmToken::Identifier)) return -1;
// FIXME: Validate register for the current architecture; we have to do
// validation later, so maybe there is no need for this here.
@@ -1050,6 +1817,39 @@ int ARMAsmParser::TryParseRegister() {
if (!RegNum) return -1;
Parser.Lex(); // Eat identifier token.
+
+#if 0
+ // Also check for an index operand. This is only legal for vector registers,
+ // but that'll get caught OK in operand matching, so we don't need to
+ // explicitly filter everything else out here.
+ if (Parser.getTok().is(AsmToken::LBrac)) {
+ SMLoc SIdx = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat left bracket token.
+
+ const MCExpr *ImmVal;
+ SMLoc ExprLoc = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = Parser.getTok().getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ Error(E, "']' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+ SIdx, E,
+ getContext()));
+ }
+#endif
+
return RegNum;
}
@@ -1058,7 +1858,7 @@ int ARMAsmParser::TryParseRegister() {
// occurs, return -1. An irrecoverable error is one where tokens have been
// consumed in the process of trying to parse the shifter (i.e., when it is
// indeed a shifter operand, but malformed).
-int ARMAsmParser::TryParseShiftRegister(
+int ARMAsmParser::tryParseShiftRegister(
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
@@ -1120,7 +1920,7 @@ int ARMAsmParser::TryParseShiftRegister(
return -1;
}
} else if (Parser.getTok().is(AsmToken::Identifier)) {
- ShiftReg = TryParseRegister();
+ ShiftReg = tryParseRegister();
SMLoc L = Parser.getTok().getLoc();
if (ShiftReg == -1) {
Error (L, "expected immediate or register in shift operand");
@@ -1133,8 +1933,12 @@ int ARMAsmParser::TryParseShiftRegister(
}
}
- Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
- ShiftReg, Imm,
+ if (ShiftReg && ShiftTy != ARM_AM::rrx)
+ Operands.push_back(ARMOperand::CreateShiftedRegister(ShiftTy, SrcReg,
+ ShiftReg, Imm,
+ S, Parser.getTok().getLoc()));
+ else
+ Operands.push_back(ARMOperand::CreateShiftedImmediate(ShiftTy, SrcReg, Imm,
S, Parser.getTok().getLoc()));
return 0;
@@ -1148,9 +1952,9 @@ int ARMAsmParser::TryParseShiftRegister(
/// TODO this is likely to change to allow different register types and or to
/// parse for a specific register type.
bool ARMAsmParser::
-TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+tryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
- int RegNo = TryParseRegister();
+ int RegNo = tryParseRegister();
if (RegNo == -1)
return true;
@@ -1161,6 +1965,37 @@ TryParseRegisterWithWriteBack(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
Operands.push_back(ARMOperand::CreateToken(ExclaimTok.getString(),
ExclaimTok.getLoc()));
Parser.Lex(); // Eat exclaim token
+ return false;
+ }
+
+ // Also check for an index operand. This is only legal for vector registers,
+ // but that'll get caught OK in operand matching, so we don't need to
+ // explicitly filter everything else out here.
+ if (Parser.getTok().is(AsmToken::LBrac)) {
+ SMLoc SIdx = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat left bracket token.
+
+ const MCExpr *ImmVal;
+ SMLoc ExprLoc = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(ImmVal))
+ return MatchOperand_ParseFail;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!MCE) {
+ TokError("immediate value expected for vector index");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = Parser.getTok().getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac)) {
+ Error(E, "']' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(ARMOperand::CreateVectorIndex(MCE->getValue(),
+ SIdx, E,
+ getContext()));
}
return false;
@@ -1209,14 +2044,50 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
return -1;
}
-/// tryParseCoprocNumOperand - Try to parse an coprocessor number operand. The
+/// parseITCondCode - Try to parse a condition code for an IT instruction.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseITCondCode(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ const AsmToken &Tok = Parser.getTok();
+ if (!Tok.is(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+ unsigned CC = StringSwitch<unsigned>(Tok.getString())
+ .Case("eq", ARMCC::EQ)
+ .Case("ne", ARMCC::NE)
+ .Case("hs", ARMCC::HS)
+ .Case("cs", ARMCC::HS)
+ .Case("lo", ARMCC::LO)
+ .Case("cc", ARMCC::LO)
+ .Case("mi", ARMCC::MI)
+ .Case("pl", ARMCC::PL)
+ .Case("vs", ARMCC::VS)
+ .Case("vc", ARMCC::VC)
+ .Case("hi", ARMCC::HI)
+ .Case("ls", ARMCC::LS)
+ .Case("ge", ARMCC::GE)
+ .Case("lt", ARMCC::LT)
+ .Case("gt", ARMCC::GT)
+ .Case("le", ARMCC::LE)
+ .Case("al", ARMCC::AL)
+ .Default(~0U);
+ if (CC == ~0U)
+ return MatchOperand_NoMatch;
+ Parser.Lex(); // Eat the token.
+
+ Operands.push_back(ARMOperand::CreateCondCode(ARMCC::CondCodes(CC), S));
+
+ return MatchOperand_Success;
+}
+
+/// parseCoprocNumOperand - Try to parse a coprocessor number operand. The
/// token must be an Identifier when called, and if it is a coprocessor
/// number, the token is eaten and the operand is added to the operand list.
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
- assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
int Num = MatchCoprocessorOperandName(Tok.getString(), 'p');
if (Num == -1)
@@ -1227,14 +2098,15 @@ tryParseCoprocNumOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-/// tryParseCoprocRegOperand - Try to parse an coprocessor register operand. The
+/// parseCoprocRegOperand - Try to parse a coprocessor register operand. The
/// token must be an Identifier when called, and if it is a coprocessor
/// number, the token is eaten and the operand is added to the operand list.
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
- assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
int Reg = MatchCoprocessorOperandName(Tok.getString(), 'c');
if (Reg == -1)
@@ -1245,93 +2117,155 @@ tryParseCoprocRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-/// Parse a register list, return it if successful else return null. The first
-/// token must be a '{' when called.
-bool ARMAsmParser::
-ParseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- assert(Parser.getTok().is(AsmToken::LCurly) &&
- "Token is not a Left Curly Brace");
+/// parseCoprocOptionOperand - Try to parse a coprocessor option operand.
+/// coproc_option : '{' imm0_255 '}'
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseCoprocOptionOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
- // Read the rest of the registers in the list.
- unsigned PrevRegNum = 0;
- SmallVector<std::pair<unsigned, SMLoc>, 32> Registers;
-
- do {
- bool IsRange = Parser.getTok().is(AsmToken::Minus);
- Parser.Lex(); // Eat non-identifier token.
-
- const AsmToken &RegTok = Parser.getTok();
- SMLoc RegLoc = RegTok.getLoc();
- if (RegTok.isNot(AsmToken::Identifier)) {
- Error(RegLoc, "register expected");
- return true;
- }
-
- int RegNum = TryParseRegister();
- if (RegNum == -1) {
- Error(RegLoc, "register expected");
- return true;
- }
-
- if (IsRange) {
- int Reg = PrevRegNum;
- do {
- ++Reg;
- Registers.push_back(std::make_pair(Reg, RegLoc));
- } while (Reg != RegNum);
- } else {
- Registers.push_back(std::make_pair(RegNum, RegLoc));
- }
-
- PrevRegNum = RegNum;
- } while (Parser.getTok().is(AsmToken::Comma) ||
- Parser.getTok().is(AsmToken::Minus));
+ // If this isn't a '{', this isn't a coprocessor option operand.
+ if (Parser.getTok().isNot(AsmToken::LCurly))
+ return MatchOperand_NoMatch;
+ Parser.Lex(); // Eat the '{'
- // Process the right curly brace of the list.
- const AsmToken &RCurlyTok = Parser.getTok();
- if (RCurlyTok.isNot(AsmToken::RCurly)) {
- Error(RCurlyTok.getLoc(), "'}' expected");
- return true;
+ const MCExpr *Expr;
+ SMLoc Loc = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(Expr)) {
+ Error(Loc, "illegal expression");
+ return MatchOperand_ParseFail;
}
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+ if (!CE || CE->getValue() < 0 || CE->getValue() > 255) {
+ Error(Loc, "coprocessor option must be an immediate in range [0, 255]");
+ return MatchOperand_ParseFail;
+ }
+ int Val = CE->getValue();
- SMLoc E = RCurlyTok.getLoc();
- Parser.Lex(); // Eat right curly brace token.
-
- // Verify the register list.
- SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
- RI = Registers.begin(), RE = Registers.end();
+ // Check for and consume the closing '}'
+ if (Parser.getTok().isNot(AsmToken::RCurly))
+ return MatchOperand_ParseFail;
+ SMLoc E = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat the '}'
- unsigned HighRegNum = getARMRegisterNumbering(RI->first);
- bool EmittedWarning = false;
+ Operands.push_back(ARMOperand::CreateCoprocOption(Val, S, E));
+ return MatchOperand_Success;
+}
- DenseMap<unsigned, bool> RegMap;
- RegMap[HighRegNum] = true;
+// For register list parsing, we need to map from raw GPR register numbering
+// to the enumeration values. The enumeration values aren't sorted by
+// register number due to our using "sp", "lr" and "pc" as canonical names.
+static unsigned getNextRegister(unsigned Reg) {
+ // If this is a GPR, we need to do it manually; otherwise we can rely
+ // on the sort ordering of the enumeration since the other reg-classes
+ // are sane.
+ if (!ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+ return Reg + 1;
+ switch(Reg) {
+ default: assert(0 && "Invalid GPR number!");
+ case ARM::R0: return ARM::R1; case ARM::R1: return ARM::R2;
+ case ARM::R2: return ARM::R3; case ARM::R3: return ARM::R4;
+ case ARM::R4: return ARM::R5; case ARM::R5: return ARM::R6;
+ case ARM::R6: return ARM::R7; case ARM::R7: return ARM::R8;
+ case ARM::R8: return ARM::R9; case ARM::R9: return ARM::R10;
+ case ARM::R10: return ARM::R11; case ARM::R11: return ARM::R12;
+ case ARM::R12: return ARM::SP; case ARM::SP: return ARM::LR;
+ case ARM::LR: return ARM::PC; case ARM::PC: return ARM::R0;
+ }
+}
- for (++RI; RI != RE; ++RI) {
- const std::pair<unsigned, SMLoc> &RegInfo = *RI;
- unsigned Reg = getARMRegisterNumbering(RegInfo.first);
+/// Parse a register list.
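+/// E.g. "{r0, r4-r7, lr}" as used by LDM/STM and PUSH/POP (illustrative example).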
+bool ARMAsmParser::
+parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ assert(Parser.getTok().is(AsmToken::LCurly) &&
+ "Token is not a Left Curly Brace");
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat '{' token.
+ SMLoc RegLoc = Parser.getTok().getLoc();
- if (RegMap[Reg]) {
- Error(RegInfo.second, "register duplicated in register list");
- return true;
+ // Check the first register in the list to see what register class
+ // this is a list of.
+ int Reg = tryParseRegister();
+ if (Reg == -1)
+ return Error(RegLoc, "register expected");
+
+ MCRegisterClass *RC;
+ if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg))
+ RC = &ARMMCRegisterClasses[ARM::GPRRegClassID];
+ else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg))
+ RC = &ARMMCRegisterClasses[ARM::DPRRegClassID];
+ else if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg))
+ RC = &ARMMCRegisterClasses[ARM::SPRRegClassID];
+ else
+ return Error(RegLoc, "invalid register in register list");
+
+ // The reglist instructions have at most 16 registers, so reserve
+ // space for that many.
+ SmallVector<std::pair<unsigned, SMLoc>, 16> Registers;
+ // Store the first register.
+ Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+
+ // This starts immediately after the first register token in the list,
+ // so we can see either a comma or a minus (range separator) as a legal
+ // next token.
+ while (Parser.getTok().is(AsmToken::Comma) ||
+ Parser.getTok().is(AsmToken::Minus)) {
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ Parser.Lex(); // Eat the minus.
+ SMLoc EndLoc = Parser.getTok().getLoc();
+ int EndReg = tryParseRegister();
+ if (EndReg == -1)
+ return Error(EndLoc, "register expected");
+ // If the register is the same as the start reg, there's nothing
+ // more to do.
+ if (Reg == EndReg)
+ continue;
+ // The register must be in the same register class as the first.
+ if (!RC->contains(EndReg))
+ return Error(EndLoc, "invalid register in register list");
+ // Ranges must go from low to high.
+ if (getARMRegisterNumbering(Reg) > getARMRegisterNumbering(EndReg))
+ return Error(EndLoc, "bad range in register list");
+
+ // Add all the registers in the range to the register list.
+ while (Reg != EndReg) {
+ Reg = getNextRegister(Reg);
+ Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+ }
+ continue;
}
-
- if (!EmittedWarning && Reg < HighRegNum)
- Warning(RegInfo.second,
- "register not in ascending order in register list");
-
- RegMap[Reg] = true;
- HighRegNum = std::max(Reg, HighRegNum);
+ Parser.Lex(); // Eat the comma.
+ RegLoc = Parser.getTok().getLoc();
+ int OldReg = Reg;
+ Reg = tryParseRegister();
+ if (Reg == -1)
+ return Error(RegLoc, "register expected");
+ // The register must be in the same register class as the first.
+ if (!RC->contains(Reg))
+ return Error(RegLoc, "invalid register in register list");
+ // List must be monotonically increasing.
+ if (getARMRegisterNumbering(Reg) <= getARMRegisterNumbering(OldReg))
+ return Error(RegLoc, "register list not in ascending order");
+ // VFP register lists must also be contiguous.
+ // It's OK to use the enumeration values directly here, as the
+ // VFP register classes have the enum sorted properly.
+ if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
+ Reg != OldReg + 1)
+ return Error(RegLoc, "non-contiguous register range");
+ Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
}
+ SMLoc E = Parser.getTok().getLoc();
+ if (Parser.getTok().isNot(AsmToken::RCurly))
+ return Error(E, "'}' expected");
+ Parser.Lex(); // Eat '}' token.
+
Operands.push_back(ARMOperand::CreateRegList(Registers, S, E));
return false;
}
-/// tryParseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
+/// parseMemBarrierOptOperand - Try to parse DSB/DMB data barrier options.
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
@@ -1360,28 +2294,32 @@ tryParseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-/// tryParseProcIFlagsOperand - Try to parse iflags from CPS instruction.
+/// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
StringRef IFlagsStr = Tok.getString();
+ // An iflags string of "none" is interpreted to mean that none of the AIF
+ // bits are set. Not a terribly useful instruction, but a valid encoding.
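+ // E.g. the "if" in "cpsid if" selects the I and F bits (illustrative example).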
unsigned IFlags = 0;
- for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
- unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
- .Case("a", ARM_PROC::A)
- .Case("i", ARM_PROC::I)
- .Case("f", ARM_PROC::F)
- .Default(~0U);
-
- // If some specific iflag is already set, it means that some letter is
- // present more than once, this is not acceptable.
- if (Flag == ~0U || (IFlags & Flag))
- return MatchOperand_NoMatch;
+ if (IFlagsStr != "none") {
+ for (int i = 0, e = IFlagsStr.size(); i != e; ++i) {
+ unsigned Flag = StringSwitch<unsigned>(IFlagsStr.substr(i, 1))
+ .Case("a", ARM_PROC::A)
+ .Case("i", ARM_PROC::I)
+ .Case("f", ARM_PROC::F)
+ .Default(~0U);
+
+ // If some specific iflag is already set, it means that some letter is
+ // present more than once; this is not acceptable.
+ if (Flag == ~0U || (IFlags & Flag))
+ return MatchOperand_NoMatch;
- IFlags |= Flag;
+ IFlags |= Flag;
+ }
}
Parser.Lex(); // Eat identifier token.
@@ -1389,18 +2327,49 @@ tryParseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-/// tryParseMSRMaskOperand - Try to parse mask flags from MSR instruction.
+/// parseMSRMaskOperand - Try to parse mask flags from MSR instruction.
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
StringRef Mask = Tok.getString();
+ if (isMClass()) {
+ // See ARMv6-M 10.1.1
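+ // E.g. the "primask" in "msr primask, r0" (illustrative example).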
+ unsigned FlagsVal = StringSwitch<unsigned>(Mask)
+ .Case("apsr", 0)
+ .Case("iapsr", 1)
+ .Case("eapsr", 2)
+ .Case("xpsr", 3)
+ .Case("ipsr", 5)
+ .Case("epsr", 6)
+ .Case("iepsr", 7)
+ .Case("msp", 8)
+ .Case("psp", 9)
+ .Case("primask", 16)
+ .Case("basepri", 17)
+ .Case("basepri_max", 18)
+ .Case("faultmask", 19)
+ .Case("control", 20)
+ .Default(~0U);
+
+ if (FlagsVal == ~0U)
+ return MatchOperand_NoMatch;
+
+ if (!hasV7Ops() && FlagsVal >= 17 && FlagsVal <= 19)
+ // basepri, basepri_max and faultmask only valid for V7m.
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat identifier token.
+ Operands.push_back(ARMOperand::CreateMSRMask(FlagsVal, S));
+ return MatchOperand_Success;
+ }
+
// Split spec_reg from flag, example: CPSR_sxf => "CPSR" and "sxf"
size_t Start = 0, Next = Mask.find('_');
StringRef Flags = "";
- StringRef SpecReg = Mask.slice(Start, Next);
+ std::string SpecReg = LowercaseString(Mask.slice(Start, Next));
if (Next != StringRef::npos)
Flags = Mask.slice(Next+1, Mask.size());
@@ -1411,7 +2380,7 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
if (SpecReg == "apsr") {
FlagsVal = StringSwitch<unsigned>(Flags)
- .Case("nzcvq", 0x8) // same as CPSR_c
+ .Case("nzcvq", 0x8) // same as CPSR_f
.Case("g", 0x4) // same as CPSR_s
.Case("nzcvqg", 0xc) // same as CPSR_fs
.Default(~0U);
@@ -1420,7 +2389,7 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
if (!Flags.empty())
return MatchOperand_NoMatch;
else
- FlagsVal = 0; // No flag
+ FlagsVal = 8; // No flag
}
} else if (SpecReg == "cpsr" || SpecReg == "spsr") {
if (Flags == "all") // cpsr_all is an alias for cpsr_fc
@@ -1455,96 +2424,680 @@ tryParseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
return MatchOperand_Success;
}
-/// tryParseMemMode2Operand - Try to parse memory addressing mode 2 operand.
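+/// parsePKHImm - Parse the shift operand for PKHBT/PKHTB, e.g. the
+/// "lsl #8" in "pkhbt r0, r1, r2, lsl #8" (illustrative example).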
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMemMode2Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+parsePKHImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands, StringRef Op,
+ int Low, int High) {
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.isNot(AsmToken::Identifier)) {
+ Error(Parser.getTok().getLoc(), Op + " operand expected.");
+ return MatchOperand_ParseFail;
+ }
+ StringRef ShiftName = Tok.getString();
+ std::string LowerOp = LowercaseString(Op);
+ std::string UpperOp = UppercaseString(Op);
+ if (ShiftName != LowerOp && ShiftName != UpperOp) {
+ Error(Parser.getTok().getLoc(), Op + " operand expected.");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat shift type token.
- if (ParseMemory(Operands, ARMII::AddrMode2))
- return MatchOperand_NoMatch;
+ // There must be a '#' and a shift amount.
+ if (Parser.getTok().isNot(AsmToken::Hash)) {
+ Error(Parser.getTok().getLoc(), "'#' expected");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat hash token.
+
+ const MCExpr *ShiftAmount;
+ SMLoc Loc = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(ShiftAmount)) {
+ Error(Loc, "illegal expression");
+ return MatchOperand_ParseFail;
+ }
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+ if (!CE) {
+ Error(Loc, "constant expression expected");
+ return MatchOperand_ParseFail;
+ }
+ int Val = CE->getValue();
+ if (Val < Low || Val > High) {
+ Error(Loc, "immediate value out of range");
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(ARMOperand::CreateImm(CE, Loc, Parser.getTok().getLoc()));
return MatchOperand_Success;
}
-/// tryParseMemMode3Operand - Try to parse memory addressing mode 3 operand.
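+/// parseSetEndImm - Parse the endianness operand for SETEND, e.g. the
+/// "be" in "setend be" (illustrative example).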
ARMAsmParser::OperandMatchResultTy ARMAsmParser::
-tryParseMemMode3Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a \"[\"");
+parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ if (Tok.isNot(AsmToken::Identifier)) {
+ Error(Tok.getLoc(), "'be' or 'le' operand expected");
+ return MatchOperand_ParseFail;
+ }
+ int Val = StringSwitch<int>(Tok.getString())
+ .Case("be", 1)
+ .Case("le", 0)
+ .Default(-1);
+ Parser.Lex(); // Eat the token.
+
+ if (Val == -1) {
+ Error(Tok.getLoc(), "'be' or 'le' operand expected");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(ARMOperand::CreateImm(MCConstantExpr::Create(Val,
+ getContext()),
+ S, Parser.getTok().getLoc()));
+ return MatchOperand_Success;
+}
+
+/// parseShifterImm - Parse the shifter immediate operand for SSAT/USAT
+/// instructions. Legal values are:
+/// lsl #n 'n' in [0,31]
+/// asr #n 'n' in [1,32]
+/// n == 32 encoded as n == 0.
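+/// E.g. the "asr #4" in "ssat r0, #8, r1, asr #4" (illustrative example).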
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseShifterImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ if (Tok.isNot(AsmToken::Identifier)) {
+ Error(S, "shift operator 'asr' or 'lsl' expected");
+ return MatchOperand_ParseFail;
+ }
+ StringRef ShiftName = Tok.getString();
+ bool isASR;
+ if (ShiftName == "lsl" || ShiftName == "LSL")
+ isASR = false;
+ else if (ShiftName == "asr" || ShiftName == "ASR")
+ isASR = true;
+ else {
+ Error(S, "shift operator 'asr' or 'lsl' expected");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat the operator.
+
+ // A '#' and a shift amount.
+ if (Parser.getTok().isNot(AsmToken::Hash)) {
+ Error(Parser.getTok().getLoc(), "'#' expected");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat hash token.
+
+ const MCExpr *ShiftAmount;
+ SMLoc E = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(ShiftAmount)) {
+ Error(E, "malformed shift expression");
+ return MatchOperand_ParseFail;
+ }
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+ if (!CE) {
+ Error(E, "shift amount must be an immediate");
+ return MatchOperand_ParseFail;
+ }
- if (ParseMemory(Operands, ARMII::AddrMode3))
+ int64_t Val = CE->getValue();
+ if (isASR) {
+ // Shift amount must be in [1,32]
+ if (Val < 1 || Val > 32) {
+ Error(E, "'asr' shift amount must be in range [1,32]");
+ return MatchOperand_ParseFail;
+ }
+ // asr #32 encoded as asr #0, but is not allowed in Thumb2 mode.
+ if (isThumb() && Val == 32) {
+ Error(E, "'asr #32' shift amount not allowed in Thumb mode");
+ return MatchOperand_ParseFail;
+ }
+ if (Val == 32) Val = 0;
+ } else {
+ // Shift amount must be in [0,31]
+ if (Val < 0 || Val > 31) {
+ Error(E, "'lsl' shift amount must be in range [0,31]");
+ return MatchOperand_ParseFail;
+ }
+ }
+
+ E = Parser.getTok().getLoc();
+ Operands.push_back(ARMOperand::CreateShifterImm(isASR, Val, S, E));
+
+ return MatchOperand_Success;
+}
+
+/// parseRotImm - Parse the shifter immediate operand for SXTB/UXTB family
+/// of instructions. Legal values are:
+/// ror #n 'n' in {0, 8, 16, 24}
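+/// E.g. the "ror #8" in "sxtb r0, r1, ror #8" (illustrative example).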
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseRotImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ if (Tok.isNot(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
+ StringRef ShiftName = Tok.getString();
+ if (ShiftName != "ror" && ShiftName != "ROR")
return MatchOperand_NoMatch;
+ Parser.Lex(); // Eat the operator.
+
+ // A '#' and a rotate amount.
+ if (Parser.getTok().isNot(AsmToken::Hash)) {
+ Error(Parser.getTok().getLoc(), "'#' expected");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat hash token.
+
+ const MCExpr *ShiftAmount;
+ SMLoc E = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(ShiftAmount)) {
+ Error(E, "malformed rotate expression");
+ return MatchOperand_ParseFail;
+ }
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ShiftAmount);
+ if (!CE) {
+ Error(E, "rotate amount must be an immediate");
+ return MatchOperand_ParseFail;
+ }
+
+ int64_t Val = CE->getValue();
+ // Rotate amount must be in {0, 8, 16, 24}. Zero is an undocumented
+ // extension; normally it is expressed in asm by omitting the rotate
+ // operand entirely.
+ if (Val != 8 && Val != 16 && Val != 24 && Val != 0) {
+ Error(E, "'ror' rotate amount must be 8, 16, or 24");
+ return MatchOperand_ParseFail;
+ }
+
+ E = Parser.getTok().getLoc();
+ Operands.push_back(ARMOperand::CreateRotImm(Val, S, E));
+
+ return MatchOperand_Success;
+}
+
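+/// parseBitfield - Parse the lsb/width operand pair for BFC/BFI, e.g. the
+/// "#8, #4" in "bfi r0, r1, #8, #4" (illustrative example).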
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseBitfield(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+ // The bitfield descriptor is really two operands, the LSB and the width.
+ if (Parser.getTok().isNot(AsmToken::Hash)) {
+ Error(Parser.getTok().getLoc(), "'#' expected");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat hash token.
+
+ const MCExpr *LSBExpr;
+ SMLoc E = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(LSBExpr)) {
+ Error(E, "malformed immediate expression");
+ return MatchOperand_ParseFail;
+ }
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(LSBExpr);
+ if (!CE) {
+ Error(E, "'lsb' operand must be an immediate");
+ return MatchOperand_ParseFail;
+ }
+
+ int64_t LSB = CE->getValue();
+ // The LSB must be in the range [0,31]
+ if (LSB < 0 || LSB > 31) {
+ Error(E, "'lsb' operand must be in the range [0,31]");
+ return MatchOperand_ParseFail;
+ }
+ E = Parser.getTok().getLoc();
+
+ // Expect another immediate operand.
+ if (Parser.getTok().isNot(AsmToken::Comma)) {
+ Error(Parser.getTok().getLoc(), "too few operands");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat comma token.
+ if (Parser.getTok().isNot(AsmToken::Hash)) {
+ Error(Parser.getTok().getLoc(), "'#' expected");
+ return MatchOperand_ParseFail;
+ }
+ Parser.Lex(); // Eat hash token.
+
+ const MCExpr *WidthExpr;
+ if (getParser().ParseExpression(WidthExpr)) {
+ Error(E, "malformed immediate expression");
+ return MatchOperand_ParseFail;
+ }
+ CE = dyn_cast<MCConstantExpr>(WidthExpr);
+ if (!CE) {
+ Error(E, "'width' operand must be an immediate");
+ return MatchOperand_ParseFail;
+ }
+
+ int64_t Width = CE->getValue();
+ // The width must be in the range [1,32-lsb]
+ if (Width < 1 || Width > 32 - LSB) {
+ Error(E, "'width' operand must be in the range [1,32-lsb]");
+ return MatchOperand_ParseFail;
+ }
+ E = Parser.getTok().getLoc();
+
+ Operands.push_back(ARMOperand::CreateBitfield(LSB, Width, S, E));
return MatchOperand_Success;
}
-/// CvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parsePostIdxReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Check for a post-index addressing register operand. Specifically:
+ // postidx_reg := '+' register {, shift}
+ // | '-' register {, shift}
+ // | register {, shift}
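+ // E.g. the "-r2, lsl #2" in "ldr r0, [r1], -r2, lsl #2" (illustrative example).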
+
+ // This method must return MatchOperand_NoMatch without consuming any tokens
+ // in the case where there is no match, as other alternatives take other
+ // parse methods.
+ AsmToken Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ bool haveEaten = false;
+ bool isAdd = true;
+ int Reg = -1;
+ if (Tok.is(AsmToken::Plus)) {
+ Parser.Lex(); // Eat the '+' token.
+ haveEaten = true;
+ } else if (Tok.is(AsmToken::Minus)) {
+ Parser.Lex(); // Eat the '-' token.
+ isAdd = false;
+ haveEaten = true;
+ }
+ if (Parser.getTok().is(AsmToken::Identifier))
+ Reg = tryParseRegister();
+ if (Reg == -1) {
+ if (!haveEaten)
+ return MatchOperand_NoMatch;
+ Error(Parser.getTok().getLoc(), "register expected");
+ return MatchOperand_ParseFail;
+ }
+ SMLoc E = Parser.getTok().getLoc();
+
+ ARM_AM::ShiftOpc ShiftTy = ARM_AM::no_shift;
+ unsigned ShiftImm = 0;
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the ','.
+ if (parseMemRegOffsetShift(ShiftTy, ShiftImm))
+ return MatchOperand_ParseFail;
+ }
+
+ Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ShiftTy,
+ ShiftImm, S, E));
+
+ return MatchOperand_Success;
+}
+
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Check for a post-index addressing register operand. Specifically:
+ // am3offset := '+' register
+ // | '-' register
+ // | register
+ // | # imm
+ // | # + imm
+ // | # - imm
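+ // E.g. the "#-4" in "ldrh r0, [r1], #-4" (illustrative example).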
+
+ // This method must return MatchOperand_NoMatch without consuming any tokens
+ // in the case where there is no match, as other alternatives take other
+ // parse methods.
+ AsmToken Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+
+ // Do immediates first, as we always parse those if we have a '#'.
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat the '#'.
+ // Explicitly look for a '-', as we need to encode negative zero
+ // differently.
+ bool isNegative = Parser.getTok().is(AsmToken::Minus);
+ const MCExpr *Offset;
+ if (getParser().ParseExpression(Offset))
+ return MatchOperand_ParseFail;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+ if (!CE) {
+ Error(S, "constant expression expected");
+ return MatchOperand_ParseFail;
+ }
+ SMLoc E = Tok.getLoc();
+ // Negative zero is encoded as the flag value INT32_MIN.
+ int32_t Val = CE->getValue();
+ if (isNegative && Val == 0)
+ Val = INT32_MIN;
+
+ Operands.push_back(
+ ARMOperand::CreateImm(MCConstantExpr::Create(Val, getContext()), S, E));
+
+ return MatchOperand_Success;
+ }
+
+ bool haveEaten = false;
+ bool isAdd = true;
+ int Reg = -1;
+ if (Tok.is(AsmToken::Plus)) {
+ Parser.Lex(); // Eat the '+' token.
+ haveEaten = true;
+ } else if (Tok.is(AsmToken::Minus)) {
+ Parser.Lex(); // Eat the '-' token.
+ isAdd = false;
+ haveEaten = true;
+ }
+ if (Parser.getTok().is(AsmToken::Identifier))
+ Reg = tryParseRegister();
+ if (Reg == -1) {
+ if (!haveEaten)
+ return MatchOperand_NoMatch;
+ Error(Parser.getTok().getLoc(), "register expected");
+ return MatchOperand_ParseFail;
+ }
+ SMLoc E = Parser.getTok().getLoc();
+
+ Operands.push_back(ARMOperand::CreatePostIdxReg(Reg, isAdd, ARM_AM::no_shift,
+ 0, S, E));
+
+ return MatchOperand_Success;
+}
+
+/// cvtT2LdrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Rt, Rt2
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateReg(0));
+ // addr
+ ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
+ // pred
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtT2StrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateReg(0));
+ // Rt, Rt2
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+ // addr
+ ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
+ // pred
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't properly handle tied operands
/// when they refer to multiple MIOperands inside a single one.
bool ARMAsmParser::
-CvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
- ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+ ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
return true;
}
-/// CvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't properly handle tied operands
/// when they refer to multiple MIOperands inside a single one.
bool ARMAsmParser::
-CvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
- ((ARMOperand*)Operands[3])->addMemMode2Operands(Inst, 3);
+ ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
return true;
}
-/// CvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't properly handle tied operands
/// when they refer to multiple MIOperands inside a single one.
bool ARMAsmParser::
-CvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
- ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
+ ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+
+ ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
return true;
}
-/// CvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't properly handle tied operands
/// when they refer to multiple MIOperands inside a single one.
bool ARMAsmParser::
-CvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
- ((ARMOperand*)Operands[3])->addMemMode3Operands(Inst, 3);
+ ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Rt
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ // addr
+ ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+ // offset
+ ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
+ // pred
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Rt
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ // addr
+ ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+ // offset
+ ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
+ // pred
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ // Rt
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ // addr
+ ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+ // offset
+ ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
+ // pred
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ // Rt
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ // addr
+ ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
+ // offset
+ ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
+ // pred
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtLdrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Rt, Rt2
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ // addr
+ ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
+ // pred
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtStrdPre - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtStrdPre(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ // Rt, Rt2
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+ // addr
+ ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
+ // pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
return true;
}
+/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
+ // Create a writeback register dummy placeholder.
+ Inst.addOperand(MCOperand::CreateImm(0));
+ ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
+ ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+ return true;
+}
+
+/// cvtThumbMultiply - Convert parsed operands to MCInst.
+/// Needed here because the Asm Gen Matcher can't properly handle tied operands
+/// when they refer to multiple MIOperands inside a single one.
+bool ARMAsmParser::
+cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // The destination register must match one of the source registers.
+ if (Operands.size() == 6 &&
+ (((ARMOperand*)Operands[3])->getReg() !=
+ ((ARMOperand*)Operands[5])->getReg()) &&
+ (((ARMOperand*)Operands[3])->getReg() !=
+ ((ARMOperand*)Operands[4])->getReg())) {
+ Error(Operands[3]->getStartLoc(),
+ "destination register must match source register");
+ return false;
+ }
+ ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
+ ((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1);
+ ((ARMOperand*)Operands[4])->addRegOperands(Inst, 1);
+ // If we have a three-operand form, use that, else the second source operand
+ // is just the destination operand again.
+ if (Operands.size() == 6)
+ ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
+ else
+ Inst.addOperand(Inst.getOperand(0));
+ ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
+
+ return true;
+}
+
/// Parse an ARM memory expression. Return false if successful; otherwise
/// return true (an error will have been emitted). The first token must be a
/// '[' when called.
-///
-/// TODO Only preindexing and postindexing addressing are started, unindexed
-/// with option, etc are still to do.
bool ARMAsmParser::
-ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- ARMII::AddrMode AddrMode = ARMII::AddrModeNone) {
+parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S, E;
assert(Parser.getTok().is(AsmToken::LBrac) &&
"Token is not a Left Bracket");
@@ -1552,185 +3105,178 @@ ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
Parser.Lex(); // Eat left bracket token.
const AsmToken &BaseRegTok = Parser.getTok();
- if (BaseRegTok.isNot(AsmToken::Identifier)) {
- Error(BaseRegTok.getLoc(), "register expected");
- return true;
- }
- int BaseRegNum = TryParseRegister();
- if (BaseRegNum == -1) {
- Error(BaseRegTok.getLoc(), "register expected");
- return true;
- }
+ int BaseRegNum = tryParseRegister();
+ if (BaseRegNum == -1)
+ return Error(BaseRegTok.getLoc(), "register expected");
// The next token must either be a comma or a closing bracket.
const AsmToken &Tok = Parser.getTok();
if (!Tok.is(AsmToken::Comma) && !Tok.is(AsmToken::RBrac))
- return true;
+ return Error(Tok.getLoc(), "malformed memory operand");
- bool Preindexed = false;
- bool Postindexed = false;
- bool OffsetIsReg = false;
- bool Negative = false;
- bool Writeback = false;
- ARMOperand *WBOp = 0;
- int OffsetRegNum = -1;
- bool OffsetRegShifted = false;
- enum ARM_AM::ShiftOpc ShiftType = ARM_AM::lsl;
- const MCExpr *ShiftAmount = 0;
- const MCExpr *Offset = 0;
-
- // First look for preindexed address forms, that is after the "[Rn" we now
- // have to see if the next token is a comma.
- if (Tok.is(AsmToken::Comma)) {
- Preindexed = true;
- Parser.Lex(); // Eat comma token.
-
- if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType, ShiftAmount,
- Offset, OffsetIsReg, OffsetRegNum, E))
- return true;
- const AsmToken &RBracTok = Parser.getTok();
- if (RBracTok.isNot(AsmToken::RBrac)) {
- Error(RBracTok.getLoc(), "']' expected");
- return true;
- }
- E = RBracTok.getLoc();
+ if (Tok.is(AsmToken::RBrac)) {
+ E = Tok.getLoc();
Parser.Lex(); // Eat right bracket token.
- const AsmToken &ExclaimTok = Parser.getTok();
- if (ExclaimTok.is(AsmToken::Exclaim)) {
- // None of addrmode3 instruction uses "!"
- if (AddrMode == ARMII::AddrMode3)
- return true;
+ Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, ARM_AM::no_shift,
+ 0, 0, false, S, E));
- WBOp = ARMOperand::CreateToken(ExclaimTok.getString(),
- ExclaimTok.getLoc());
- Writeback = true;
- Parser.Lex(); // Eat exclaim token
- } else { // In addressing mode 2, pre-indexed mode always end with "!"
- if (AddrMode == ARMII::AddrMode2)
- Preindexed = false;
+ // If there's a pre-indexing writeback marker, '!', just add it as a token
+ // operand. It's rather odd, but syntactically valid.
+ if (Parser.getTok().is(AsmToken::Exclaim)) {
+ Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the '!'.
}
- } else {
- // The "[Rn" we have so far was not followed by a comma.
- // If there's anything other than the right brace, this is a post indexing
- // addressing form.
- E = Tok.getLoc();
- Parser.Lex(); // Eat right bracket token.
+ return false;
+ }
- const AsmToken &NextTok = Parser.getTok();
+ assert(Tok.is(AsmToken::Comma) && "Lost comma in memory operand?!");
+ Parser.Lex(); // Eat the comma.
- if (NextTok.isNot(AsmToken::EndOfStatement)) {
- Postindexed = true;
- Writeback = true;
+ // If we have a ':', it's an alignment specifier.
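+ // E.g. the ":128" in "vld1.32 {d0, d1}, [r0, :128]" (illustrative example).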
+ if (Parser.getTok().is(AsmToken::Colon)) {
+ Parser.Lex(); // Eat the ':'.
+ E = Parser.getTok().getLoc();
- if (NextTok.isNot(AsmToken::Comma)) {
- Error(NextTok.getLoc(), "',' expected");
- return true;
- }
-
- Parser.Lex(); // Eat comma token.
+ const MCExpr *Expr;
+ if (getParser().ParseExpression(Expr))
+ return true;
- if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType,
- ShiftAmount, Offset, OffsetIsReg, OffsetRegNum,
- E))
- return true;
+ // The expression has to be a constant. Memory references with relocations
+ // don't come through here, as they use the <label> forms of the relevant
+ // instructions.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+ if (!CE)
+ return Error(E, "constant expression expected");
+
+ unsigned Align = 0;
+ switch (CE->getValue()) {
+ default:
+ return Error(E, "alignment specifier must be 64, 128, or 256 bits");
+ case 64: Align = 8; break;
+ case 128: Align = 16; break;
+ case 256: Align = 32; break;
}
- }
- // Force Offset to exist if used.
- if (!OffsetIsReg) {
- if (!Offset)
- Offset = MCConstantExpr::Create(0, getContext());
- } else {
- if (AddrMode == ARMII::AddrMode3 && OffsetRegShifted) {
- Error(E, "shift amount not supported");
- return true;
+ // Now we should have the closing ']'
+ E = Parser.getTok().getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac))
+ return Error(E, "']' expected");
+ Parser.Lex(); // Eat right bracket token.
+
+ // Don't worry about range checking the value here. That's handled by
+ // the is*() predicates.
+ Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0,
+ ARM_AM::no_shift, 0, Align,
+ false, S, E));
+
+ // If there's a pre-indexing writeback marker, '!', just add it as a token
+ // operand.
+ if (Parser.getTok().is(AsmToken::Exclaim)) {
+ Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the '!'.
}
+
+ return false;
}
- Operands.push_back(ARMOperand::CreateMem(AddrMode, BaseRegNum, OffsetIsReg,
- Offset, OffsetRegNum, OffsetRegShifted,
- ShiftType, ShiftAmount, Preindexed,
- Postindexed, Negative, Writeback, S, E));
- if (WBOp)
- Operands.push_back(WBOp);
+ // If we have a '#', it's an immediate offset, else assume it's a register
+ // offset.
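+ // E.g. the "#-8" in "ldr r0, [r1, #-8]" (illustrative example).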
+ if (Parser.getTok().is(AsmToken::Hash)) {
+ Parser.Lex(); // Eat the '#'.
+ E = Parser.getTok().getLoc();
- return false;
-}
+ bool isNegative = Parser.getTok().is(AsmToken::Minus);
+ const MCExpr *Offset;
+ if (getParser().ParseExpression(Offset))
+ return true;
+
+ // The expression has to be a constant. Memory references with relocations
+ // don't come through here, as they use the <label> forms of the relevant
+ // instructions.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Offset);
+ if (!CE)
+ return Error(E, "constant expression expected");
+
+ // If the constant was #-0, represent it as INT32_MIN.
+ int32_t Val = CE->getValue();
+ if (isNegative && Val == 0)
+ CE = MCConstantExpr::Create(INT32_MIN, getContext());
+
+ // Now we should have the closing ']'
+ E = Parser.getTok().getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac))
+ return Error(E, "']' expected");
+ Parser.Lex(); // Eat right bracket token.
-/// Parse the offset of a memory operand after we have seen "[Rn," or "[Rn],"
-/// we will parse the following (were +/- means that a plus or minus is
-/// optional):
-/// +/-Rm
-/// +/-Rm, shift
-/// #offset
-/// we return false on success or an error otherwise.
-bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative,
- bool &OffsetRegShifted,
- enum ARM_AM::ShiftOpc &ShiftType,
- const MCExpr *&ShiftAmount,
- const MCExpr *&Offset,
- bool &OffsetIsReg,
- int &OffsetRegNum,
- SMLoc &E) {
- Negative = false;
- OffsetRegShifted = false;
- OffsetIsReg = false;
- OffsetRegNum = -1;
- const AsmToken &NextTok = Parser.getTok();
- E = NextTok.getLoc();
- if (NextTok.is(AsmToken::Plus))
- Parser.Lex(); // Eat plus token.
- else if (NextTok.is(AsmToken::Minus)) {
- Negative = true;
- Parser.Lex(); // Eat minus token
- }
- // See if there is a register following the "[Rn," or "[Rn]," we have so far.
- const AsmToken &OffsetRegTok = Parser.getTok();
- if (OffsetRegTok.is(AsmToken::Identifier)) {
- SMLoc CurLoc = OffsetRegTok.getLoc();
- OffsetRegNum = TryParseRegister();
- if (OffsetRegNum != -1) {
- OffsetIsReg = true;
- E = CurLoc;
+ // Don't worry about range checking the value here. That's handled by
+ // the is*() predicates.
+ Operands.push_back(ARMOperand::CreateMem(BaseRegNum, CE, 0,
+ ARM_AM::no_shift, 0, 0,
+ false, S, E));
+
+ // If there's a pre-indexing writeback marker, '!', just add it as a token
+ // operand.
+ if (Parser.getTok().is(AsmToken::Exclaim)) {
+ Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the '!'.
}
- }
- // If we parsed a register as the offset then there can be a shift after that.
- if (OffsetRegNum != -1) {
- // Look for a comma then a shift
- const AsmToken &Tok = Parser.getTok();
- if (Tok.is(AsmToken::Comma)) {
- Parser.Lex(); // Eat comma token.
+ return false;
+ }
- const AsmToken &Tok = Parser.getTok();
- if (ParseShift(ShiftType, ShiftAmount, E))
- return Error(Tok.getLoc(), "shift expected");
- OffsetRegShifted = true;
- }
+ // The register offset is optionally preceded by a '+' or '-'
+ bool isNegative = false;
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ isNegative = true;
+ Parser.Lex(); // Eat the '-'.
+ } else if (Parser.getTok().is(AsmToken::Plus)) {
+ // '+' is the default, so there's nothing to record.
+ Parser.Lex(); // Eat the '+'.
}
- else { // the "[Rn," or "[Rn,]" we have so far was not followed by "Rm"
- // Look for #offset following the "[Rn," or "[Rn],"
- const AsmToken &HashTok = Parser.getTok();
- if (HashTok.isNot(AsmToken::Hash))
- return Error(HashTok.getLoc(), "'#' expected");
- Parser.Lex(); // Eat hash token.
+ E = Parser.getTok().getLoc();
+ int OffsetRegNum = tryParseRegister();
+ if (OffsetRegNum == -1)
+ return Error(E, "register expected");
+
+ // If there's a shift operator, handle it.
+ ARM_AM::ShiftOpc ShiftType = ARM_AM::no_shift;
+ unsigned ShiftImm = 0;
+ if (Parser.getTok().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the ','.
+ if (parseMemRegOffsetShift(ShiftType, ShiftImm))
+ return true;
+ }
- if (getParser().ParseExpression(Offset))
- return true;
- E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ // Now we should have the closing ']'
+ E = Parser.getTok().getLoc();
+ if (Parser.getTok().isNot(AsmToken::RBrac))
+ return Error(E, "']' expected");
+ Parser.Lex(); // Eat right bracket token.
+
+ Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, OffsetRegNum,
+ ShiftType, ShiftImm, 0, isNegative,
+ S, E));
+
+ // If there's a pre-indexing writeback marker, '!', just add it as a token
+ // operand.
+ if (Parser.getTok().is(AsmToken::Exclaim)) {
+ Operands.push_back(ARMOperand::CreateToken("!",Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat the '!'.
}
+
return false;
}
-/// ParseShift as one of these two:
+/// parseMemRegOffsetShift - Parse one of these two forms:
/// ( lsl | lsr | asr | ror ) , # shift_amount
/// rrx
-/// and returns true if it parses a shift otherwise it returns false.
-bool ARMAsmParser::ParseShift(ARM_AM::ShiftOpc &St,
- const MCExpr *&ShiftAmount, SMLoc &E) {
+/// Returns false if a shift was parsed successfully, true on error.
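+/// E.g. the "lsl #2" in "ldr r0, [r1, r2, lsl #2]" (illustrative example).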
+bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
+ unsigned &Amount) {
+ SMLoc Loc = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
return true;
@@ -1746,28 +3292,86 @@ bool ARMAsmParser::ParseShift(ARM_AM::ShiftOpc &St,
else if (ShiftName == "rrx" || ShiftName == "RRX")
St = ARM_AM::rrx;
else
- return true;
+ return Error(Loc, "illegal shift operator");
Parser.Lex(); // Eat shift type token.
- // Rrx stands alone.
- if (St == ARM_AM::rrx)
- return false;
-
- // Otherwise, there must be a '#' and a shift amount.
- const AsmToken &HashTok = Parser.getTok();
- if (HashTok.isNot(AsmToken::Hash))
- return Error(HashTok.getLoc(), "'#' expected");
- Parser.Lex(); // Eat hash token.
+ // rrx stands alone.
+ Amount = 0;
+ if (St != ARM_AM::rrx) {
+ Loc = Parser.getTok().getLoc();
+ // A '#' and a shift amount.
+ const AsmToken &HashTok = Parser.getTok();
+ if (HashTok.isNot(AsmToken::Hash))
+ return Error(HashTok.getLoc(), "'#' expected");
+ Parser.Lex(); // Eat hash token.
- if (getParser().ParseExpression(ShiftAmount))
- return true;
+ const MCExpr *Expr;
+ if (getParser().ParseExpression(Expr))
+ return true;
+ // Range check the immediate.
+ // lsl, ror: 0 <= imm <= 31
+ // lsr, asr: 0 <= imm <= 32
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr);
+ if (!CE)
+ return Error(Loc, "shift amount must be an immediate");
+ int64_t Imm = CE->getValue();
+ if (Imm < 0 ||
+ ((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
+ ((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
+ return Error(Loc, "immediate shift value out of range");
+ Amount = Imm;
+ }
return false;
}
+/// parseFPImm - A floating point immediate expression operand.
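+/// E.g. the "#1.0" in "vmov.f32 s0, #1.0" (illustrative example).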
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ SMLoc S = Parser.getTok().getLoc();
+
+ if (Parser.getTok().isNot(AsmToken::Hash))
+ return MatchOperand_NoMatch;
+ Parser.Lex(); // Eat the '#'.
+
+ // Handle negation, as that still comes through as a separate token.
+ bool isNegative = false;
+ if (Parser.getTok().is(AsmToken::Minus)) {
+ isNegative = true;
+ Parser.Lex();
+ }
+ const AsmToken &Tok = Parser.getTok();
+ if (Tok.is(AsmToken::Real)) {
+ APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ // If we had a '-' in front, toggle the sign bit.
+ IntVal ^= (uint64_t)isNegative << 63;
+ int Val = ARM_AM::getFP64Imm(APInt(64, IntVal));
+ Parser.Lex(); // Eat the token.
+ if (Val == -1) {
+ TokError("floating point value out of range");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(ARMOperand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+ if (Tok.is(AsmToken::Integer)) {
+ int64_t Val = Tok.getIntVal();
+ Parser.Lex(); // Eat the token.
+ if (Val > 255 || Val < 0) {
+ TokError("encoded floating point value out of range");
+ return MatchOperand_ParseFail;
+ }
+ Operands.push_back(ARMOperand::CreateFPImm(Val, S, getContext()));
+ return MatchOperand_Success;
+ }
+
+ TokError("invalid floating point immediate");
+ return MatchOperand_ParseFail;
+}
/// Parse an ARM instruction operand. For now this parses the operand regardless
/// of the mnemonic.
-bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
StringRef Mnemonic) {
SMLoc S, E;
@@ -1787,13 +3391,20 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
Error(Parser.getTok().getLoc(), "unexpected token in operand");
return true;
case AsmToken::Identifier: {
- if (!TryParseRegisterWithWriteBack(Operands))
+ if (!tryParseRegisterWithWriteBack(Operands))
return false;
- int Res = TryParseShiftRegister(Operands);
+ int Res = tryParseShiftRegister(Operands);
if (Res == 0) // success
return false;
else if (Res == -1) // irrecoverable error
return true;
+ if (Mnemonic == "vmrs" && Parser.getTok().getString() == "apsr_nzcv") {
+ S = Parser.getTok().getLoc();
+ Parser.Lex();
+ Operands.push_back(ARMOperand::CreateToken("apsr_nzcv", S));
+ return false;
+ }
// Fall through for the Identifier case that is not a register or a
// special name.
@@ -1811,26 +3422,36 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return false;
}
case AsmToken::LBrac:
- return ParseMemory(Operands);
+ return parseMemory(Operands);
case AsmToken::LCurly:
- return ParseRegisterList(Operands);
- case AsmToken::Hash:
+ return parseRegisterList(Operands);
+ case AsmToken::Hash: {
// #42 -> immediate.
// TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
S = Parser.getTok().getLoc();
Parser.Lex();
+ bool isNegative = Parser.getTok().is(AsmToken::Minus);
const MCExpr *ImmVal;
if (getParser().ParseExpression(ImmVal))
return true;
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
+ if (!CE) {
+ Error(S, "constant expression expected");
+ return true;
+ }
+ int32_t Val = CE->getValue();
+ if (isNegative && Val == 0)
+ ImmVal = MCConstantExpr::Create(INT32_MIN, getContext());
E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(ARMOperand::CreateImm(ImmVal, S, E));
return false;
+ }
case AsmToken::Colon: {
// ":lower16:" and ":upper16:" expression prefixes
// FIXME: Check it's an expression prefix,
// e.g. (FOO - :lower16:BAR) isn't legal.
ARMMCExpr::VariantKind RefKind;
- if (ParsePrefix(RefKind))
+ if (parsePrefix(RefKind))
return true;
const MCExpr *SubExprVal;
@@ -1846,9 +3467,9 @@ bool ARMAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
}
}
-// ParsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
+// parsePrefix - Parse ARM 16-bit relocations expression prefix, i.e.
// :lower16: and :upper16:.
-bool ARMAsmParser::ParsePrefix(ARMMCExpr::VariantKind &RefKind) {
+bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
RefKind = ARMMCExpr::VK_ARM_None;
// :lower16: and :upper16: modifiers
@@ -1879,55 +3500,16 @@ bool ARMAsmParser::ParsePrefix(ARMMCExpr::VariantKind &RefKind) {
return false;
}
-const MCExpr *
-ARMAsmParser::ApplyPrefixToExpr(const MCExpr *E,
- MCSymbolRefExpr::VariantKind Variant) {
- // Recurse over the given expression, rebuilding it to apply the given variant
- // to the leftmost symbol.
- if (Variant == MCSymbolRefExpr::VK_None)
- return E;
-
- switch (E->getKind()) {
- case MCExpr::Target:
- llvm_unreachable("Can't handle target expr yet");
- case MCExpr::Constant:
- llvm_unreachable("Can't handle lower16/upper16 of constant yet");
-
- case MCExpr::SymbolRef: {
- const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
-
- if (SRE->getKind() != MCSymbolRefExpr::VK_None)
- return 0;
-
- return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, getContext());
- }
-
- case MCExpr::Unary:
- llvm_unreachable("Can't handle unary expressions yet");
-
- case MCExpr::Binary: {
- const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
- const MCExpr *LHS = ApplyPrefixToExpr(BE->getLHS(), Variant);
- const MCExpr *RHS = BE->getRHS();
- if (!LHS)
- return 0;
-
- return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext());
- }
- }
-
- assert(0 && "Invalid expression kind!");
- return 0;
-}
-
/// \brief Given a mnemonic, split out possible predication code and carry
/// setting letters to form a canonical mnemonic and flags.
//
// FIXME: Would be nice to autogen this.
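// E.g. "addseq" splits into "add" plus ARMCC::EQ with CarrySetting set (illustrative example).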
-static StringRef SplitMnemonic(StringRef Mnemonic,
- unsigned &PredicationCode,
- bool &CarrySetting,
- unsigned &ProcessorIMod) {
+// FIXME: This is a bit of a maze of special cases.
+StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
+ unsigned &PredicationCode,
+ bool &CarrySetting,
+ unsigned &ProcessorIMod,
+ StringRef &ITMask) {
PredicationCode = ARMCC::AL;
CarrySetting = false;
ProcessorIMod = 0;
@@ -1935,23 +3517,22 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
// Ignore some mnemonics we know aren't predicated forms.
//
// FIXME: Would be nice to autogen this.
- if (Mnemonic == "teq" || Mnemonic == "vceq" ||
- Mnemonic == "movs" ||
- Mnemonic == "svc" ||
- (Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" ||
- Mnemonic == "vmls" || Mnemonic == "vnmls") ||
- Mnemonic == "vacge" || Mnemonic == "vcge" ||
- Mnemonic == "vclt" ||
- Mnemonic == "vacgt" || Mnemonic == "vcgt" ||
- Mnemonic == "vcle" ||
- (Mnemonic == "smlal" || Mnemonic == "umaal" || Mnemonic == "umlal" ||
- Mnemonic == "vabal" || Mnemonic == "vmlal" || Mnemonic == "vpadal" ||
- Mnemonic == "vqdmlal" || Mnemonic == "bics"))
+ if ((Mnemonic == "movs" && isThumb()) ||
+ Mnemonic == "teq" || Mnemonic == "vceq" || Mnemonic == "svc" ||
+ Mnemonic == "mls" || Mnemonic == "smmls" || Mnemonic == "vcls" ||
+ Mnemonic == "vmls" || Mnemonic == "vnmls" || Mnemonic == "vacge" ||
+ Mnemonic == "vcge" || Mnemonic == "vclt" || Mnemonic == "vacgt" ||
+ Mnemonic == "vcgt" || Mnemonic == "vcle" || Mnemonic == "smlal" ||
+ Mnemonic == "umaal" || Mnemonic == "umlal" || Mnemonic == "vabal" ||
+ Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal")
return Mnemonic;
// First, split out any predication code. Ignore mnemonics we know aren't
// predicated but do have a carry-set and so weren't caught above.
- if (Mnemonic != "adcs") {
+ if (Mnemonic != "adcs" && Mnemonic != "bics" && Mnemonic != "movs" &&
+ Mnemonic != "muls" && Mnemonic != "smlals" && Mnemonic != "smulls" &&
+ Mnemonic != "umlals" && Mnemonic != "umulls" && Mnemonic != "lsls" &&
+ Mnemonic != "sbcs" && Mnemonic != "rscs") {
unsigned CC = StringSwitch<unsigned>(Mnemonic.substr(Mnemonic.size()-2))
.Case("eq", ARMCC::EQ)
.Case("ne", ARMCC::NE)
@@ -1980,11 +3561,12 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
// Next, determine if we have a carry setting bit. We explicitly ignore all
// the instructions we know end in 's'.
if (Mnemonic.endswith("s") &&
- !(Mnemonic == "asrs" || Mnemonic == "cps" || Mnemonic == "mls" ||
- Mnemonic == "movs" || Mnemonic == "mrs" || Mnemonic == "smmls" ||
- Mnemonic == "vabs" || Mnemonic == "vcls" || Mnemonic == "vmls" ||
- Mnemonic == "vmrs" || Mnemonic == "vnmls" || Mnemonic == "vqabs" ||
- Mnemonic == "vrecps" || Mnemonic == "vrsqrts")) {
+ !(Mnemonic == "cps" || Mnemonic == "mls" ||
+ Mnemonic == "mrs" || Mnemonic == "smmls" || Mnemonic == "vabs" ||
+ Mnemonic == "vcls" || Mnemonic == "vmls" || Mnemonic == "vmrs" ||
+ Mnemonic == "vnmls" || Mnemonic == "vqabs" || Mnemonic == "vrecps" ||
+ Mnemonic == "vrsqrts" || Mnemonic == "srs" ||
+ (Mnemonic == "movs" && isThumb()))) {
Mnemonic = Mnemonic.slice(0, Mnemonic.size() - 1);
CarrySetting = true;
}
@@ -2004,6 +3586,12 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
}
}
+ // The "it" instruction has the condition mask on the end of the mnemonic.
+ if (Mnemonic.startswith("it")) {
+ ITMask = Mnemonic.slice(2, Mnemonic.size());
+ Mnemonic = Mnemonic.slice(0, 2);
+ }
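For illustration, a minimal standalone sketch of the slice above, using std::string in place of StringRef (an assumption made only to keep the sketch self-contained): it peels "itet" apart into the mnemonic "it" and the mask suffix "et".

#include <cstdio>
#include <string>

int main() {
  std::string Mnemonic = "itet", ITMask;
  if (Mnemonic.compare(0, 2, "it") == 0) {
    ITMask = Mnemonic.substr(2);      // "et"
    Mnemonic = Mnemonic.substr(0, 2); // "it"
  }
  printf("%s %s\n", Mnemonic.c_str(), ITMask.c_str()); // prints: it et
}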
+
return Mnemonic;
}
@@ -2012,37 +3600,154 @@ static StringRef SplitMnemonic(StringRef Mnemonic,
//
// FIXME: It would be nice to autogen this.
void ARMAsmParser::
-GetMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
bool &CanAcceptPredicationCode) {
if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
- Mnemonic == "smull" || Mnemonic == "add" || Mnemonic == "adc" ||
+ Mnemonic == "add" || Mnemonic == "adc" ||
Mnemonic == "mul" || Mnemonic == "bic" || Mnemonic == "asr" ||
- Mnemonic == "umlal" || Mnemonic == "orr" || Mnemonic == "mvn" ||
+ Mnemonic == "orr" || Mnemonic == "mvn" ||
Mnemonic == "rsb" || Mnemonic == "rsc" || Mnemonic == "orn" ||
- Mnemonic == "sbc" || Mnemonic == "mla" || Mnemonic == "umull" ||
- Mnemonic == "eor" || Mnemonic == "smlal" ||
- (Mnemonic == "mov" && !isThumbOne())) {
+ Mnemonic == "sbc" || Mnemonic == "eor" || Mnemonic == "neg" ||
+ (!isThumb() && (Mnemonic == "smull" || Mnemonic == "mov" ||
+ Mnemonic == "mla" || Mnemonic == "smlal" ||
+ Mnemonic == "umlal" || Mnemonic == "umull"))) {
CanAcceptCarrySet = true;
- } else {
+ } else
CanAcceptCarrySet = false;
- }
if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" ||
Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" ||
Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" ||
Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" ||
- Mnemonic == "dsb" || Mnemonic == "movs" || Mnemonic == "isb" ||
- Mnemonic == "clrex" || Mnemonic.startswith("cps")) {
+ Mnemonic == "dsb" || Mnemonic == "isb" || Mnemonic == "setend" ||
+ (Mnemonic == "clrex" && !isThumb()) ||
+ (Mnemonic == "nop" && isThumbOne()) ||
+ ((Mnemonic == "pld" || Mnemonic == "pli" || Mnemonic == "pldw" ||
+ Mnemonic == "ldc2" || Mnemonic == "ldc2l" ||
+ Mnemonic == "stc2" || Mnemonic == "stc2l") && !isThumb()) ||
+ ((Mnemonic.startswith("rfe") || Mnemonic.startswith("srs")) &&
+ !isThumb()) ||
+ Mnemonic.startswith("cps") || (Mnemonic == "movs" && isThumbOne())) {
CanAcceptPredicationCode = false;
- } else {
+ } else
CanAcceptPredicationCode = true;
- }
- if (isThumb())
+ if (isThumb()) {
if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
CanAcceptPredicationCode = false;
+ }
+}
+
+bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // FIXME: This is all horribly hacky. We really need a better way to deal
+ // with optional operands like this in the matcher table.
+
+ // The 'mov' mnemonic is special. One variant has a cc_out operand, while
+ // another does not. Specifically, the MOVW instruction does not. So we
+ // special case it here and remove the defaulted (non-setting) cc_out
+ // operand if that's the instruction we're trying to match.
+ //
+ // We do this as post-processing of the explicit operands rather than just
+ // conditionally adding the cc_out in the first place because we need
+ // to check the type of the parsed immediate operand.
+ if (Mnemonic == "mov" && Operands.size() > 4 && !isThumb() &&
+ !static_cast<ARMOperand*>(Operands[4])->isARMSOImm() &&
+ static_cast<ARMOperand*>(Operands[4])->isImm0_65535Expr() &&
+ static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+ return true;
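As a worked example of the check above: "mov r0, #0x1234" names an immediate that fits in 16 bits but is not an ARM shifter-operand immediate, so only MOVW can encode it and the defaulted cc_out has to go. The sketch below is standalone; isSOImm is a crude stand-in for ARMOperand::isARMSOImm, not the real implementation.

#include <cstdio>

// An ARM SO-imm is an 8-bit value rotated right by an even amount; test by
// rotating the candidate left by every even amount and checking the result.
static bool isSOImm(unsigned V) {
  for (unsigned R = 0; R < 32; R += 2) {
    unsigned Rot = R ? ((V << R) | (V >> (32 - R))) : V;
    if (Rot <= 0xff)
      return true;
  }
  return false;
}

int main() {
  printf("%d\n", !isSOImm(0x1234) && 0x1234 <= 0xffff); // 1 -> omit cc_out
}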
+
+ // Register-register 'add' for thumb does not have a cc_out operand
+ // when there are only two register operands.
+ if (isThumb() && Mnemonic == "add" && Operands.size() == 5 &&
+ static_cast<ARMOperand*>(Operands[3])->isReg() &&
+ static_cast<ARMOperand*>(Operands[4])->isReg() &&
+ static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+ return true;
+ // Register-register 'add' for thumb does not have a cc_out operand
+ // when it's an ADD Rdm, SP, {Rdm|#imm0_255} instruction. We do
+ // have to check the immediate range here since Thumb2 has a variant
+ // that can handle a different range and has a cc_out operand.
+ if (((isThumb() && Mnemonic == "add") ||
+ (isThumbTwo() && Mnemonic == "sub")) &&
+ Operands.size() == 6 &&
+ static_cast<ARMOperand*>(Operands[3])->isReg() &&
+ static_cast<ARMOperand*>(Operands[4])->isReg() &&
+ static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::SP &&
+ static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+ (static_cast<ARMOperand*>(Operands[5])->isReg() ||
+ static_cast<ARMOperand*>(Operands[5])->isImm0_1020s4()))
+ return true;
+ // For Thumb2, add/sub immediate does not have a cc_out operand for the
+ // imm0_4095 variant. That's the least-preferred variant when
+ // selecting via the generic "add" mnemonic, so to know that we
+ // should remove the cc_out operand, we have to explicitly check that
+ // it's not one of the other variants. Ugh.
+ if (isThumbTwo() && (Mnemonic == "add" || Mnemonic == "sub") &&
+ Operands.size() == 6 &&
+ static_cast<ARMOperand*>(Operands[3])->isReg() &&
+ static_cast<ARMOperand*>(Operands[4])->isReg() &&
+ static_cast<ARMOperand*>(Operands[5])->isImm()) {
+ // Nest conditions rather than one big 'if' statement for readability.
+ //
+ // If either register is a high reg, it's either one of the SP
+ // variants (handled above) or a 32-bit encoding, so we just
+ // check against T3.
+ if ((!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg())) &&
+ static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
+ return false;
+ // If both registers are low, we're in an IT block, and the immediate is
+ // in range, we should use encoding T1 instead, which has a cc_out.
+ if (inITBlock() &&
+ isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) &&
+ isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) &&
+ static_cast<ARMOperand*>(Operands[5])->isImm0_7())
+ return false;
+
+ // Otherwise, we use encoding T4, which does not have a cc_out
+ // operand.
+ return true;
+ }
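Reduced to booleans, the triage above reads as the following standalone sketch (a simplification; the real predicates are the operand-class checks used in the code):

#include <cstdio>

static bool omitCCOut(bool bothLowRegs, bool inIT, bool isT2SOImm,
                      bool isImm0_7) {
  if (!bothLowRegs && isT2SOImm)
    return false; // encoding T3, which has a cc_out
  if (inIT && bothLowRegs && isImm0_7)
    return false; // encoding T1, which also has a cc_out
  return true;    // fall through to T4 (imm0_4095), which has none
}

int main() {
  // "add r8, r8, #1" keeps cc_out (T3); "add r0, r1, #4095" drops it (T4).
  printf("%d %d\n", omitCCOut(false, false, true, false),
         omitCCOut(true, false, false, false));
}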
+
+ // The Thumb2 multiply instruction doesn't have a CCOut register, so
+ // if we have a "mul" mnemonic in Thumb mode, check if we'll be able to
+ // use the 16-bit encoding or not.
+ if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 6 &&
+ static_cast<ARMOperand*>(Operands[1])->getReg() == 0 &&
+ static_cast<ARMOperand*>(Operands[3])->isReg() &&
+ static_cast<ARMOperand*>(Operands[4])->isReg() &&
+ static_cast<ARMOperand*>(Operands[5])->isReg() &&
+ // If the registers aren't low regs, the destination reg isn't the
+ // same as one of the source regs, or the cc_out operand is zero
+ // outside of an IT block, we have to use the 32-bit encoding, so
+ // remove the cc_out operand.
+ (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
+ !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) ||
+ !inITBlock() ||
+ (static_cast<ARMOperand*>(Operands[3])->getReg() !=
+ static_cast<ARMOperand*>(Operands[5])->getReg() &&
+ static_cast<ARMOperand*>(Operands[3])->getReg() !=
+ static_cast<ARMOperand*>(Operands[4])->getReg())))
+ return true;
+
+ // Register-register 'add/sub' for thumb does not have a cc_out operand
+ // when it's an ADD/SUB SP, #imm. Be lenient on count since there's also
+ // the "add/sub SP, SP, #imm" version. If the follow-up operands aren't
+ // right, this will result in better diagnostics (which operand is off)
+ // anyway.
+ if (isThumb() && (Mnemonic == "add" || Mnemonic == "sub") &&
+ (Operands.size() == 5 || Operands.size() == 6) &&
+ static_cast<ARMOperand*>(Operands[3])->isReg() &&
+ static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::SP &&
+ static_cast<ARMOperand*>(Operands[1])->getReg() == 0)
+ return true;
+
+ return false;
}
/// Parse an arm instruction mnemonic followed by its operands.
@@ -2050,16 +3755,51 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create the leading tokens for the mnemonic, split by '.' characters.
size_t Start = 0, Next = Name.find('.');
- StringRef Head = Name.slice(Start, Next);
+ StringRef Mnemonic = Name.slice(Start, Next);
// Split out the predication code and carry setting flag from the mnemonic.
unsigned PredicationCode;
unsigned ProcessorIMod;
bool CarrySetting;
- Head = SplitMnemonic(Head, PredicationCode, CarrySetting,
- ProcessorIMod);
+ StringRef ITMask;
+ Mnemonic = splitMnemonic(Mnemonic, PredicationCode, CarrySetting,
+ ProcessorIMod, ITMask);
+
+ // In Thumb1, only the branch (B) instruction can be predicated.
+ if (isThumbOne() && PredicationCode != ARMCC::AL && Mnemonic != "b") {
+ Parser.EatToEndOfStatement();
+ return Error(NameLoc, "conditional execution not supported in Thumb1");
+ }
+
+ Operands.push_back(ARMOperand::CreateToken(Mnemonic, NameLoc));
- Operands.push_back(ARMOperand::CreateToken(Head, NameLoc));
+ // Handle the IT instruction ITMask. Convert it to a bitmask. This is
+ // the mask as it will be for the IT encoding if the condition code has
+ // a '1' as its bit0 (i.e. 't' ==> '1'). In the case where the condition
+ // code's bit0 is zero, the instruction post-processing will adjust the
+ // mask accordingly.
+ if (Mnemonic == "it") {
+ SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + 2);
+ if (ITMask.size() > 3) {
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "too many conditions on IT instruction");
+ }
+ unsigned Mask = 8;
+ for (unsigned i = ITMask.size(); i != 0; --i) {
+ char pos = ITMask[i - 1];
+ if (pos != 't' && pos != 'e') {
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "illegal IT block condition mask '" + ITMask + "'");
+ }
+ Mask >>= 1;
+ if (pos == 't')
+ Mask |= 8;
+ }
+ Operands.push_back(ARMOperand::CreateITMask(Mask, Loc));
+ }
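To make the loop above concrete, here is a standalone sketch of the same mask construction, detached from the parser; the values in the comment are what the loop produces before any post-processing.

#include <cstdio>
#include <string>

static unsigned buildITMask(const std::string &ITMask) {
  unsigned Mask = 8; // seed the terminating '1' at bit 3
  for (size_t i = ITMask.size(); i != 0; --i) {
    Mask >>= 1;
    if (ITMask[i - 1] == 't')
      Mask |= 8;
  }
  return Mask;
}

int main() {
  // "it" -> 0b1000, "itt" -> 0b1100, "ite" -> 0b0100, "itet" -> 0b0110
  printf("%#x %#x %#x %#x\n", buildITMask(""), buildITMask("t"),
         buildITMask("e"), buildITMask("et"));
}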
+
+ // FIXME: This is all a pretty gross hack. We should automatically handle
+ // optional operands like this via tblgen.
// Next, add the CCOut and ConditionCode operands, if needed.
//
@@ -2069,34 +3809,36 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
// the matcher deal with finding the right instruction or generating an
// appropriate error.
bool CanAcceptCarrySet, CanAcceptPredicationCode;
- GetMnemonicAcceptInfo(Head, CanAcceptCarrySet, CanAcceptPredicationCode);
+ getMnemonicAcceptInfo(Mnemonic, CanAcceptCarrySet, CanAcceptPredicationCode);
// If we had a carry-set on an instruction that can't do that, issue an
// error.
if (!CanAcceptCarrySet && CarrySetting) {
Parser.EatToEndOfStatement();
- return Error(NameLoc, "instruction '" + Head +
+ return Error(NameLoc, "instruction '" + Mnemonic +
"' can not set flags, but 's' suffix specified");
}
+ // If we had a predication code on an instruction that can't do that, issue an
+ // error.
+ if (!CanAcceptPredicationCode && PredicationCode != ARMCC::AL) {
+ Parser.EatToEndOfStatement();
+ return Error(NameLoc, "instruction '" + Mnemonic +
+ "' is not predicable, but condition code specified");
+ }
// Add the carry setting operand, if necessary.
- //
- // FIXME: It would be awesome if we could somehow invent a location such that
- // match errors on this operand would print a nice diagnostic about how the
- // 's' character in the mnemonic resulted in a CCOut operand.
- if (CanAcceptCarrySet)
+ if (CanAcceptCarrySet) {
+ SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size());
Operands.push_back(ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0,
- NameLoc));
+ Loc));
+ }
// Add the predication code operand, if necessary.
if (CanAcceptPredicationCode) {
+ SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size() +
+ CarrySetting);
Operands.push_back(ARMOperand::CreateCondCode(
- ARMCC::CondCodes(PredicationCode), NameLoc));
- } else {
- // This mnemonic can't ever accept a predication code, but the user wrote
- // one (or misspelled another mnemonic).
-
- // FIXME: Issue a nice error.
+ ARMCC::CondCodes(PredicationCode), Loc));
}
// Add the processor imod operand, if necessary.
@@ -2104,11 +3846,6 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
Operands.push_back(ARMOperand::CreateImm(
MCConstantExpr::Create(ProcessorIMod, getContext()),
NameLoc, NameLoc));
- } else {
- // This mnemonic can't ever accept a imod, but the user wrote
- // one (or misspelled another mnemonic).
-
- // FIXME: Issue a nice error.
}
// Add the remaining tokens in the mnemonic.
@@ -2117,13 +3854,19 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
Next = Name.find('.', Start + 1);
StringRef ExtraToken = Name.slice(Start, Next);
- Operands.push_back(ARMOperand::CreateToken(ExtraToken, NameLoc));
+ // For now, we're only parsing Thumb1 (for the most part), so
+ // just ignore ".n" qualifiers. We'll use them to restrict
+ // matching when we do Thumb2.
+ if (ExtraToken != ".n") {
+ SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+ Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
+ }
}
// Read the remaining operands.
if (getLexer().isNot(AsmToken::EndOfStatement)) {
// Read the first operand.
- if (ParseOperand(Operands, Head)) {
+ if (parseOperand(Operands, Mnemonic)) {
Parser.EatToEndOfStatement();
return true;
}
@@ -2132,7 +3875,7 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
Parser.Lex(); // Eat the comma.
// Parse and remember the operand.
- if (ParseOperand(Operands, Head)) {
+ if (parseOperand(Operands, Mnemonic)) {
Parser.EatToEndOfStatement();
return true;
}
@@ -2140,75 +3883,548 @@ bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
Parser.EatToEndOfStatement();
- return TokError("unexpected token in argument list");
+ return Error(Loc, "unexpected token in argument list");
}
Parser.Lex(); // Consume the EndOfStatement
+
+ // Some instructions, mostly Thumb, have forms for the same mnemonic that
+ // do and don't have a cc_out optional-def operand. With some spot-checks
+ // of the operand list, we can figure out which variant we're trying to
+ // parse and adjust accordingly before actually matching. We shouldn't ever
+ // try to remove a cc_out operand that was explicitly set on the
+ // mnemonic, of course (CarrySetting == true). Reason #317 why the
+ // table-driven matcher doesn't fit well with the ARM instruction set.
+ if (!CarrySetting && shouldOmitCCOutOperand(Mnemonic, Operands)) {
+ ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+ Operands.erase(Operands.begin() + 1);
+ delete Op;
+ }
+
+ // ARM mode 'blx' needs special handling, as the register operand version
+ // is predicable, but the label operand version is not. So, we can't rely
+ // on the Mnemonic based checking to correctly figure out when to put
+ // a k_CondCode operand in the list. If we're trying to match the label
+ // version, remove the k_CondCode operand here.
+ if (!isThumb() && Mnemonic == "blx" && Operands.size() == 3 &&
+ static_cast<ARMOperand*>(Operands[2])->isImm()) {
+ ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+ Operands.erase(Operands.begin() + 1);
+ delete Op;
+ }
+
+ // The vector-compare-to-zero instructions have a literal token "#0" at
+ // the end that arrives here as an immediate operand. Convert it to a
+ // token to play nicely with the matcher.
+ if ((Mnemonic == "vceq" || Mnemonic == "vcge" || Mnemonic == "vcgt" ||
+ Mnemonic == "vcle" || Mnemonic == "vclt") && Operands.size() == 6 &&
+ static_cast<ARMOperand*>(Operands[5])->isImm()) {
+ ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]);
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+ if (CE && CE->getValue() == 0) {
+ Operands.erase(Operands.begin() + 5);
+ Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc()));
+ delete Op;
+ }
+ }
+ // VCMP{E} does the same thing, but with a different operand count.
+ if ((Mnemonic == "vcmp" || Mnemonic == "vcmpe") && Operands.size() == 5 &&
+ static_cast<ARMOperand*>(Operands[4])->isImm()) {
+ ARMOperand *Op = static_cast<ARMOperand*>(Operands[4]);
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+ if (CE && CE->getValue() == 0) {
+ Operands.erase(Operands.begin() + 4);
+ Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc()));
+ delete Op;
+ }
+ }
+ // Similarly, the Thumb1 "RSB" instruction has a literal "#0" on the
+ // end. Convert it to a token here.
+ if (Mnemonic == "rsb" && isThumb() && Operands.size() == 6 &&
+ static_cast<ARMOperand*>(Operands[5])->isImm()) {
+ ARMOperand *Op = static_cast<ARMOperand*>(Operands[5]);
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+ if (CE && CE->getValue() == 0) {
+ Operands.erase(Operands.begin() + 5);
+ Operands.push_back(ARMOperand::CreateToken("#0", Op->getStartLoc()));
+ delete Op;
+ }
+ }
+
+ return false;
+}
+
+// Validate context-sensitive operand constraints.
+
+// Return 'true' if the register list contains non-low GPR registers,
+// 'false' otherwise. If Reg is in the register list or is HiReg, set
+// 'containsReg' to true.
+static bool checkLowRegisterList(const MCInst &Inst, unsigned OpNo, unsigned Reg,
+ unsigned HiReg, bool &containsReg) {
+ containsReg = false;
+ for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+ unsigned OpReg = Inst.getOperand(i).getReg();
+ if (OpReg == Reg)
+ containsReg = true;
+ // Anything other than a low register isn't legal here.
+ if (!isARMLowRegister(OpReg) && (!HiReg || OpReg != HiReg))
+ return true;
+ }
+ return false;
+}
+
+// Check if the specified register is in the register list of the
+// instruction, starting at the indicated operand number.
+static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
+ for (unsigned i = OpNo; i < Inst.getNumOperands(); ++i) {
+ unsigned OpReg = Inst.getOperand(i).getReg();
+ if (OpReg == Reg)
+ return true;
+ }
+ return false;
+}
+
+// FIXME: We would really prefer to have MCInstrInfo (the wrapper around
+// the ARMInsts array) instead. Getting that here requires awkward
+// API changes, though. Better way?
+namespace llvm {
+extern MCInstrDesc ARMInsts[];
+}
+static MCInstrDesc &getInstDesc(unsigned Opcode) {
+ return ARMInsts[Opcode];
+}
+
+// FIXME: We would really like to be able to tablegen'erate this.
+bool ARMAsmParser::
+validateInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+ SMLoc Loc = Operands[0]->getStartLoc();
+ // Check the IT block state first.
+ // NOTE: In Thumb mode, the BKPT instruction has the interesting property of
+ // being allowed in IT blocks, but not being predicable. It just always
+ // executes.
+ if (inITBlock() && Inst.getOpcode() != ARM::tBKPT) {
+ unsigned bit = 1;
+ if (ITState.FirstCond)
+ ITState.FirstCond = false;
+ else
+ bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+ // The instruction must be predicable.
+ if (!MCID.isPredicable())
+ return Error(Loc, "instructions in IT block must be predicable");
+ unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
+ unsigned ITCond = bit ? ITState.Cond :
+ ARMCC::getOppositeCondition(ITState.Cond);
+ if (Cond != ITCond) {
+ // Find the condition code Operand to get its SMLoc information.
+ SMLoc CondLoc;
+ for (unsigned i = 1; i < Operands.size(); ++i)
+ if (static_cast<ARMOperand*>(Operands[i])->isCondCode())
+ CondLoc = Operands[i]->getStartLoc();
+ return Error(CondLoc, "incorrect condition in IT block; got '" +
+ StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
+ "', but expected '" +
+ ARMCondCodeToString(ARMCC::CondCodes(ITCond)) + "'");
+ }
+ // Check for non-'al' condition codes outside of the IT block.
+ } else if (isThumbTwo() && MCID.isPredicable() &&
+ Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
+ ARMCC::AL && Inst.getOpcode() != ARM::tB &&
+ Inst.getOpcode() != ARM::t2B)
+ return Error(Loc, "predicated instructions must be in IT block");
+
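For a concrete feel of the mask arithmetic above, a standalone sketch that derives the condition each slot of an IT block must carry from the parse-time mask (where, as in the parser, a '1' bit means 'then'; the opposite-condition flip is shown for EQ/NE only, whereas the real code uses ARMCC::getOppositeCondition):

#include <cstdio>

// Pos is the 0-based instruction index inside the block; for slots 1..3 the
// 't'/'e' letter lives at bit 4-Pos of the parse-time mask.
static const char *expectedCond(unsigned Cond, unsigned Mask, unsigned Pos) {
  unsigned bit = Pos == 0 ? 1 : (Mask >> (4 - Pos)) & 1;
  unsigned ITCond = bit ? Cond : Cond ^ 1; // EQ (0) <-> NE (1) only
  return ITCond == 0 ? "eq" : "ne";
}

int main() {
  // "itet eq" parses to mask 0b0110; its slots must be eq, ne, eq.
  printf("%s %s %s\n", expectedCond(0, 0x6, 0), expectedCond(0, 0x6, 1),
         expectedCond(0, 0x6, 2));
}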
+ switch (Inst.getOpcode()) {
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ case ARM::LDREXD: {
+ // Rt2 must be Rt + 1.
+ unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg());
+ unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+ if (Rt2 != Rt + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "destination operands must be sequential");
+ return false;
+ }
+ case ARM::STRD: {
+ // Rt2 must be Rt + 1.
+ unsigned Rt = getARMRegisterNumbering(Inst.getOperand(0).getReg());
+ unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+ if (Rt2 != Rt + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "source operands must be sequential");
+ return false;
+ }
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ case ARM::STREXD: {
+ // Rt2 must be Rt + 1.
+ unsigned Rt = getARMRegisterNumbering(Inst.getOperand(1).getReg());
+ unsigned Rt2 = getARMRegisterNumbering(Inst.getOperand(2).getReg());
+ if (Rt2 != Rt + 1)
+ return Error(Operands[3]->getStartLoc(),
+ "source operands must be sequential");
+ return false;
+ }
+ case ARM::SBFX:
+ case ARM::UBFX: {
+ // width must be in range [1, 32-lsb]
+ unsigned lsb = Inst.getOperand(2).getImm();
+ unsigned widthm1 = Inst.getOperand(3).getImm();
+ if (widthm1 >= 32 - lsb)
+ return Error(Operands[5]->getStartLoc(),
+ "bitfield width must be in range [1,32-lsb]");
+ return false;
+ }
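The width check above, pulled out as a standalone sketch with a passing and a failing case:

#include <cstdio>

// The operand encodes width-1; lsb+width must not spill past bit 31.
static bool bitfieldOk(unsigned lsb, unsigned width) {
  return (width - 1) < 32 - lsb; // same test as widthm1 >= 32 - lsb, negated
}

int main() {
  printf("%d\n", bitfieldOk(28, 4)); // 1: "sbfx r0, r1, #28, #4" is fine
  printf("%d\n", bitfieldOk(28, 5)); // 0: width 5 runs past bit 31
}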
+ case ARM::tLDMIA: {
+ // If we're parsing Thumb2, the .w variant is available and handles
+ // most cases that are normally illegal for a Thumb1 LDM
+ // instruction. We'll make the transformation in processInstruction()
+ // if necessary.
+ //
+ // Thumb LDM instructions have writeback iff the base register is not
+ // in the register list.
+ unsigned Rn = Inst.getOperand(0).getReg();
+ bool hasWritebackToken =
+ (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+ static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) && !isThumbTwo())
+ return Error(Operands[3 + hasWritebackToken]->getStartLoc(),
+ "registers must be in range r0-r7");
+ // If we should have writeback, then there should be a '!' token.
+ if (!listContainsBase && !hasWritebackToken && !isThumbTwo())
+ return Error(Operands[2]->getStartLoc(),
+ "writeback operator '!' expected");
+ // If we should not have writeback, there must not be a '!'. This is
+ // true even for the 32-bit wide encodings.
+ if (listContainsBase && hasWritebackToken)
+ return Error(Operands[3]->getStartLoc(),
+ "writeback operator '!' not allowed when base register "
+ "in register list");
+
+ break;
+ }
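The three diagnostics above boil down to one rule; a standalone sketch, with the assembly that triggers each outcome noted in the comments:

#include <cstdio>

static const char *checkThumbLDM(bool baseInList, bool hasBang) {
  if (!baseInList && !hasBang)
    return "error: writeback operator '!' expected";
  if (baseInList && hasBang)
    return "error: '!' not allowed when base register in register list";
  return "ok";
}

int main() {
  printf("%s\n", checkThumbLDM(false, true));  // ldmia r3!, {r0, r1}
  printf("%s\n", checkThumbLDM(true, false));  // ldmia r3, {r2, r3}
  printf("%s\n", checkThumbLDM(false, false)); // ldmia r3, {r0, r1} -> error
}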
+ case ARM::t2LDMIA_UPD: {
+ if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
+ return Error(Operands[4]->getStartLoc(),
+ "writeback operator '!' not allowed when base register "
+ "in register list");
+ break;
+ }
+ case ARM::tPOP: {
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 3, 0, ARM::PC, listContainsBase))
+ return Error(Operands[2]->getStartLoc(),
+ "registers must be in range r0-r7 or pc");
+ break;
+ }
+ case ARM::tPUSH: {
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 3, 0, ARM::LR, listContainsBase))
+ return Error(Operands[2]->getStartLoc(),
+ "registers must be in range r0-r7 or lr");
+ break;
+ }
+ case ARM::tSTMIA_UPD: {
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 4, 0, 0, listContainsBase) && !isThumbTwo())
+ return Error(Operands[4]->getStartLoc(),
+ "registers must be in range r0-r7");
+ break;
+ }
+ }
+
return false;
}
+void ARMAsmParser::
+processInstruction(MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ switch (Inst.getOpcode()) {
+ case ARM::LDMIA_UPD:
+ // If this is a load of a single register via a 'pop', then we should use
+ // a post-indexed LDR instruction instead, per the ARM ARM.
+ if (static_cast<ARMOperand*>(Operands[0])->getToken() == "pop" &&
+ Inst.getNumOperands() == 5) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::LDR_POST_IMM);
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt
+ TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(MCOperand::CreateReg(0)); // am2offset
+ TmpInst.addOperand(MCOperand::CreateImm(4));
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ }
+ break;
+ case ARM::STMDB_UPD:
+ // If this is a store of a single register via a 'push', then we should use
+ // a pre-indexed STR instruction instead, per the ARM ARM.
+ if (static_cast<ARMOperand*>(Operands[0])->getToken() == "push" &&
+ Inst.getNumOperands() == 5) {
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::STR_PRE_IMM);
+ TmpInst.addOperand(Inst.getOperand(0)); // Rn_wb
+ TmpInst.addOperand(Inst.getOperand(4)); // Rt
+ TmpInst.addOperand(Inst.getOperand(1)); // addrmode_imm12
+ TmpInst.addOperand(MCOperand::CreateImm(-4));
+ TmpInst.addOperand(Inst.getOperand(2)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ }
+ break;
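The net effect of these two rewrites, as a worked example in assembly (illustrative, not parser output):

#include <cstdio>

int main() {
  // Per the ARM ARM, the single-register list forms are canonicalized:
  printf("pop {r3}   ->  ldr r3, [sp], #4\n");   // post-indexed load
  printf("push {r3}  ->  str r3, [sp, #-4]!\n"); // pre-indexed store
}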
+ case ARM::tADDi8:
+ // If the immediate is in the range 0-7, we want tADDi3 iff Rd was
+ // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+ // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+ // to encoding T1 if <Rd> is omitted."
+ if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6)
+ Inst.setOpcode(ARM::tADDi3);
+ break;
+ case ARM::tSUBi8:
+ // If the immediate is in the range 0-7, we want tSUBi3 iff Rd was
+ // explicitly specified. From the ARM ARM: "Encoding T1 is preferred
+ // to encoding T2 if <Rd> is specified and encoding T2 is preferred
+ // to encoding T1 if <Rd> is omitted."
+ if (Inst.getOperand(3).getImm() < 8 && Operands.size() == 6)
+ Inst.setOpcode(ARM::tSUBi3);
+ break;
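A standalone sketch of the narrowing rule shared by the two cases above; the sixth parsed operand is how an explicitly written Rd shows up here:

#include <cstdio>

static const char *pickAddEncoding(unsigned Imm, unsigned NumParsedOps) {
  if (Imm < 8 && NumParsedOps == 6)
    return "tADDi3"; // "adds r1, r2, #3" names Rd -> encoding T1
  return "tADDi8";   // "adds r1, #3" omits Rd     -> encoding T2
}

int main() {
  printf("%s %s\n", pickAddEncoding(3, 6), pickAddEncoding(3, 5));
}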
+ case ARM::tB:
+ // A Thumb conditional branch outside of an IT block is a tBcc.
+ if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock())
+ Inst.setOpcode(ARM::tBcc);
+ break;
+ case ARM::t2B:
+ // A Thumb2 conditional branch outside of an IT block is a t2Bcc.
+ if (Inst.getOperand(1).getImm() != ARMCC::AL && !inITBlock())
+ Inst.setOpcode(ARM::t2Bcc);
+ break;
+ case ARM::t2Bcc:
+ // If the conditional is AL or we're in an IT block, we really want t2B.
+ if (Inst.getOperand(1).getImm() == ARMCC::AL || inITBlock())
+ Inst.setOpcode(ARM::t2B);
+ break;
+ case ARM::tBcc:
+ // If the conditional is AL, we really want tB.
+ if (Inst.getOperand(1).getImm() == ARMCC::AL)
+ Inst.setOpcode(ARM::tB);
+ break;
+ case ARM::tLDMIA: {
+ // If the register list contains any high registers, or if the writeback
+ // doesn't match what tLDMIA can do, we need to use the 32-bit encoding
+ // instead if we're in Thumb2. Otherwise, this should have generated
+ // an error in validateInstruction().
+ unsigned Rn = Inst.getOperand(0).getReg();
+ bool hasWritebackToken =
+ (static_cast<ARMOperand*>(Operands[3])->isToken() &&
+ static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) ||
+ (!listContainsBase && !hasWritebackToken) ||
+ (listContainsBase && hasWritebackToken)) {
+ // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+ assert(isThumbTwo());
+ Inst.setOpcode(hasWritebackToken ? ARM::t2LDMIA_UPD : ARM::t2LDMIA);
+ // If we're switching to the updating version, we need to insert
+ // the writeback tied operand.
+ if (hasWritebackToken)
+ Inst.insert(Inst.begin(),
+ MCOperand::CreateReg(Inst.getOperand(0).getReg()));
+ }
+ break;
+ }
+ case ARM::tSTMIA_UPD: {
+ // If the register list contains any high registers, we need to use
+ // the 32-bit encoding instead if we're in Thumb2. Otherwise, this
+ // should have generated an error in validateInstruction().
+ unsigned Rn = Inst.getOperand(0).getReg();
+ bool listContainsBase;
+ if (checkLowRegisterList(Inst, 4, Rn, 0, listContainsBase)) {
+ // 16-bit encoding isn't sufficient. Switch to the 32-bit version.
+ assert(isThumbTwo());
+ Inst.setOpcode(ARM::t2STMIA_UPD);
+ }
+ break;
+ }
+ case ARM::t2MOVi: {
+ // If we can use the 16-bit encoding and the user didn't explicitly
+ // request the 32-bit variant, transform it here.
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ Inst.getOperand(1).getImm() <= 255 &&
+ ((!inITBlock() && Inst.getOperand(2).getImm() == ARMCC::AL &&
+ Inst.getOperand(4).getReg() == ARM::CPSR) ||
+ (inITBlock() && Inst.getOperand(4).getReg() == 0)) &&
+ (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+ static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+ // The operands aren't in the same order for tMOVi8...
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::tMOVi8);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(4));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ }
+ break;
+ }
+ case ARM::t2MOVr: {
+ // If we can use the 16-bit encoding and the user didn't explicitly
+ // request the 32-bit variant, transform it here.
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ Inst.getOperand(2).getImm() == ARMCC::AL &&
+ Inst.getOperand(4).getReg() == ARM::CPSR &&
+ (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+ static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+ // The operands aren't the same for tMOV[S]r... (no cc_out)
+ MCInst TmpInst;
+ TmpInst.setOpcode(Inst.getOperand(4).getReg() ? ARM::tMOVSr : ARM::tMOVr);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ Inst = TmpInst;
+ }
+ break;
+ }
+ case ARM::t2SXTH:
+ case ARM::t2SXTB:
+ case ARM::t2UXTH:
+ case ARM::t2UXTB: {
+ // If we can use the 16-bit encoding and the user didn't explicitly
+ // request the 32-bit variant, transform it here.
+ if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ Inst.getOperand(2).getImm() == 0 &&
+ (!static_cast<ARMOperand*>(Operands[2])->isToken() ||
+ static_cast<ARMOperand*>(Operands[2])->getToken() != ".w")) {
+ unsigned NewOpc;
+ switch (Inst.getOpcode()) {
+ default: llvm_unreachable("Illegal opcode!");
+ case ARM::t2SXTH: NewOpc = ARM::tSXTH; break;
+ case ARM::t2SXTB: NewOpc = ARM::tSXTB; break;
+ case ARM::t2UXTH: NewOpc = ARM::tUXTH; break;
+ case ARM::t2UXTB: NewOpc = ARM::tUXTB; break;
+ }
+ // The operands aren't the same for thumb1 (no rotate operand).
+ MCInst TmpInst;
+ TmpInst.setOpcode(NewOpc);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ }
+ break;
+ }
+ case ARM::t2IT: {
+ // In the IT encoding, the mask bits for all but the first condition
+ // are relative to the low bit of the condition code: a mask bit equal
+ // to that bit means 't'. The parser built the mask as if '1' always
+ // meant 't', so XOR-toggle the bits above the terminator if the low
+ // bit of the condition code is zero. The encoding also expects the low
+ // bit of the condition to be mirrored as bit 4 of the mask operand, so
+ // OR that in when needed.
+ MCOperand &MO = Inst.getOperand(1);
+ unsigned Mask = MO.getImm();
+ unsigned OrigMask = Mask;
+ unsigned TZ = CountTrailingZeros_32(Mask);
+ if ((Inst.getOperand(0).getImm() & 1) == 0) {
+ assert(Mask && TZ <= 3 && "illegal IT mask value!");
+ for (unsigned i = 3; i != TZ; --i)
+ Mask ^= 1 << i;
+ } else
+ Mask |= 0x10;
+ MO.setImm(Mask);
+
+ // Set up the IT block state according to the IT instruction we just
+ // matched.
+ assert(!inITBlock() && "nested IT blocks?!");
+ ITState.Cond = ARMCC::CondCodes(Inst.getOperand(0).getImm());
+ ITState.Mask = OrigMask; // Use the original mask, not the updated one.
+ ITState.CurPosition = 0;
+ ITState.FirstCond = true;
+ break;
+ }
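A standalone sketch of the fixup above, with __builtin_ctz standing in for CountTrailingZeros_32 (a GCC/Clang builtin, assumed here for self-containment). For "ite eq" the parser built 0b0100; EQ's low bit is 0, so the bit above the terminator toggles and the encoder sees 0b1100.

#include <cassert>
#include <cstdio>

static unsigned fixupITMask(unsigned CondCode, unsigned Mask) {
  assert(Mask && "IT mask must be nonzero");
  unsigned TZ = __builtin_ctz(Mask); // position of the terminating '1'
  if ((CondCode & 1) == 0) {
    assert(TZ <= 3 && "illegal IT mask value!");
    for (unsigned i = 3; i != TZ; --i)
      Mask ^= 1 << i; // toggle every mask bit above the terminator
  } else {
    Mask |= 0x10; // mirror cond bit0 into bit 4 of the mask operand
  }
  return Mask;
}

int main() {
  printf("%#x\n", fixupITMask(/*ARMCC::EQ*/ 0, 0x4)); // prints 0xc
}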
+ }
+}
+
+unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
+ // 16-bit Thumb arithmetic instructions either require or preclude the 'S'
+ // suffix depending on whether they're in an IT block or not.
+ unsigned Opc = Inst.getOpcode();
+ MCInstrDesc &MCID = getInstDesc(Opc);
+ if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
+ assert(MCID.hasOptionalDef() &&
+ "optionally flag setting instruction missing optional def operand");
+ assert(MCID.NumOperands == Inst.getNumOperands() &&
+ "operand count mismatch!");
+ // Find the optional-def operand (cc_out).
+ unsigned OpNo;
+ for (OpNo = 0;
+ OpNo < MCID.NumOperands && !MCID.OpInfo[OpNo].isOptionalDef();
+ ++OpNo)
+ ;
+ // If we're parsing Thumb1, reject it completely.
+ if (isThumbOne() && Inst.getOperand(OpNo).getReg() != ARM::CPSR)
+ return Match_MnemonicFail;
+ // If we're parsing Thumb2, which form is legal depends on whether we're
+ // in an IT block.
+ if (isThumbTwo() && Inst.getOperand(OpNo).getReg() != ARM::CPSR &&
+ !inITBlock())
+ return Match_RequiresITBlock;
+ if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
+ inITBlock())
+ return Match_RequiresNotITBlock;
+ }
+ // Some high-register Thumb1 encodings only allow both registers to be
+ // low (r0-r7) when assembling for Thumb2.
+ else if (Opc == ARM::tADDhirr && isThumbOne() &&
+ isARMLowRegister(Inst.getOperand(1).getReg()) &&
+ isARMLowRegister(Inst.getOperand(2).getReg()))
+ return Match_RequiresThumb2;
+ // Others only require ARMv6 or later.
+ else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() &&
+ isARMLowRegister(Inst.getOperand(0).getReg()) &&
+ isARMLowRegister(Inst.getOperand(1).getReg()))
+ return Match_RequiresV6;
+ return Match_Success;
+}
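The IT-block half of this predicate, isolated as a standalone sketch (Thumb2 only; the Thumb1, tADDhirr, and tMOVr legs are left out):

#include <cstdio>

static const char *matchThumb2Arith(bool setsFlags, bool inITBlock) {
  if (!setsFlags && !inITBlock)
    return "Match_RequiresITBlock";    // e.g. non-S "add" outside an IT block
  if (setsFlags && inITBlock)
    return "Match_RequiresNotITBlock"; // e.g. "adds" inside an IT block
  return "Match_Success";
}

int main() {
  printf("%s\n", matchThumb2Arith(true, false));  // "adds r0, r0, r1"
  printf("%s\n", matchThumb2Arith(false, true));  // "it eq" then "addeq ..."
}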
+
bool ARMAsmParser::
MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out) {
MCInst Inst;
unsigned ErrorInfo;
- MatchResultTy MatchResult, MatchResult2;
+ unsigned MatchResult;
MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo);
- if (MatchResult != Match_Success) {
- // If we get a Match_InvalidOperand it might be some arithmetic instruction
- // that does not update the condition codes. So try adding a CCOut operand
- // with a value of reg0.
- if (MatchResult == Match_InvalidOperand) {
- Operands.insert(Operands.begin() + 1,
- ARMOperand::CreateCCOut(0,
- ((ARMOperand*)Operands[0])->getStartLoc()));
- MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo);
- if (MatchResult2 == Match_Success)
- MatchResult = Match_Success;
- else {
- ARMOperand *CCOut = ((ARMOperand*)Operands[1]);
- Operands.erase(Operands.begin() + 1);
- delete CCOut;
- }
- }
- // If we get a Match_MnemonicFail it might be some arithmetic instruction
- // that updates the condition codes if it ends in 's'. So see if the
- // mnemonic ends in 's' and if so try removing the 's' and adding a CCOut
- // operand with a value of CPSR.
- else if (MatchResult == Match_MnemonicFail) {
- // Get the instruction mnemonic, which is the first token.
- StringRef Mnemonic = ((ARMOperand*)Operands[0])->getToken();
- if (Mnemonic.substr(Mnemonic.size()-1) == "s") {
- // removed the 's' from the mnemonic for matching.
- StringRef MnemonicNoS = Mnemonic.slice(0, Mnemonic.size() - 1);
- SMLoc NameLoc = ((ARMOperand*)Operands[0])->getStartLoc();
- ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]);
- Operands.erase(Operands.begin());
- delete OldMnemonic;
- Operands.insert(Operands.begin(),
- ARMOperand::CreateToken(MnemonicNoS, NameLoc));
- Operands.insert(Operands.begin() + 1,
- ARMOperand::CreateCCOut(ARM::CPSR, NameLoc));
- MatchResult2 = MatchInstructionImpl(Operands, Inst, ErrorInfo);
- if (MatchResult2 == Match_Success)
- MatchResult = Match_Success;
- else {
- ARMOperand *OldMnemonic = ((ARMOperand*)Operands[0]);
- Operands.erase(Operands.begin());
- delete OldMnemonic;
- Operands.insert(Operands.begin(),
- ARMOperand::CreateToken(Mnemonic, NameLoc));
- ARMOperand *CCOut = ((ARMOperand*)Operands[1]);
- Operands.erase(Operands.begin() + 1);
- delete CCOut;
- }
- }
- }
- }
switch (MatchResult) {
+ default: break;
case Match_Success:
+ // Context-sensitive operand constraints aren't handled by the matcher,
+ // so check them here.
+ if (validateInstruction(Inst, Operands)) {
+ // Still progress the IT block, otherwise one wrong condition causes
+ // nasty cascading errors.
+ forwardITPosition();
+ return true;
+ }
+
+ // Some instructions need post-processing to, for example, tweak which
+ // encoding is selected.
+ processInstruction(Inst, Operands);
+
+ // Only move forward at the very end so that everything in validate
+ // and process gets a consistent answer about whether we're in an IT
+ // block.
+ forwardITPosition();
+
Out.EmitInstruction(Inst);
return false;
case Match_MissingFeature:
@@ -2227,34 +4443,43 @@ MatchAndEmitInstruction(SMLoc IDLoc,
return Error(ErrorLoc, "invalid operand for instruction");
}
case Match_MnemonicFail:
- return Error(IDLoc, "unrecognized instruction mnemonic");
+ return Error(IDLoc, "invalid instruction");
case Match_ConversionFail:
- return Error(IDLoc, "unable to convert operands to instruction");
+ // The converter function will have already emitted a diagnostic.
+ return true;
+ case Match_RequiresNotITBlock:
+ return Error(IDLoc, "flag setting instruction only valid outside IT block");
+ case Match_RequiresITBlock:
+ return Error(IDLoc, "instruction only valid inside IT block");
+ case Match_RequiresV6:
+ return Error(IDLoc, "instruction variant requires ARMv6 or later");
+ case Match_RequiresThumb2:
+ return Error(IDLoc, "instruction variant requires Thumb2");
}
llvm_unreachable("Implement any new match types added!");
return true;
}
-/// ParseDirective parses the arm specific directives
+/// ParseDirective parses the ARM-specific directives
bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
if (IDVal == ".word")
- return ParseDirectiveWord(4, DirectiveID.getLoc());
+ return parseDirectiveWord(4, DirectiveID.getLoc());
else if (IDVal == ".thumb")
- return ParseDirectiveThumb(DirectiveID.getLoc());
+ return parseDirectiveThumb(DirectiveID.getLoc());
else if (IDVal == ".thumb_func")
- return ParseDirectiveThumbFunc(DirectiveID.getLoc());
+ return parseDirectiveThumbFunc(DirectiveID.getLoc());
else if (IDVal == ".code")
- return ParseDirectiveCode(DirectiveID.getLoc());
+ return parseDirectiveCode(DirectiveID.getLoc());
else if (IDVal == ".syntax")
- return ParseDirectiveSyntax(DirectiveID.getLoc());
+ return parseDirectiveSyntax(DirectiveID.getLoc());
return true;
}
-/// ParseDirectiveWord
+/// parseDirectiveWord
/// ::= .word [ expression (, expression)* ]
-bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+bool ARMAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
@@ -2277,9 +4502,9 @@ bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
return false;
}
-/// ParseDirectiveThumb
+/// parseDirectiveThumb
/// ::= .thumb
-bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
+bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(L, "unexpected token in directive");
Parser.Lex();
@@ -2290,9 +4515,9 @@ bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
return false;
}
-/// ParseDirectiveThumbFunc
+/// parseDirectiveThumbFunc
/// ::= .thumb_func symbol_name
-bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
+bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo();
bool isMachO = MAI.hasSubsectionsViaSymbols();
StringRef Name;
@@ -2322,9 +4547,9 @@ bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
return false;
}
-/// ParseDirectiveSyntax
+/// parseDirectiveSyntax
/// ::= .syntax unified | divided
-bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
+bool ARMAsmParser::parseDirectiveSyntax(SMLoc L) {
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
return Error(L, "unexpected token in .syntax directive");
@@ -2345,9 +4570,9 @@ bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
return false;
}
-/// ParseDirectiveCode
+/// parseDirectiveCode
/// ::= .code 16 | 32
-bool ARMAsmParser::ParseDirectiveCode(SMLoc L) {
+bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Integer))
return Error(L, "unexpected token in .code directive");
@@ -2380,8 +4605,8 @@ extern "C" void LLVMInitializeARMAsmLexer();
/// Force static initialization.
extern "C" void LLVMInitializeARMAsmParser() {
- RegisterAsmParser<ARMAsmParser> X(TheARMTarget);
- RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget);
+ RegisterMCAsmParser<ARMAsmParser> X(TheARMTarget);
+ RegisterMCAsmParser<ARMAsmParser> Y(TheThumbTarget);
LLVMInitializeARMAsmLexer();
}
diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt
index 9ba7c01..3f5ad39 100644
--- a/lib/Target/ARM/AsmParser/CMakeLists.txt
+++ b/lib/Target/ARM/AsmParser/CMakeLists.txt
@@ -5,3 +5,12 @@ add_llvm_library(LLVMARMAsmParser
ARMAsmParser.cpp
)
+add_dependencies(LLVMARMAsmParser ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMAsmParser
+ LLVMARMDesc
+ LLVMARMInfo
+ LLVMMC
+ LLVMMCParser
+ LLVMSupport
+ )
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 21608d0..f045e83 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -1,21 +1,21 @@
set(LLVM_TARGET_DEFINITIONS ARM.td)
-tablegen(ARMGenRegisterInfo.inc -gen-register-info)
-tablegen(ARMGenInstrInfo.inc -gen-instr-info)
-tablegen(ARMGenCodeEmitter.inc -gen-emitter)
-tablegen(ARMGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
-tablegen(ARMGenMCPseudoLowering.inc -gen-pseudo-lowering)
-tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
-tablegen(ARMGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(ARMGenDAGISel.inc -gen-dag-isel)
-tablegen(ARMGenFastISel.inc -gen-fast-isel)
-tablegen(ARMGenCallingConv.inc -gen-callingconv)
-tablegen(ARMGenSubtargetInfo.inc -gen-subtarget)
-tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info)
-tablegen(ARMGenDecoderTables.inc -gen-arm-decoder)
+llvm_tablegen(ARMGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(ARMGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(ARMGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(ARMGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+llvm_tablegen(ARMGenMCPseudoLowering.inc -gen-pseudo-lowering)
+llvm_tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(ARMGenAsmMatcher.inc -gen-asm-matcher)
+llvm_tablegen(ARMGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(ARMGenFastISel.inc -gen-fast-isel)
+llvm_tablegen(ARMGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(ARMGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(ARMGenEDInfo.inc -gen-enhanced-disassembly-info)
+llvm_tablegen(ARMGenDisassemblerTables.inc -gen-disassembler)
+add_public_tablegen_target(ARMCommonTableGen)
add_llvm_target(ARMCodeGen
- ARMAsmBackend.cpp
ARMAsmPrinter.cpp
ARMBaseInstrInfo.cpp
ARMBaseRegisterInfo.cpp
@@ -32,9 +32,6 @@ add_llvm_target(ARMCodeGen
ARMISelLowering.cpp
ARMInstrInfo.cpp
ARMJITInfo.cpp
- ARMMachObjectWriter.cpp
- ARMMCCodeEmitter.cpp
- ARMMCExpr.cpp
ARMLoadStoreOptimizer.cpp
ARMMCInstLower.cpp
ARMRegisterInfo.cpp
@@ -43,9 +40,8 @@ add_llvm_target(ARMCodeGen
ARMTargetMachine.cpp
ARMTargetObjectFile.cpp
MLxExpansionPass.cpp
- NEONMoveFix.cpp
- Thumb1InstrInfo.cpp
Thumb1FrameLowering.cpp
+ Thumb1InstrInfo.cpp
Thumb1RegisterInfo.cpp
Thumb2ITBlockPass.cpp
Thumb2InstrInfo.cpp
@@ -53,6 +49,20 @@ add_llvm_target(ARMCodeGen
Thumb2SizeReduction.cpp
)
+add_llvm_library_dependencies(LLVMARMCodeGen
+ LLVMARMAsmPrinter
+ LLVMARMDesc
+ LLVMARMInfo
+ LLVMAnalysis
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
# workaround for hanging compilation on MSVC10
if( MSVC_VERSION EQUAL 1600 )
set_property(
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index bdce2c4..8f2f813 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -6,584 +6,4077 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains code to implement the public interfaces of ARMDisassembler and
-// ThumbDisassembler, both of which are instances of MCDisassembler.
-//
-//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "arm-disassembler"
-#include "ARMDisassembler.h"
-#include "ARMDisassemblerCore.h"
-
-#include "llvm/ADT/OwningPtr.h"
+#include "ARM.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
#include "llvm/MC/EDInstInfo.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-//#define DEBUG(X) do { X; } while (0)
-
-/// ARMGenDecoderTables.inc - ARMDecoderTables.inc is tblgen'ed from
-/// ARMDecoderEmitter.cpp TableGen backend. It contains:
-///
-/// o Mappings from opcode to ARM/Thumb instruction format
-///
-/// o static uint16_t decodeInstruction(uint32_t insn) - the decoding function
-/// for an ARM instruction.
-///
-/// o static uint16_t decodeThumbInstruction(field_t insn) - the decoding
-/// function for a Thumb instruction.
-///
-#include "ARMGenDecoderTables.inc"
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+namespace {
+/// ARMDisassembler - ARM disassembler for all ARM platforms.
+class ARMDisassembler : public MCDisassembler {
+public:
+ /// Constructor - Initializes the disassembler.
+ ///
+ ARMDisassembler(const MCSubtargetInfo &STI) :
+ MCDisassembler(STI) {
+ }
+
+ ~ARMDisassembler() {
+ }
+
+ /// getInstruction - See MCDisassembler.
+ DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
+
+ /// getEDInfo - See MCDisassembler.
+ EDInstInfo *getEDInfo() const;
+private:
+};
+
+/// ThumbDisassembler - Thumb disassembler for all Thumb platforms.
+class ThumbDisassembler : public MCDisassembler {
+public:
+ /// Constructor - Initializes the disassembler.
+ ///
+ ThumbDisassembler(const MCSubtargetInfo &STI) :
+ MCDisassembler(STI) {
+ }
+
+ ~ThumbDisassembler() {
+ }
+
+ /// getInstruction - See MCDisassembler.
+ DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
+
+ /// getEDInfo - See MCDisassembler.
+ EDInstInfo *getEDInfo() const;
+private:
+ mutable std::vector<unsigned> ITBlock;
+ DecodeStatus AddThumbPredicate(MCInst&) const;
+ void UpdateThumbVFPPredicate(MCInst&) const;
+};
+}
+
+static bool Check(DecodeStatus &Out, DecodeStatus In) {
+ switch (In) {
+ case MCDisassembler::Success:
+ // Out stays the same.
+ return true;
+ case MCDisassembler::SoftFail:
+ Out = In;
+ return true;
+ case MCDisassembler::Fail:
+ Out = In;
+ return false;
+ }
+ return false;
+}
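Intended usage of Check(), as a standalone sketch; the enum values are assumed to mirror MCDisassembler's (Fail < SoftFail < Success), so a SoftFail can degrade the running status without aborting the decode:

#include <cstdio>

enum DecodeStatus { Fail = 0, SoftFail = 1, Success = 3 };

static bool Check(DecodeStatus &Out, DecodeStatus In) {
  if (In == Fail) { Out = In; return false; } // hard failure: stop decoding
  if (In == SoftFail) Out = In;               // degrade, but keep going
  return true;                                // Success leaves Out unchanged
}

int main() {
  DecodeStatus S = Success;
  if (!Check(S, Success) || !Check(S, SoftFail) || !Check(S, Success))
    return 1;
  printf("%d\n", S); // 1 (SoftFail): decoded, but flagged UNPREDICTABLE
}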
+
+
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
+static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRnopcRegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode3Instruction(llvm::MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst & Inst,
+ unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBranchImmInstruction(llvm::MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeNEONModImmInstruction(llvm::MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBCCTargetOperand(llvm::MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LDRDPreInstruction(llvm::MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2STRDPreInstruction(llvm::MCInst &Inst,unsigned Insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder);
+
+#include "ARMGenDisassemblerTables.inc"
+#include "ARMGenInstrInfo.inc"
#include "ARMGenEDInfo.inc"
-using namespace llvm;
+static MCDisassembler *createARMDisassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new ARMDisassembler(STI);
+}
+
+static MCDisassembler *createThumbDisassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new ThumbDisassembler(STI);
+}
-/// showBitVector - Use the raw_ostream to log a diagnostic message describing
-/// the inidividual bits of the instruction.
-///
-static inline void showBitVector(raw_ostream &os, const uint32_t &insn) {
- // Split the bit position markers into more than one lines to fit 80 columns.
- os << " 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11"
- << " 10 9 8 7 6 5 4 3 2 1 0 \n";
- os << "---------------------------------------------------------------"
- << "----------------------------------\n";
- os << '|';
- for (unsigned i = 32; i != 0; --i) {
- if (insn >> (i - 1) & 0x01)
- os << " 1";
+EDInstInfo *ARMDisassembler::getEDInfo() const {
+ return instInfoARM;
+}
+
+EDInstInfo *ThumbDisassembler::getEDInfo() const {
+ return instInfoARM;
+}
+
+DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ const MemoryObject &Region,
+ uint64_t Address,
+ raw_ostream &os,
+ raw_ostream &cs) const {
+ CommentStream = &cs;
+
+ uint8_t bytes[4];
+
+ assert(!(STI.getFeatureBits() & ARM::ModeThumb) &&
+ "Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!");
+
+ // We want to read exactly 4 bytes of data.
+ if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ // Encoded as a little-endian 32-bit word in the stream.
+ uint32_t insn = (bytes[3] << 24) |
+ (bytes[2] << 16) |
+ (bytes[1] << 8) |
+ (bytes[0] << 0);
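+ // e.g. the byte sequence FE FF FF EA assembles to insn = 0xEAFFFFFE, the A1
+ // encoding of "b ." (an unconditional branch-to-self).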
+
+ // Call the auto-generated decoder function.
+ DecodeStatus result = decodeARMInstruction32(MI, insn, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ return result;
+ }
+
+ // VFP and NEON instructions are likewise shared between ARM and Thumb
+ // modes, so try those decoder tables next.
+ MI.clear();
+ result = decodeVFPInstruction32(MI, insn, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ return result;
+ }
+
+ MI.clear();
+ result = decodeNEONDataInstruction32(MI, insn, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ // Add a fake predicate operand, because we share these instruction
+ // definitions with Thumb2 where these instructions are predicable.
+ if (!DecodePredicateOperand(MI, 0xE, Address, this))
+ return MCDisassembler::Fail;
+ return result;
+ }
+
+ MI.clear();
+ result = decodeNEONLoadStoreInstruction32(MI, insn, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ // Add a fake predicate operand, because we share these instruction
+ // definitions with Thumb2 where these instructions are predicable.
+ if (!DecodePredicateOperand(MI, 0xE, Address, this))
+ return MCDisassembler::Fail;
+ return result;
+ }
+
+ MI.clear();
+ result = decodeNEONDupInstruction32(MI, insn, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ // Add a fake predicate operand, because we share these instruction
+ // definitions with Thumb2 where these instructions are predicable.
+ if (!DecodePredicateOperand(MI, 0xE, Address, this))
+ return MCDisassembler::Fail;
+ return result;
+ }
+
+ MI.clear();
+
+ Size = 0;
+ return MCDisassembler::Fail;
+}
+
+namespace llvm {
+extern MCInstrDesc ARMInsts[];
+}
+
+/// tryAddingSymbolicOperand - tries to add a symbolic operand in place of the
+/// immediate Value in the MCInst. The immediate Value has already had any PC
+/// adjustment made by the caller. isBranch is true if the instruction is a
+/// branch, false otherwise. If a getOpInfo() function was set up by the
+/// setupForSymbolicDisassembly() call, it is called to get any symbolic
+/// information at the Address for this instruction. If it returns non-zero,
+/// the symbolic information is used to create an MCExpr, which is added as an
+/// operand to the MCInst. If getOpInfo() returns zero and isBranch is true, a
+/// symbol lookup for Value is done; if a symbol is found, an MCExpr is
+/// created with it, otherwise an MCExpr with Value is created. Returns true
+/// if it adds an operand to the MCInst and false otherwise.
+static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
+ bool isBranch, uint64_t InstSize,
+ MCInst &MI, const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
+ if (!getOpInfo)
+ return false;
+
+ struct LLVMOpInfo1 SymbolicOp;
+ SymbolicOp.Value = Value;
+ void *DisInfo = Dis->getDisInfoBlock();
+ if (!getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+ if (isBranch) {
+ LLVMSymbolLookupCallback SymbolLookUp =
+ Dis->getLLVMSymbolLookupCallback();
+ if (SymbolLookUp) {
+ uint64_t ReferenceType;
+ ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+ const char *ReferenceName;
+ const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+ &ReferenceName);
+ if (Name) {
+ SymbolicOp.AddSymbol.Name = Name;
+ SymbolicOp.AddSymbol.Present = true;
+ SymbolicOp.Value = 0;
+ }
+ else {
+ SymbolicOp.Value = Value;
+ }
+ if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+ (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
+ }
+ else {
+ return false;
+ }
+ }
+ else {
+ return false;
+ }
+ }
+
+ MCContext *Ctx = Dis->getMCContext();
+ const MCExpr *Add = NULL;
+ if (SymbolicOp.AddSymbol.Present) {
+ if (SymbolicOp.AddSymbol.Name) {
+ StringRef Name(SymbolicOp.AddSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Add = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Sub = NULL;
+ if (SymbolicOp.SubtractSymbol.Present) {
+ if (SymbolicOp.SubtractSymbol.Name) {
+ StringRef Name(SymbolicOp.SubtractSymbol.Name);
+ MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
+ Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
+ } else {
+ Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
+ }
+ }
+
+ const MCExpr *Off = NULL;
+ if (SymbolicOp.Value != 0)
+ Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
+
+ const MCExpr *Expr;
+ if (Sub) {
+ const MCExpr *LHS;
+ if (Add)
+ LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
else
- os << " 0";
- os << (i%4 == 1 ? '|' : ':');
- }
- os << '\n';
- // Split the bit position markers into more than one lines to fit 80 columns.
- os << "---------------------------------------------------------------"
- << "----------------------------------\n";
- os << '\n';
-}
-
-/// decodeARMInstruction is a decorator function which tries special cases of
-/// instruction matching before calling the auto-generated decoder function.
-static unsigned decodeARMInstruction(uint32_t &insn) {
- if (slice(insn, 31, 28) == 15)
- goto AutoGenedDecoder;
-
- // Special case processing, if any, goes here....
-
- // LLVM combines the offset mode of A8.6.197 & A8.6.198 into STRB.
- // The insufficient encoding information of the combined instruction confuses
- // the decoder wrt BFC/BFI. Therefore, we try to recover here.
- // For BFC, Inst{27-21} = 0b0111110 & Inst{6-0} = 0b0011111.
- // For BFI, Inst{27-21} = 0b0111110 & Inst{6-4} = 0b001 & Inst{3-0} =! 0b1111.
- if (slice(insn, 27, 21) == 0x3e && slice(insn, 6, 4) == 1) {
- if (slice(insn, 3, 0) == 15)
- return ARM::BFC;
+ LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
else
- return ARM::BFI;
- }
-
- // Ditto for STRBT, which is a super-instruction for A8.6.199 Encodings
- // A1 & A2.
- // As a result, the decoder fails to deocode USAT properly.
- if (slice(insn, 27, 21) == 0x37 && slice(insn, 5, 4) == 1)
- return ARM::USAT;
- // As a result, the decoder fails to deocode UQADD16 properly.
- if (slice(insn, 27, 20) == 0x66 && slice(insn, 7, 4) == 1)
- return ARM::UQADD16;
-
- // Ditto for ADDSrs, which is a super-instruction for A8.6.7 & A8.6.8.
- // As a result, the decoder fails to decode UMULL properly.
- if (slice(insn, 27, 21) == 0x04 && slice(insn, 7, 4) == 9) {
- return ARM::UMULL;
- }
-
- // Ditto for STR_PRE, which is a super-instruction for A8.6.194 & A8.6.195.
- // As a result, the decoder fails to decode SBFX properly.
- if (slice(insn, 27, 21) == 0x3d && slice(insn, 6, 4) == 5)
- return ARM::SBFX;
-
- // And STRB_PRE, which is a super-instruction for A8.6.197 & A8.6.198.
- // As a result, the decoder fails to decode UBFX properly.
- if (slice(insn, 27, 21) == 0x3f && slice(insn, 6, 4) == 5)
- return ARM::UBFX;
-
- // Ditto for STRT, which is a super-instruction for A8.6.210 Encoding A1 & A2.
- // As a result, the decoder fails to deocode SSAT properly.
- if (slice(insn, 27, 21) == 0x35 && slice(insn, 5, 4) == 1)
- return ARM::SSAT;
-
- // Ditto for RSCrs, which is a super-instruction for A8.6.146 & A8.6.147.
- // As a result, the decoder fails to decode STRHT/LDRHT/LDRSHT/LDRSBT.
- if (slice(insn, 27, 24) == 0) {
- switch (slice(insn, 21, 20)) {
- case 2:
- switch (slice(insn, 7, 4)) {
- case 11:
- return ARM::STRHT;
- default:
- break; // fallthrough
+ Expr = LHS;
+ } else if (Add) {
+ if (Off != 0)
+ Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
+ else
+ Expr = Add;
+ } else {
+ if (Off != 0)
+ Expr = Off;
+ else
+ Expr = MCConstantExpr::Create(0, *Ctx);
+ }
+
+ if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
+ MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
+ else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
+ MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
+ else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
+ MI.addOperand(MCOperand::CreateExpr(Expr));
+ else
+ assert(0 && "bad SymbolicOp.VariantKind");
+
+ return true;
+}
+
+/// tryAddingPcLoadReferenceComment - tries to add a comment describing what
+/// is referenced by a load instruction whose base register is the PC. These
+/// are often values in a literal pool near the Address of the instruction.
+/// The Address of the instruction and its immediate Value are used to form a
+/// possible literal pool entry address. The SymbolLookUp callback will return
+/// the name of a symbol referenced by the literal pool's entry if the
+/// referenced address is that of a symbol, or a pointer to a literal 'C'
+/// string if the referenced address of the literal pool's entry is an address
+/// into a section with 'C' string literals.
+static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
+ const void *Decoder) {
+ const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
+ LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
+ if (SymbolLookUp) {
+ void *DisInfo = Dis->getDisInfoBlock();
+ uint64_t ReferenceType;
+ ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+ const char *ReferenceName;
+ (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+ if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr ||
+ ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+ (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
+ }
+}
+
+// Thumb1 instructions don't have explicit S bits. Rather, they
+// implicitly set CPSR. Since it's not represented in the encoding, the
+// auto-generated decoder won't inject the CPSR operand. We need to fix
+// that as a post-pass.
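+// For example (illustrative): the 16-bit encoding of "adds r0, r1, r2"
+// carries no S bit, so outside an IT block this pass inserts an implicit
+// CPSR def; inside an IT block the slot is filled with register 0 (no flag
+// update).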
+static void AddThumb1SBit(MCInst &MI, bool InITBlock) {
+ const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+ unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+ MCInst::iterator I = MI.begin();
+ for (unsigned i = 0; i < NumOps; ++i, ++I) {
+ if (I == MI.end()) break;
+ if (OpInfo[i].isOptionalDef() && OpInfo[i].RegClass == ARM::CCRRegClassID) {
+ if (i > 0 && OpInfo[i-1].isPredicate()) continue;
+ MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+ return;
+ }
+ }
+
+ MI.insert(I, MCOperand::CreateReg(InITBlock ? 0 : ARM::CPSR));
+}
+
+// Most Thumb instructions don't have explicit predicates in the
+// encoding, but rather get their predicates from IT context. We need
+// to fix up the predicate operands using this context information as a
+// post-pass.
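+// For example, after an "ITTE EQ" the next two instructions receive an EQ
+// predicate and the third an NE predicate.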
+MCDisassembler::DecodeStatus
+ThumbDisassembler::AddThumbPredicate(MCInst &MI) const {
+ MCDisassembler::DecodeStatus S = Success;
+
+ // A few instructions actually have predicates encoded in them. Don't
+ // try to overwrite it if we're seeing one of those.
+ switch (MI.getOpcode()) {
+ case ARM::tBcc:
+ case ARM::t2Bcc:
+ case ARM::tCBZ:
+ case ARM::tCBNZ:
+ case ARM::tCPS:
+ case ARM::t2CPS3p:
+ case ARM::t2CPS2p:
+ case ARM::t2CPS1p:
+ case ARM::tMOVSr:
+ case ARM::tSETEND:
+ // Some instructions (mostly conditional branches) are not
+ // allowed in IT blocks.
+ if (!ITBlock.empty())
+ S = SoftFail;
+ else
+ return Success;
+ break;
+ case ARM::tB:
+ case ARM::t2B:
+ case ARM::t2TBB:
+ case ARM::t2TBH:
+ // Some instructions (mostly unconditional branches) can
+ // only appear at the end of, or outside of, an IT block.
+ if (ITBlock.size() > 1)
+ S = SoftFail;
+ break;
+ default:
+ break;
+ }
+
+ // If we're in an IT block, base the predicate on that. Otherwise,
+ // assume a predicate of AL.
+ unsigned CC;
+ if (!ITBlock.empty()) {
+ CC = ITBlock.back();
+ if (CC == 0xF)
+ CC = ARMCC::AL;
+ ITBlock.pop_back();
+ } else
+ CC = ARMCC::AL;
+
+ const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+ unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+ MCInst::iterator I = MI.begin();
+ for (unsigned i = 0; i < NumOps; ++i, ++I) {
+ if (I == MI.end()) break;
+ if (OpInfo[i].isPredicate()) {
+ I = MI.insert(I, MCOperand::CreateImm(CC));
+ ++I;
+ if (CC == ARMCC::AL)
+ MI.insert(I, MCOperand::CreateReg(0));
+ else
+ MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+ return S;
+ }
+ }
+
+ I = MI.insert(I, MCOperand::CreateImm(CC));
+ ++I;
+ if (CC == ARMCC::AL)
+ MI.insert(I, MCOperand::CreateReg(0));
+ else
+ MI.insert(I, MCOperand::CreateReg(ARM::CPSR));
+
+ return S;
+}
+
+// Thumb VFP instructions are a special case. Because we share their
+// encodings between ARM and Thumb modes, and they are predicable in ARM
+// mode, the auto-generated decoder will give them an (incorrect)
+// predicate operand. We need to rewrite these operands based on the IT
+// context as a post-pass.
+void ThumbDisassembler::UpdateThumbVFPPredicate(MCInst &MI) const {
+ unsigned CC;
+ if (!ITBlock.empty()) {
+ CC = ITBlock.back();
+ ITBlock.pop_back();
+ } else
+ CC = ARMCC::AL;
+
+ const MCOperandInfo *OpInfo = ARMInsts[MI.getOpcode()].OpInfo;
+ MCInst::iterator I = MI.begin();
+ unsigned short NumOps = ARMInsts[MI.getOpcode()].NumOperands;
+ for (unsigned i = 0; i < NumOps; ++i, ++I) {
+ if (OpInfo[i].isPredicate()) {
+ I->setImm(CC);
+ ++I;
+ if (CC == ARMCC::AL)
+ I->setReg(0);
+ else
+ I->setReg(ARM::CPSR);
+ return;
+ }
+ }
+}
+
+DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+ const MemoryObject &Region,
+ uint64_t Address,
+ raw_ostream &os,
+ raw_ostream &cs) const {
+ CommentStream = &cs;
+
+ uint8_t bytes[4];
+
+ assert((STI.getFeatureBits() & ARM::ModeThumb) &&
+ "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
+
+ // We want to read exactly 2 bytes of data.
+ if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ uint16_t insn16 = (bytes[1] << 8) | bytes[0];
+ DecodeStatus result = decodeThumbInstruction16(MI, insn16, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 2;
+ Check(result, AddThumbPredicate(MI));
+ return result;
+ }
+
+ MI.clear();
+ result = decodeThumbSBitInstruction16(MI, insn16, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 2;
+ bool InITBlock = !ITBlock.empty();
+ Check(result, AddThumbPredicate(MI));
+ AddThumb1SBit(MI, InITBlock);
+ return result;
+ }
+
+ MI.clear();
+ result = decodeThumb2Instruction16(MI, insn16, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 2;
+
+ // Nested IT blocks are UNPREDICTABLE. Must be checked before we add
+ // the Thumb predicate.
+ if (MI.getOpcode() == ARM::t2IT && !ITBlock.empty())
+ result = MCDisassembler::SoftFail;
+
+ Check(result, AddThumbPredicate(MI));
+
+ // If we find an IT instruction, we need to parse its condition
+ // code and mask operands so that we can apply them correctly
+ // to the subsequent instructions.
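+ // e.g. "ITTE EQ" encodes firstcond = 0b0000 (EQ) with mask = 0b0110; the
+ // loop below queues the predicates for the trailing 'then'/'else' slots.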
+ if (MI.getOpcode() == ARM::t2IT) {
+
+ // (3 - the number of trailing zeros in the mask) is the number of
+ // 'then'/'else' instructions in the IT block.
+ unsigned firstcond = MI.getOperand(0).getImm();
+ unsigned Mask = MI.getOperand(1).getImm();
+ unsigned CondBit0 = Mask >> 4 & 1;
+ unsigned NumTZ = CountTrailingZeros_32(Mask);
+ assert(NumTZ <= 3 && "Invalid IT mask!");
+ for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+ bool T = ((Mask >> Pos) & 1) == CondBit0;
+ if (T)
+ ITBlock.insert(ITBlock.begin(), firstcond);
+ else
+ ITBlock.insert(ITBlock.begin(), firstcond ^ 1);
}
+
+ ITBlock.push_back(firstcond);
+ }
+
+ return result;
+ }
+
+ // We want to read exactly 4 bytes of data.
+ if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
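+ // A Thumb2 instruction is two little-endian halfwords with the first
+ // halfword in the most-significant position; e.g. the byte stream
+ // F0 F7 00 F8 assembles to insn32 = 0xF7F0F800.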
+ uint32_t insn32 = (bytes[3] << 8) |
+ (bytes[2] << 0) |
+ (bytes[1] << 24) |
+ (bytes[0] << 16);
+ MI.clear();
+ result = decodeThumbInstruction32(MI, insn32, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ bool InITBlock = !ITBlock.empty();
+ Check(result, AddThumbPredicate(MI));
+ AddThumb1SBit(MI, InITBlock);
+ return result;
+ }
+
+ MI.clear();
+ result = decodeThumb2Instruction32(MI, insn32, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(result, AddThumbPredicate(MI));
+ return result;
+ }
+
+ MI.clear();
+ result = decodeVFPInstruction32(MI, insn32, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ UpdateThumbVFPPredicate(MI);
+ return result;
+ }
+
+ MI.clear();
+ result = decodeNEONDupInstruction32(MI, insn32, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(result, AddThumbPredicate(MI));
+ return result;
+ }
+
+ if (fieldFromInstruction32(insn32, 24, 8) == 0xF9) {
+ MI.clear();
+ uint32_t NEONLdStInsn = insn32;
+ NEONLdStInsn &= 0xF0FFFFFF; // Clear bits 27-24.
+ NEONLdStInsn |= 0x04000000; // Set bit 26 (bits 27-24 become 0b0100).
+ result = decodeNEONLoadStoreInstruction32(MI, NEONLdStInsn, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(result, AddThumbPredicate(MI));
+ return result;
+ }
+ }
+
+ if (fieldFromInstruction32(insn32, 24, 4) == 0xF) {
+ MI.clear();
+ uint32_t NEONDataInsn = insn32;
+ NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24
+ NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+ NEONDataInsn |= 0x12000000; // Set bits 28 and 25
+ result = decodeNEONDataInstruction32(MI, NEONDataInsn, Address, this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 4;
+ Check(result, AddThumbPredicate(MI));
+ return result;
+ }
+ }
+
+ Size = 0;
+ return MCDisassembler::Fail;
+}
+
+extern "C" void LLVMInitializeARMDisassembler() {
+ TargetRegistry::RegisterMCDisassembler(TheARMTarget,
+ createARMDisassembler);
+ TargetRegistry::RegisterMCDisassembler(TheThumbTarget,
+ createThumbDisassembler);
+}
+
+static const unsigned GPRDecoderTable[] = {
+ ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+ ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+ ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+ ARM::R12, ARM::SP, ARM::LR, ARM::PC
+};
+
+static DecodeStatus DecodeGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 15)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeGPRnopcRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo == 15) return MCDisassembler::Fail;
+ return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodetGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 7)
+ return MCDisassembler::Fail;
+ return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodetcGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ unsigned Register = 0;
+ switch (RegNo) {
+ case 0:
+ Register = ARM::R0;
+ break;
+ case 1:
+ Register = ARM::R1;
+ break;
+ case 2:
+ Register = ARM::R2;
break;
case 3:
- switch (slice(insn, 7, 4)) {
- case 11:
- return ARM::LDRHT;
- case 13:
- return ARM::LDRSBT;
- case 15:
- return ARM::LDRSHT;
- default:
- break; // fallthrough
- }
+ Register = ARM::R3;
+ break;
+ case 9:
+ Register = ARM::R9;
+ break;
+ case 12:
+ Register = ARM::R12;
break;
default:
- break; // fallthrough
+ return MCDisassembler::Fail;
}
+
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecoderGPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo == 13 || RegNo == 15) return MCDisassembler::Fail;
+ return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const unsigned SPRDecoderTable[] = {
+ ARM::S0, ARM::S1, ARM::S2, ARM::S3,
+ ARM::S4, ARM::S5, ARM::S6, ARM::S7,
+ ARM::S8, ARM::S9, ARM::S10, ARM::S11,
+ ARM::S12, ARM::S13, ARM::S14, ARM::S15,
+ ARM::S16, ARM::S17, ARM::S18, ARM::S19,
+ ARM::S20, ARM::S21, ARM::S22, ARM::S23,
+ ARM::S24, ARM::S25, ARM::S26, ARM::S27,
+ ARM::S28, ARM::S29, ARM::S30, ARM::S31
+};
+
+static DecodeStatus DecodeSPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Register = SPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return MCDisassembler::Success;
+}
+
+static const unsigned DPRDecoderTable[] = {
+ ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+ ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+ ARM::D8, ARM::D9, ARM::D10, ARM::D11,
+ ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+ ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+ ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+ ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+ ARM::D28, ARM::D29, ARM::D30, ARM::D31
+};
+
+static DecodeStatus DecodeDPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Register = DPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeDPR_8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 7)
+ return MCDisassembler::Fail;
+ return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeDPR_VFP2RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 15)
+ return MCDisassembler::Fail;
+ return DecodeDPRRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static const unsigned QPRDecoderTable[] = {
+ ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
+ ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7,
+ ARM::Q8, ARM::Q9, ARM::Q10, ARM::Q11,
+ ARM::Q12, ARM::Q13, ARM::Q14, ARM::Q15
+};
+
+static DecodeStatus DecodeQPRRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
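+ // Q registers are encoded as even D-register numbers, so drop the low bit;
+ // e.g. an encoded value of 6 selects Q3 (the D6/D7 pair).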
+ RegNo >>= 1;
+
+ unsigned Register = QPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodePredicateOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ if (Val == 0xF) return MCDisassembler::Fail;
+ // AL predicate is not allowed on Thumb1 branches.
+ if (Inst.getOpcode() == ARM::tBcc && Val == 0xE)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ if (Val == ARMCC::AL) {
+ Inst.addOperand(MCOperand::CreateReg(0));
+ } else
+ Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCCOutOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ if (Val)
+ Inst.addOperand(MCOperand::CreateReg(ARM::CPSR));
+ else
+ Inst.addOperand(MCOperand::CreateReg(0));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSOImmOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ uint32_t imm = Val & 0xFF;
+ uint32_t rot = (Val & 0xF00) >> 7;
+ uint32_t rot_imm = (imm >> rot) | (imm << ((32-rot) & 0x1F));
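+ // Worked example (illustrative): Val = 0x2FF has imm8 = 0xFF and a rotate
+ // field of 2, i.e. a right-rotation by 4, giving rot_imm = 0xF000000F.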
+ Inst.addOperand(MCOperand::CreateImm(rot_imm));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSORegImmOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rm = fieldFromInstruction32(Val, 0, 4);
+ unsigned type = fieldFromInstruction32(Val, 5, 2);
+ unsigned imm = fieldFromInstruction32(Val, 7, 5);
+
+ // Register-immediate
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+ switch (type) {
+ case 0:
+ Shift = ARM_AM::lsl;
+ break;
+ case 1:
+ Shift = ARM_AM::lsr;
+ break;
+ case 2:
+ Shift = ARM_AM::asr;
+ break;
+ case 3:
+ Shift = ARM_AM::ror;
+ break;
}
- // Ditto for SBCrs, which is a super-instruction for A8.6.152 & A8.6.153.
- // As a result, the decoder fails to decode STRH_Post/LDRD_POST/STRD_POST
- // properly.
- if (slice(insn, 27, 25) == 0 && slice(insn, 20, 20) == 0) {
- unsigned PW = slice(insn, 24, 24) << 1 | slice(insn, 21, 21);
- switch (slice(insn, 7, 4)) {
- case 11:
- switch (PW) {
- case 2: // Offset
- return ARM::STRH;
- case 3: // Pre-indexed
- return ARM::STRH_PRE;
- case 0: // Post-indexed
- return ARM::STRH_POST;
- default:
- break; // fallthrough
- }
+ if (Shift == ARM_AM::ror && imm == 0)
+ Shift = ARM_AM::rrx;
+
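+ // Pack the shifter operand: the shift opcode sits in the low three bits
+ // with the immediate amount above it (the ARM_AM shifter-operand packing).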
+ unsigned Op = Shift | (imm << 3);
+ Inst.addOperand(MCOperand::CreateImm(Op));
+
+ return S;
+}
+
+static DecodeStatus DecodeSORegRegOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rm = fieldFromInstruction32(Val, 0, 4);
+ unsigned type = fieldFromInstruction32(Val, 5, 2);
+ unsigned Rs = fieldFromInstruction32(Val, 8, 4);
+
+ // Register-register
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rs, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ ARM_AM::ShiftOpc Shift = ARM_AM::lsl;
+ switch (type) {
+ case 0:
+ Shift = ARM_AM::lsl;
break;
- case 13:
- switch (PW) {
- case 2: // Offset
- return ARM::LDRD;
- case 3: // Pre-indexed
- return ARM::LDRD_PRE;
- case 0: // Post-indexed
- return ARM::LDRD_POST;
- default:
- break; // fallthrough
- }
+ case 1:
+ Shift = ARM_AM::lsr;
break;
- case 15:
- switch (PW) {
- case 2: // Offset
- return ARM::STRD;
- case 3: // Pre-indexed
- return ARM::STRD_PRE;
- case 0: // Post-indexed
- return ARM::STRD_POST;
- default:
- break; // fallthrough
- }
+ case 2:
+ Shift = ARM_AM::asr;
break;
+ case 3:
+ Shift = ARM_AM::ror;
+ break;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(Shift));
+
+ return S;
+}
+
+static DecodeStatus DecodeRegListOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ bool writebackLoad = false;
+ unsigned writebackReg = 0;
+ switch (Inst.getOpcode()) {
default:
- break; // fallthrough
+ break;
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ writebackLoad = true;
+ writebackReg = Inst.getOperand(0).getReg();
+ break;
+ }
+
+ // Empty register lists are not allowed.
+ if (CountPopulation_32(Val) == 0) return MCDisassembler::Fail;
+ for (unsigned i = 0; i < 16; ++i) {
+ if (Val & (1 << i)) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // Writeback not allowed if Rn is in the target list.
+ if (writebackLoad && writebackReg == Inst.end()[-1].getReg())
+ Check(S, MCDisassembler::SoftFail);
}
}
- // Ditto for SBCSSrs, which is a super-instruction for A8.6.152 & A8.6.153.
- // As a result, the decoder fails to decode LDRH_POST/LDRSB_POST/LDRSH_POST
- // properly.
- if (slice(insn, 27, 25) == 0 && slice(insn, 20, 20) == 1) {
- unsigned PW = slice(insn, 24, 24) << 1 | slice(insn, 21, 21);
- switch (slice(insn, 7, 4)) {
- case 11:
- switch (PW) {
- case 2: // Offset
- return ARM::LDRH;
- case 3: // Pre-indexed
- return ARM::LDRH_PRE;
- case 0: // Post-indexed
- return ARM::LDRH_POST;
- default:
- break; // fallthrough
- }
+ return S;
+}
+
+static DecodeStatus DecodeSPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Vd = fieldFromInstruction32(Val, 8, 4);
+ unsigned regs = Val & 0xFF;
+
+ if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ for (unsigned i = 0; i < (regs - 1); ++i) {
+ if (!Check(S, DecodeSPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeDPRRegListOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Vd = fieldFromInstruction32(Val, 8, 4);
+ unsigned regs = (Val & 0xFF) / 2;
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ for (unsigned i = 0; i < (regs - 1); ++i) {
+ if (!Check(S, DecodeDPRRegisterClass(Inst, ++Vd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeBitfieldMaskOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ // This operand encodes a mask of contiguous zeros between a specified MSB
+ // and LSB. To decode it, we create the mask of all bits MSB-and-lower and
+ // the mask of all bits LSB-and-lower, then XOR them to get the mask that is
+ // all ones on [msb, lsb]. Finally, we invert that to produce the final
+ // mask.
+ unsigned msb = fieldFromInstruction32(Val, 5, 5);
+ unsigned lsb = fieldFromInstruction32(Val, 0, 5);
+
+ DecodeStatus S = MCDisassembler::Success;
+ if (lsb > msb) Check(S, MCDisassembler::SoftFail);
+
+ uint32_t msb_mask = 0xFFFFFFFF;
+ if (msb != 31) msb_mask = (1U << (msb+1)) - 1;
+ uint32_t lsb_mask = (1U << lsb) - 1;
+
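+ // e.g. msb = 7, lsb = 4: msb_mask = 0xFF and lsb_mask = 0x0F, so the
+ // operand becomes ~(0xFF ^ 0x0F) = 0xFFFFFF0F, i.e. zeros exactly on
+ // bits [7:4].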
+ Inst.addOperand(MCOperand::CreateImm(~(msb_mask ^ lsb_mask)));
+ return S;
+}
+
+static DecodeStatus DecodeCopMemInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+ unsigned CRd = fieldFromInstruction32(Insn, 12, 4);
+ unsigned coproc = fieldFromInstruction32(Insn, 8, 4);
+ unsigned imm = fieldFromInstruction32(Insn, 0, 8);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned U = fieldFromInstruction32(Insn, 23, 1);
+
+ switch (Inst.getOpcode()) {
+ case ARM::LDC_OFFSET:
+ case ARM::LDC_PRE:
+ case ARM::LDC_POST:
+ case ARM::LDC_OPTION:
+ case ARM::LDCL_OFFSET:
+ case ARM::LDCL_PRE:
+ case ARM::LDCL_POST:
+ case ARM::LDCL_OPTION:
+ case ARM::STC_OFFSET:
+ case ARM::STC_PRE:
+ case ARM::STC_POST:
+ case ARM::STC_OPTION:
+ case ARM::STCL_OFFSET:
+ case ARM::STCL_PRE:
+ case ARM::STCL_POST:
+ case ARM::STCL_OPTION:
+ case ARM::t2LDC_OFFSET:
+ case ARM::t2LDC_PRE:
+ case ARM::t2LDC_POST:
+ case ARM::t2LDC_OPTION:
+ case ARM::t2LDCL_OFFSET:
+ case ARM::t2LDCL_PRE:
+ case ARM::t2LDCL_POST:
+ case ARM::t2LDCL_OPTION:
+ case ARM::t2STC_OFFSET:
+ case ARM::t2STC_PRE:
+ case ARM::t2STC_POST:
+ case ARM::t2STC_OPTION:
+ case ARM::t2STCL_OFFSET:
+ case ARM::t2STCL_PRE:
+ case ARM::t2STCL_POST:
+ case ARM::t2STCL_OPTION:
+ if (coproc == 0xA || coproc == 0xB)
+ return MCDisassembler::Fail;
+ break;
+ default:
break;
- case 13:
- switch (PW) {
- case 2: // Offset
- return ARM::LDRSB;
- case 3: // Pre-indexed
- return ARM::LDRSB_PRE;
- case 0: // Post-indexed
- return ARM::LDRSB_POST;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(coproc));
+ Inst.addOperand(MCOperand::CreateImm(CRd));
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ unsigned P = fieldFromInstruction32(Insn, 24, 1);
+ unsigned W = fieldFromInstruction32(Insn, 21, 1);
+
+ bool writeback = (P == 0) || (W == 1);
+ unsigned idx_mode = 0;
+ if (P && writeback)
+ idx_mode = ARMII::IndexModePre;
+ else if (!P && writeback)
+ idx_mode = ARMII::IndexModePost;
+
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDC2_OFFSET:
+ case ARM::t2LDC2L_OFFSET:
+ case ARM::t2LDC2_PRE:
+ case ARM::t2LDC2L_PRE:
+ case ARM::t2STC2_OFFSET:
+ case ARM::t2STC2L_OFFSET:
+ case ARM::t2STC2_PRE:
+ case ARM::t2STC2L_PRE:
+ case ARM::LDC2_OFFSET:
+ case ARM::LDC2L_OFFSET:
+ case ARM::LDC2_PRE:
+ case ARM::LDC2L_PRE:
+ case ARM::STC2_OFFSET:
+ case ARM::STC2L_OFFSET:
+ case ARM::STC2_PRE:
+ case ARM::STC2L_PRE:
+ case ARM::t2LDC_OFFSET:
+ case ARM::t2LDCL_OFFSET:
+ case ARM::t2LDC_PRE:
+ case ARM::t2LDCL_PRE:
+ case ARM::t2STC_OFFSET:
+ case ARM::t2STCL_OFFSET:
+ case ARM::t2STC_PRE:
+ case ARM::t2STCL_PRE:
+ case ARM::LDC_OFFSET:
+ case ARM::LDCL_OFFSET:
+ case ARM::LDC_PRE:
+ case ARM::LDCL_PRE:
+ case ARM::STC_OFFSET:
+ case ARM::STCL_OFFSET:
+ case ARM::STC_PRE:
+ case ARM::STCL_PRE:
+ imm = ARM_AM::getAM5Opc(U ? ARM_AM::add : ARM_AM::sub, imm);
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ break;
+ case ARM::t2LDC2_POST:
+ case ARM::t2LDC2L_POST:
+ case ARM::t2STC2_POST:
+ case ARM::t2STC2L_POST:
+ case ARM::LDC2_POST:
+ case ARM::LDC2L_POST:
+ case ARM::STC2_POST:
+ case ARM::STC2L_POST:
+ case ARM::t2LDC_POST:
+ case ARM::t2LDCL_POST:
+ case ARM::t2STC_POST:
+ case ARM::t2STCL_POST:
+ case ARM::LDC_POST:
+ case ARM::LDCL_POST:
+ case ARM::STC_POST:
+ case ARM::STCL_POST:
+ imm |= U << 8;
+ // fall through.
+ default:
+ // The 'option' variant doesn't encode 'U' in the immediate since
+ // the immediate is unsigned [0,255].
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ break;
+ }
+
+ switch (Inst.getOpcode()) {
+ case ARM::LDC_OFFSET:
+ case ARM::LDC_PRE:
+ case ARM::LDC_POST:
+ case ARM::LDC_OPTION:
+ case ARM::LDCL_OFFSET:
+ case ARM::LDCL_PRE:
+ case ARM::LDCL_POST:
+ case ARM::LDCL_OPTION:
+ case ARM::STC_OFFSET:
+ case ARM::STC_PRE:
+ case ARM::STC_POST:
+ case ARM::STC_OPTION:
+ case ARM::STCL_OFFSET:
+ case ARM::STCL_PRE:
+ case ARM::STCL_POST:
+ case ARM::STCL_OPTION:
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ return S;
+}
+
+static DecodeStatus
+DecodeAddrMode2IdxInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+ unsigned reg = fieldFromInstruction32(Insn, 25, 1);
+ unsigned P = fieldFromInstruction32(Insn, 24, 1);
+ unsigned W = fieldFromInstruction32(Insn, 21, 1);
+
+ // On stores, the writeback operand precedes Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG:
+ case ARM::STRT_POST_REG:
+ case ARM::STRT_POST_IMM:
+ case ARM::STRBT_POST_REG:
+ case ARM::STRBT_POST_IMM:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // On loads, the writeback operand comes after Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::LDR_POST_IMM:
+ case ARM::LDR_POST_REG:
+ case ARM::LDRB_POST_IMM:
+ case ARM::LDRB_POST_REG:
+ case ARM::LDRBT_POST_REG:
+ case ARM::LDRBT_POST_IMM:
+ case ARM::LDRT_POST_REG:
+ case ARM::LDRT_POST_IMM:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ ARM_AM::AddrOpc Op = ARM_AM::add;
+ if (!fieldFromInstruction32(Insn, 23, 1))
+ Op = ARM_AM::sub;
+
+ bool writeback = (P == 0) || (W == 1);
+ unsigned idx_mode = 0;
+ if (P && writeback)
+ idx_mode = ARMII::IndexModePre;
+ else if (!P && writeback)
+ idx_mode = ARMII::IndexModePost;
+
+ if (writeback && (Rn == 15 || Rn == Rt))
+ S = MCDisassembler::SoftFail; // UNPREDICTABLE
+
+ if (reg) {
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ ARM_AM::ShiftOpc Opc = ARM_AM::lsl;
+ switch (fieldFromInstruction32(Insn, 5, 2)) {
+ case 0:
+ Opc = ARM_AM::lsl;
+ break;
+ case 1:
+ Opc = ARM_AM::lsr;
+ break;
+ case 2:
+ Opc = ARM_AM::asr;
+ break;
+ case 3:
+ Opc = ARM_AM::ror;
+ break;
default:
- break; // fallthrough
- }
+ return MCDisassembler::Fail;
+ }
+ unsigned amt = fieldFromInstruction32(Insn, 7, 5);
+ unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ } else {
+ Inst.addOperand(MCOperand::CreateReg(0));
+ unsigned tmp = ARM_AM::getAM2Opc(Op, imm, ARM_AM::lsl, idx_mode);
+ Inst.addOperand(MCOperand::CreateImm(tmp));
+ }
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeSORegMemOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 13, 4);
+ unsigned Rm = fieldFromInstruction32(Val, 0, 4);
+ unsigned type = fieldFromInstruction32(Val, 5, 2);
+ unsigned imm = fieldFromInstruction32(Val, 7, 5);
+ unsigned U = fieldFromInstruction32(Val, 12, 1);
+
+ ARM_AM::ShiftOpc ShOp = ARM_AM::lsl;
+ switch (type) {
+ case 0:
+ ShOp = ARM_AM::lsl;
+ break;
+ case 1:
+ ShOp = ARM_AM::lsr;
+ break;
+ case 2:
+ ShOp = ARM_AM::asr;
+ break;
+ case 3:
+ ShOp = ARM_AM::ror;
+ break;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ unsigned shift;
+ if (U)
+ shift = ARM_AM::getAM2Opc(ARM_AM::add, imm, ShOp);
+ else
+ shift = ARM_AM::getAM2Opc(ARM_AM::sub, imm, ShOp);
+ Inst.addOperand(MCOperand::CreateImm(shift));
+
+ return S;
+}
+
+static DecodeStatus
+DecodeAddrMode3Instruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned type = fieldFromInstruction32(Insn, 22, 1);
+ unsigned imm = fieldFromInstruction32(Insn, 8, 4);
+ unsigned U = ((~fieldFromInstruction32(Insn, 23, 1)) & 1) << 8;
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+ unsigned W = fieldFromInstruction32(Insn, 21, 1);
+ unsigned P = fieldFromInstruction32(Insn, 24, 1);
+
+ bool writeback = (W == 1) || (P == 0);
+
+ // For {LD,ST}RD, Rt must be even, else undefined.
+ switch (Inst.getOpcode()) {
+ case ARM::STRD:
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ if (Rt & 0x1) return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ if (writeback) { // Writeback
+ if (P)
+ U |= ARMII::IndexModePre << 9;
+ else
+ U |= ARMII::IndexModePost << 9;
+
+ // On stores, the writeback operand precedes Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::STRD:
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ case ARM::STRH:
+ case ARM::STRH_PRE:
+ case ARM::STRH_POST:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ switch (Inst.getOpcode()) {
+ case ARM::STRD:
+ case ARM::STRD_PRE:
+ case ARM::STRD_POST:
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+ return MCDisassembler::Fail;
break;
- case 15:
- switch (PW) {
- case 2: // Offset
- return ARM::LDRSH;
- case 3: // Pre-indexed
- return ARM::LDRSH_PRE;
- case 0: // Post-indexed
- return ARM::LDRSH_POST;
+ default:
+ break;
+ }
+
+ if (writeback) {
+ // On loads, the writeback operand comes after Rt.
+ switch (Inst.getOpcode()) {
+ case ARM::LDRD:
+ case ARM::LDRD_PRE:
+ case ARM::LDRD_POST:
+ case ARM::LDRH:
+ case ARM::LDRH_PRE:
+ case ARM::LDRH_POST:
+ case ARM::LDRSH:
+ case ARM::LDRSH_PRE:
+ case ARM::LDRSH_POST:
+ case ARM::LDRSB:
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSB_POST:
+ case ARM::LDRHTr:
+ case ARM::LDRSBTr:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (type) {
+ Inst.addOperand(MCOperand::CreateReg(0));
+ Inst.addOperand(MCOperand::CreateImm(U | (imm << 4) | Rm));
+ } else {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(U));
+ }
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeRFEInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned mode = fieldFromInstruction32(Insn, 23, 2);
+
+ switch (mode) {
+ case 0:
+ mode = ARM_AM::da;
+ break;
+ case 1:
+ mode = ARM_AM::ia;
+ break;
+ case 2:
+ mode = ARM_AM::db;
+ break;
+ case 3:
+ mode = ARM_AM::ib;
+ break;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(mode));
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeMemMultipleWritebackInstruction(llvm::MCInst &Inst,
+ unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+ unsigned reglist = fieldFromInstruction32(Insn, 0, 16);
+
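+ // A predicate field of 0b1111 means these encodings are really RFE/SRS
+ // variants, so rewrite the opcode accordingly.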
+ if (pred == 0xF) {
+ switch (Inst.getOpcode()) {
+ case ARM::LDMDA:
+ Inst.setOpcode(ARM::RFEDA);
+ break;
+ case ARM::LDMDA_UPD:
+ Inst.setOpcode(ARM::RFEDA_UPD);
+ break;
+ case ARM::LDMDB:
+ Inst.setOpcode(ARM::RFEDB);
+ break;
+ case ARM::LDMDB_UPD:
+ Inst.setOpcode(ARM::RFEDB_UPD);
+ break;
+ case ARM::LDMIA:
+ Inst.setOpcode(ARM::RFEIA);
+ break;
+ case ARM::LDMIA_UPD:
+ Inst.setOpcode(ARM::RFEIA_UPD);
+ break;
+ case ARM::LDMIB:
+ Inst.setOpcode(ARM::RFEIB);
+ break;
+ case ARM::LDMIB_UPD:
+ Inst.setOpcode(ARM::RFEIB_UPD);
+ break;
+ case ARM::STMDA:
+ Inst.setOpcode(ARM::SRSDA);
+ break;
+ case ARM::STMDA_UPD:
+ Inst.setOpcode(ARM::SRSDA_UPD);
+ break;
+ case ARM::STMDB:
+ Inst.setOpcode(ARM::SRSDB);
+ break;
+ case ARM::STMDB_UPD:
+ Inst.setOpcode(ARM::SRSDB_UPD);
+ break;
+ case ARM::STMIA:
+ Inst.setOpcode(ARM::SRSIA);
+ break;
+ case ARM::STMIA_UPD:
+ Inst.setOpcode(ARM::SRSIA_UPD);
+ break;
+ case ARM::STMIB:
+ Inst.setOpcode(ARM::SRSIB);
+ break;
+ case ARM::STMIB_UPD:
+ Inst.setOpcode(ARM::SRSIB_UPD);
+ break;
default:
- break; // fallthrough
- }
+ if (!Check(S, MCDisassembler::Fail)) return MCDisassembler::Fail;
+ }
+
+ // For stores (which become SRSs), the only operand is the mode.
+ if (fieldFromInstruction32(Insn, 20, 1) == 0) {
+ Inst.addOperand(
+ MCOperand::CreateImm(fieldFromInstruction32(Insn, 0, 4)));
+ return S;
+ }
+
+ return DecodeRFEInstruction(Inst, Insn, Address, Decoder);
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail; // Tied
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeRegListOperand(Inst, reglist, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeCPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned imod = fieldFromInstruction32(Insn, 18, 2);
+ unsigned M = fieldFromInstruction32(Insn, 17, 1);
+ unsigned iflags = fieldFromInstruction32(Insn, 6, 3);
+ unsigned mode = fieldFromInstruction32(Insn, 0, 5);
+
+ DecodeStatus S = MCDisassembler::Success;
+
+ // imod == '01' --> UNPREDICTABLE
+ // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+ // return failure here. The '01' imod value is unprintable, so there's
+ // nothing useful we could do even if we returned UNPREDICTABLE.
+
+ if (imod == 1) return MCDisassembler::Fail;
+
+ if (imod && M) {
+ Inst.setOpcode(ARM::CPS3p);
+ Inst.addOperand(MCOperand::CreateImm(imod));
+ Inst.addOperand(MCOperand::CreateImm(iflags));
+ Inst.addOperand(MCOperand::CreateImm(mode));
+ } else if (imod && !M) {
+ Inst.setOpcode(ARM::CPS2p);
+ Inst.addOperand(MCOperand::CreateImm(imod));
+ Inst.addOperand(MCOperand::CreateImm(iflags));
+ if (mode) S = MCDisassembler::SoftFail;
+ } else if (!imod && M) {
+ Inst.setOpcode(ARM::CPS1p);
+ Inst.addOperand(MCOperand::CreateImm(mode));
+ if (iflags) S = MCDisassembler::SoftFail;
+ } else {
+ // imod == '00' && M == '0' --> UNPREDICTABLE
+ Inst.setOpcode(ARM::CPS1p);
+ Inst.addOperand(MCOperand::CreateImm(mode));
+ S = MCDisassembler::SoftFail;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeT2CPSInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned imod = fieldFromInstruction32(Insn, 9, 2);
+ unsigned M = fieldFromInstruction32(Insn, 8, 1);
+ unsigned iflags = fieldFromInstruction32(Insn, 5, 3);
+ unsigned mode = fieldFromInstruction32(Insn, 0, 5);
+
+ DecodeStatus S = MCDisassembler::Success;
+
+ // imod == '01' --> UNPREDICTABLE
+ // NOTE: Even though this is technically UNPREDICTABLE, we choose to
+ // return failure here. The '01' imod value is unprintable, so there's
+ // nothing useful we could do even if we returned UNPREDICTABLE.
+
+ if (imod == 1) return MCDisassembler::Fail;
+
+ if (imod && M) {
+ Inst.setOpcode(ARM::t2CPS3p);
+ Inst.addOperand(MCOperand::CreateImm(imod));
+ Inst.addOperand(MCOperand::CreateImm(iflags));
+ Inst.addOperand(MCOperand::CreateImm(mode));
+ } else if (imod && !M) {
+ Inst.setOpcode(ARM::t2CPS2p);
+ Inst.addOperand(MCOperand::CreateImm(imod));
+ Inst.addOperand(MCOperand::CreateImm(iflags));
+ if (mode) S = MCDisassembler::SoftFail;
+ } else if (!imod && M) {
+ Inst.setOpcode(ARM::t2CPS1p);
+ Inst.addOperand(MCOperand::CreateImm(mode));
+ if (iflags) S = MCDisassembler::SoftFail;
+ } else {
+ // imod == '00' && M == '0' --> UNPREDICTABLE
+ Inst.setOpcode(ARM::t2CPS1p);
+ Inst.addOperand(MCOperand::CreateImm(mode));
+ S = MCDisassembler::SoftFail;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeT2MOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 8, 4);
+ unsigned imm = 0;
+
+ imm |= (fieldFromInstruction32(Insn, 0, 8) << 0);
+ imm |= (fieldFromInstruction32(Insn, 12, 3) << 8);
+ imm |= (fieldFromInstruction32(Insn, 16, 4) << 12);
+ imm |= (fieldFromInstruction32(Insn, 26, 1) << 11);
+
+ if (Inst.getOpcode() == ARM::t2MOVTi16)
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return S;
+}
+
+static DecodeStatus DecodeArmMOVTWInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+ unsigned imm = 0;
+
+ imm |= (fieldFromInstruction32(Insn, 0, 12) << 0);
+ imm |= (fieldFromInstruction32(Insn, 16, 4) << 12);
+
+ if (Inst.getOpcode() == ARM::MOVTi16)
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (!tryAddingSymbolicOperand(Address, imm, false, 4, Inst, Decoder))
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeSMLAInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 8, 4);
+ unsigned Ra = fieldFromInstruction32(Insn, 12, 4);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+ if (pred == 0xF)
+ return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Ra, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeAddrModeImm12Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned add = fieldFromInstruction32(Val, 12, 1);
+ unsigned imm = fieldFromInstruction32(Val, 0, 12);
+ unsigned Rn = fieldFromInstruction32(Val, 13, 4);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (!add) imm *= -1;
+ if (imm == 0 && !add) imm = INT32_MIN;
+ Inst.addOperand(MCOperand::CreateImm(imm));
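+ // In ARM mode the PC reads as the current instruction's address plus 8,
+ // hence the +8 when reconstructing the literal-pool address below.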
+ if (Rn == 15)
+ tryAddingPcLoadReferenceComment(Address, Address + imm + 8, Decoder);
+
+ return S;
+}
+
+static DecodeStatus DecodeAddrMode5Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 9, 4);
+ unsigned U = fieldFromInstruction32(Val, 8, 1);
+ unsigned imm = fieldFromInstruction32(Val, 0, 8);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (U)
+ Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::add, imm)));
+ else
+ Inst.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(ARM_AM::sub, imm)));
+
+ return S;
+}
+
+static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ return DecodeGPRRegisterClass(Inst, Val, Address, Decoder);
+}
+
+static DecodeStatus
+DecodeBranchImmInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+ unsigned imm = fieldFromInstruction32(Insn, 0, 24) << 2;
+
+ if (pred == 0xF) {
+ Inst.setOpcode(ARM::BLXi);
+ imm |= fieldFromInstruction32(Insn, 24, 1) << 1;
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+ return S;
+ }
+
+ if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<26>(imm) + 8,
+ true, 4, Inst, Decoder))
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<26>(imm)));
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(64 - Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rm = fieldFromInstruction32(Val, 0, 4);
+ unsigned align = fieldFromInstruction32(Val, 4, 2);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!align)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else
+ Inst.addOperand(MCOperand::CreateImm(4 << align));
+
+ return S;
+}
+
+static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned wb = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ Rn |= fieldFromInstruction32(Insn, 4, 2) << 4;
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+ // First output register
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // Second output register
+ switch (Inst.getOpcode()) {
+ case ARM::VLD1q8:
+ case ARM::VLD1q16:
+ case ARM::VLD1q32:
+ case ARM::VLD1q64:
+ case ARM::VLD1q8_UPD:
+ case ARM::VLD1q16_UPD:
+ case ARM::VLD1q32_UPD:
+ case ARM::VLD1q64_UPD:
+ case ARM::VLD1d8T:
+ case ARM::VLD1d16T:
+ case ARM::VLD1d32T:
+ case ARM::VLD1d64T:
+ case ARM::VLD1d8T_UPD:
+ case ARM::VLD1d16T_UPD:
+ case ARM::VLD1d32T_UPD:
+ case ARM::VLD1d64T_UPD:
+ case ARM::VLD1d8Q:
+ case ARM::VLD1d16Q:
+ case ARM::VLD1d32Q:
+ case ARM::VLD1d64Q:
+ case ARM::VLD1d8Q_UPD:
+ case ARM::VLD1d16Q_UPD:
+ case ARM::VLD1d32Q_UPD:
+ case ARM::VLD1d64Q_UPD:
+ case ARM::VLD2d8:
+ case ARM::VLD2d16:
+ case ARM::VLD2d32:
+ case ARM::VLD2d8_UPD:
+ case ARM::VLD2d16_UPD:
+ case ARM::VLD2d32_UPD:
+ case ARM::VLD2q8:
+ case ARM::VLD2q16:
+ case ARM::VLD2q32:
+ case ARM::VLD2q8_UPD:
+ case ARM::VLD2q16_UPD:
+ case ARM::VLD2q32_UPD:
+ case ARM::VLD3d8:
+ case ARM::VLD3d16:
+ case ARM::VLD3d32:
+ case ARM::VLD3d8_UPD:
+ case ARM::VLD3d16_UPD:
+ case ARM::VLD3d32_UPD:
+ case ARM::VLD4d8:
+ case ARM::VLD4d16:
+ case ARM::VLD4d32:
+ case ARM::VLD4d8_UPD:
+ case ARM::VLD4d16_UPD:
+ case ARM::VLD4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
break;
+ case ARM::VLD2b8:
+ case ARM::VLD2b16:
+ case ARM::VLD2b32:
+ case ARM::VLD2b8_UPD:
+ case ARM::VLD2b16_UPD:
+ case ARM::VLD2b32_UPD:
+ case ARM::VLD3q8:
+ case ARM::VLD3q16:
+ case ARM::VLD3q32:
+ case ARM::VLD3q8_UPD:
+ case ARM::VLD3q16_UPD:
+ case ARM::VLD3q32_UPD:
+ case ARM::VLD4q8:
+ case ARM::VLD4q16:
+ case ARM::VLD4q32:
+ case ARM::VLD4q8_UPD:
+ case ARM::VLD4q16_UPD:
+ case ARM::VLD4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+      return MCDisassembler::Fail;
+    break;
default:
- break; // fallthrough
+    break;
+ }
+
+ // Third output register
+ switch(Inst.getOpcode()) {
+ case ARM::VLD1d8T:
+ case ARM::VLD1d16T:
+ case ARM::VLD1d32T:
+ case ARM::VLD1d64T:
+ case ARM::VLD1d8T_UPD:
+ case ARM::VLD1d16T_UPD:
+ case ARM::VLD1d32T_UPD:
+ case ARM::VLD1d64T_UPD:
+ case ARM::VLD1d8Q:
+ case ARM::VLD1d16Q:
+ case ARM::VLD1d32Q:
+ case ARM::VLD1d64Q:
+ case ARM::VLD1d8Q_UPD:
+ case ARM::VLD1d16Q_UPD:
+ case ARM::VLD1d32Q_UPD:
+ case ARM::VLD1d64Q_UPD:
+ case ARM::VLD2q8:
+ case ARM::VLD2q16:
+ case ARM::VLD2q32:
+ case ARM::VLD2q8_UPD:
+ case ARM::VLD2q16_UPD:
+ case ARM::VLD2q32_UPD:
+ case ARM::VLD3d8:
+ case ARM::VLD3d16:
+ case ARM::VLD3d32:
+ case ARM::VLD3d8_UPD:
+ case ARM::VLD3d16_UPD:
+ case ARM::VLD3d32_UPD:
+ case ARM::VLD4d8:
+ case ARM::VLD4d16:
+ case ARM::VLD4d32:
+ case ARM::VLD4d8_UPD:
+ case ARM::VLD4d16_UPD:
+ case ARM::VLD4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VLD3q8:
+ case ARM::VLD3q16:
+ case ARM::VLD3q32:
+ case ARM::VLD3q8_UPD:
+ case ARM::VLD3q16_UPD:
+ case ARM::VLD3q32_UPD:
+ case ARM::VLD4q8:
+ case ARM::VLD4q16:
+ case ARM::VLD4q32:
+ case ARM::VLD4q8_UPD:
+ case ARM::VLD4q16_UPD:
+ case ARM::VLD4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // Fourth output register
+ switch (Inst.getOpcode()) {
+ case ARM::VLD1d8Q:
+ case ARM::VLD1d16Q:
+ case ARM::VLD1d32Q:
+ case ARM::VLD1d64Q:
+ case ARM::VLD1d8Q_UPD:
+ case ARM::VLD1d16Q_UPD:
+ case ARM::VLD1d32Q_UPD:
+ case ARM::VLD1d64Q_UPD:
+ case ARM::VLD2q8:
+ case ARM::VLD2q16:
+ case ARM::VLD2q32:
+ case ARM::VLD2q8_UPD:
+ case ARM::VLD2q16_UPD:
+ case ARM::VLD2q32_UPD:
+ case ARM::VLD4d8:
+ case ARM::VLD4d16:
+ case ARM::VLD4d32:
+ case ARM::VLD4d8_UPD:
+ case ARM::VLD4d16_UPD:
+ case ARM::VLD4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VLD4q8:
+ case ARM::VLD4q16:
+ case ARM::VLD4q32:
+ case ARM::VLD4q8_UPD:
+ case ARM::VLD4q16_UPD:
+ case ARM::VLD4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // Writeback operand
+ switch (Inst.getOpcode()) {
+ case ARM::VLD1d8_UPD:
+ case ARM::VLD1d16_UPD:
+ case ARM::VLD1d32_UPD:
+ case ARM::VLD1d64_UPD:
+ case ARM::VLD1q8_UPD:
+ case ARM::VLD1q16_UPD:
+ case ARM::VLD1q32_UPD:
+ case ARM::VLD1q64_UPD:
+ case ARM::VLD1d8T_UPD:
+ case ARM::VLD1d16T_UPD:
+ case ARM::VLD1d32T_UPD:
+ case ARM::VLD1d64T_UPD:
+ case ARM::VLD1d8Q_UPD:
+ case ARM::VLD1d16Q_UPD:
+ case ARM::VLD1d32Q_UPD:
+ case ARM::VLD1d64Q_UPD:
+ case ARM::VLD2d8_UPD:
+ case ARM::VLD2d16_UPD:
+ case ARM::VLD2d32_UPD:
+ case ARM::VLD2q8_UPD:
+ case ARM::VLD2q16_UPD:
+ case ARM::VLD2q32_UPD:
+ case ARM::VLD2b8_UPD:
+ case ARM::VLD2b16_UPD:
+ case ARM::VLD2b32_UPD:
+ case ARM::VLD3d8_UPD:
+ case ARM::VLD3d16_UPD:
+ case ARM::VLD3d32_UPD:
+ case ARM::VLD3q8_UPD:
+ case ARM::VLD3q16_UPD:
+ case ARM::VLD3q32_UPD:
+ case ARM::VLD4d8_UPD:
+ case ARM::VLD4d16_UPD:
+ case ARM::VLD4d32_UPD:
+ case ARM::VLD4q8_UPD:
+ case ARM::VLD4q16_UPD:
+ case ARM::VLD4q32_UPD:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // AddrMode6 Base (register+alignment)
+ if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // AddrMode6 Offset (register)
+ if (Rm == 0xD)
+ Inst.addOperand(MCOperand::CreateReg(0));
+ else if (Rm != 0xF) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
+}
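+// Note (illustrative): the switches above implement the NEON single- vs.
+// double-spaced register lists, e.g. VLD4q32 with Rd == 2 emits
+// D2, D4, D6, D8 while VLD4d32 emits D2, D3, D4, D5; the %32 wrap merely
+// keeps malformed encodings near D31 inside the register file.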
+
+static DecodeStatus DecodeVSTInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned wb = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ Rn |= fieldFromInstruction32(Insn, 4, 2) << 4;
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+ // Writeback Operand
+ switch (Inst.getOpcode()) {
+ case ARM::VST1d8_UPD:
+ case ARM::VST1d16_UPD:
+ case ARM::VST1d32_UPD:
+ case ARM::VST1d64_UPD:
+ case ARM::VST1q8_UPD:
+ case ARM::VST1q16_UPD:
+ case ARM::VST1q32_UPD:
+ case ARM::VST1q64_UPD:
+ case ARM::VST1d8T_UPD:
+ case ARM::VST1d16T_UPD:
+ case ARM::VST1d32T_UPD:
+ case ARM::VST1d64T_UPD:
+ case ARM::VST1d8Q_UPD:
+ case ARM::VST1d16Q_UPD:
+ case ARM::VST1d32Q_UPD:
+ case ARM::VST1d64Q_UPD:
+ case ARM::VST2d8_UPD:
+ case ARM::VST2d16_UPD:
+ case ARM::VST2d32_UPD:
+ case ARM::VST2q8_UPD:
+ case ARM::VST2q16_UPD:
+ case ARM::VST2q32_UPD:
+ case ARM::VST2b8_UPD:
+ case ARM::VST2b16_UPD:
+ case ARM::VST2b32_UPD:
+ case ARM::VST3d8_UPD:
+ case ARM::VST3d16_UPD:
+ case ARM::VST3d32_UPD:
+ case ARM::VST3q8_UPD:
+ case ARM::VST3q16_UPD:
+ case ARM::VST3q32_UPD:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeGPRRegisterClass(Inst, wb, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // AddrMode6 Base (register+alignment)
+ if (!Check(S, DecodeAddrMode6Operand(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // AddrMode6 Offset (register)
+ if (Rm == 0xD)
+ Inst.addOperand(MCOperand::CreateReg(0));
+ else if (Rm != 0xF) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ // First input register
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ // Second input register
+ switch (Inst.getOpcode()) {
+ case ARM::VST1q8:
+ case ARM::VST1q16:
+ case ARM::VST1q32:
+ case ARM::VST1q64:
+ case ARM::VST1q8_UPD:
+ case ARM::VST1q16_UPD:
+ case ARM::VST1q32_UPD:
+ case ARM::VST1q64_UPD:
+ case ARM::VST1d8T:
+ case ARM::VST1d16T:
+ case ARM::VST1d32T:
+ case ARM::VST1d64T:
+ case ARM::VST1d8T_UPD:
+ case ARM::VST1d16T_UPD:
+ case ARM::VST1d32T_UPD:
+ case ARM::VST1d64T_UPD:
+ case ARM::VST1d8Q:
+ case ARM::VST1d16Q:
+ case ARM::VST1d32Q:
+ case ARM::VST1d64Q:
+ case ARM::VST1d8Q_UPD:
+ case ARM::VST1d16Q_UPD:
+ case ARM::VST1d32Q_UPD:
+ case ARM::VST1d64Q_UPD:
+ case ARM::VST2d8:
+ case ARM::VST2d16:
+ case ARM::VST2d32:
+ case ARM::VST2d8_UPD:
+ case ARM::VST2d16_UPD:
+ case ARM::VST2d32_UPD:
+ case ARM::VST2q8:
+ case ARM::VST2q16:
+ case ARM::VST2q32:
+ case ARM::VST2q8_UPD:
+ case ARM::VST2q16_UPD:
+ case ARM::VST2q32_UPD:
+ case ARM::VST3d8:
+ case ARM::VST3d16:
+ case ARM::VST3d32:
+ case ARM::VST3d8_UPD:
+ case ARM::VST3d16_UPD:
+ case ARM::VST3d32_UPD:
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VST2b8:
+ case ARM::VST2b16:
+ case ARM::VST2b32:
+ case ARM::VST2b8_UPD:
+ case ARM::VST2b16_UPD:
+ case ARM::VST2b32_UPD:
+ case ARM::VST3q8:
+ case ARM::VST3q16:
+ case ARM::VST3q32:
+ case ARM::VST3q8_UPD:
+ case ARM::VST3q16_UPD:
+ case ARM::VST3q32_UPD:
+ case ARM::VST4q8:
+ case ARM::VST4q16:
+ case ARM::VST4q32:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // Third input register
+ switch (Inst.getOpcode()) {
+ case ARM::VST1d8T:
+ case ARM::VST1d16T:
+ case ARM::VST1d32T:
+ case ARM::VST1d64T:
+ case ARM::VST1d8T_UPD:
+ case ARM::VST1d16T_UPD:
+ case ARM::VST1d32T_UPD:
+ case ARM::VST1d64T_UPD:
+ case ARM::VST1d8Q:
+ case ARM::VST1d16Q:
+ case ARM::VST1d32Q:
+ case ARM::VST1d64Q:
+ case ARM::VST1d8Q_UPD:
+ case ARM::VST1d16Q_UPD:
+ case ARM::VST1d32Q_UPD:
+ case ARM::VST1d64Q_UPD:
+ case ARM::VST2q8:
+ case ARM::VST2q16:
+ case ARM::VST2q32:
+ case ARM::VST2q8_UPD:
+ case ARM::VST2q16_UPD:
+ case ARM::VST2q32_UPD:
+ case ARM::VST3d8:
+ case ARM::VST3d16:
+ case ARM::VST3d32:
+ case ARM::VST3d8_UPD:
+ case ARM::VST3d16_UPD:
+ case ARM::VST3d32_UPD:
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VST3q8:
+ case ARM::VST3q16:
+ case ARM::VST3q32:
+ case ARM::VST3q8_UPD:
+ case ARM::VST3q16_UPD:
+ case ARM::VST3q32_UPD:
+ case ARM::VST4q8:
+ case ARM::VST4q16:
+ case ARM::VST4q32:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+4)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ // Fourth input register
+ switch (Inst.getOpcode()) {
+ case ARM::VST1d8Q:
+ case ARM::VST1d16Q:
+ case ARM::VST1d32Q:
+ case ARM::VST1d64Q:
+ case ARM::VST1d8Q_UPD:
+ case ARM::VST1d16Q_UPD:
+ case ARM::VST1d32Q_UPD:
+ case ARM::VST1d64Q_UPD:
+ case ARM::VST2q8:
+ case ARM::VST2q16:
+ case ARM::VST2q32:
+ case ARM::VST2q8_UPD:
+ case ARM::VST2q16_UPD:
+ case ARM::VST2q32_UPD:
+ case ARM::VST4d8:
+ case ARM::VST4d16:
+ case ARM::VST4d32:
+ case ARM::VST4d8_UPD:
+ case ARM::VST4d16_UPD:
+ case ARM::VST4d32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VST4q8:
+ case ARM::VST4q16:
+ case ARM::VST4q32:
+ case ARM::VST4q8_UPD:
+ case ARM::VST4q16_UPD:
+ case ARM::VST4q32_UPD:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+6)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ return S;
+}
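+// Note: VST decoding mirrors DecodeVLDInstruction above, except that the
+// writeback and addressing operands are decoded first because the D
+// registers are sources rather than destinations.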
+
+static DecodeStatus DecodeVLD1DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned align = fieldFromInstruction32(Insn, 4, 1);
+ unsigned size = fieldFromInstruction32(Insn, 6, 2);
+ unsigned regs = fieldFromInstruction32(Insn, 5, 1) + 1;
+
+ align *= (1 << size);
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (regs == 2) {
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+1)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+  if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+
+ if (Rm == 0xD)
+ Inst.addOperand(MCOperand::CreateReg(0));
+ else if (Rm != 0xF) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
+}
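+// Example (illustrative): bit 5 selects a one- or two-register list and the
+// alignment operand is a * (1 << size), so size == 2 with bit 4 set yields
+// a 4-byte alignment.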
+
+static DecodeStatus DecodeVLD2DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned align = fieldFromInstruction32(Insn, 4, 1);
+ unsigned size = 1 << fieldFromInstruction32(Insn, 6, 2);
+ unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1;
+ align *= 2*size;
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+
+ if (Rm == 0xD)
+ Inst.addOperand(MCOperand::CreateReg(0));
+ else if (Rm != 0xF) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeVLD3DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1;
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(0));
+
+ if (Rm == 0xD)
+ Inst.addOperand(MCOperand::CreateReg(0));
+ else if (Rm != 0xF) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeVLD4DupInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned size = fieldFromInstruction32(Insn, 6, 2);
+ unsigned inc = fieldFromInstruction32(Insn, 5, 1) + 1;
+ unsigned align = fieldFromInstruction32(Insn, 4, 1);
+
+ if (size == 0x3) {
+ size = 4;
+ align = 16;
+ } else {
+ if (size == 2) {
+ size = 1 << size;
+ align *= 8;
+ } else {
+ size = 1 << size;
+ align *= 4*size;
}
}
-AutoGenedDecoder:
- // Calling the auto-generated decoder function.
- return decodeInstruction(insn);
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+inc)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+2*inc)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rd+3*inc)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
+  if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+
+ if (Rm == 0xD)
+ Inst.addOperand(MCOperand::CreateReg(0));
+ else if (Rm != 0xF) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
}
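+// Example (illustrative): a size field of 0b11 is the special 32-bit form
+// with 16-byte alignment; otherwise the element size is 1 << size and the
+// a bit scales the alignment, e.g. size == 0b01 with a == 1 gives 8.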
-// Helper function for special case handling of LDR (literal) and friends.
-// See, for example, A6.3.7 Load word: Table A6-18 Load word.
-// See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode
-// before returning it.
-static unsigned T2Morph2LoadLiteral(unsigned Opcode) {
- switch (Opcode) {
- default:
- return Opcode; // Return unmorphed opcode.
+static DecodeStatus
+DecodeNEONModImmInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned imm = fieldFromInstruction32(Insn, 0, 4);
+ imm |= fieldFromInstruction32(Insn, 16, 3) << 4;
+ imm |= fieldFromInstruction32(Insn, 24, 1) << 7;
+ imm |= fieldFromInstruction32(Insn, 8, 4) << 8;
+ imm |= fieldFromInstruction32(Insn, 5, 1) << 12;
+ unsigned Q = fieldFromInstruction32(Insn, 6, 1);
+
+ if (Q) {
+ if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else {
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ switch (Inst.getOpcode()) {
+ case ARM::VORRiv4i16:
+ case ARM::VORRiv2i32:
+ case ARM::VBICiv4i16:
+ case ARM::VBICiv2i32:
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ case ARM::VORRiv8i16:
+ case ARM::VORRiv4i32:
+ case ARM::VBICiv8i16:
+ case ARM::VBICiv4i32:
+ if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ break;
+ default:
+ break;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeVSHLMaxInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 18, 2);
+
+ if (!Check(S, DecodeQPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(8 << size));
+
+ return S;
+}
+
+static DecodeStatus DecodeShiftRight8Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(8 - Val));
+ return MCDisassembler::Success;
+}
- case ARM::t2LDR_POST: case ARM::t2LDR_PRE:
- case ARM::t2LDRi12: case ARM::t2LDRi8:
- case ARM::t2LDRs: case ARM::t2LDRT:
- return ARM::t2LDRpci;
+static DecodeStatus DecodeShiftRight16Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(16 - Val));
+ return MCDisassembler::Success;
+}
- case ARM::t2LDRB_POST: case ARM::t2LDRB_PRE:
- case ARM::t2LDRBi12: case ARM::t2LDRBi8:
- case ARM::t2LDRBs: case ARM::t2LDRBT:
- return ARM::t2LDRBpci;
+static DecodeStatus DecodeShiftRight32Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(32 - Val));
+ return MCDisassembler::Success;
+}
- case ARM::t2LDRH_POST: case ARM::t2LDRH_PRE:
- case ARM::t2LDRHi12: case ARM::t2LDRHi8:
- case ARM::t2LDRHs: case ARM::t2LDRHT:
- return ARM::t2LDRHpci;
+static DecodeStatus DecodeShiftRight64Imm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(64 - Val));
+ return MCDisassembler::Success;
+}
- case ARM::t2LDRSB_POST: case ARM::t2LDRSB_PRE:
- case ARM::t2LDRSBi12: case ARM::t2LDRSBi8:
- case ARM::t2LDRSBs: case ARM::t2LDRSBT:
- return ARM::t2LDRSBpci;
+static DecodeStatus DecodeTBLInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ Rn |= fieldFromInstruction32(Insn, 7, 1) << 4;
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ Rm |= fieldFromInstruction32(Insn, 5, 1) << 4;
+ unsigned op = fieldFromInstruction32(Insn, 6, 1);
+ unsigned length = fieldFromInstruction32(Insn, 8, 2) + 1;
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (op) {
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail; // Writeback
+ }
- case ARM::t2LDRSH_POST: case ARM::t2LDRSH_PRE:
- case ARM::t2LDRSHi12: case ARM::t2LDRSHi8:
- case ARM::t2LDRSHs: case ARM::t2LDRSHT:
- return ARM::t2LDRSHpci;
+ for (unsigned i = 0; i < length; ++i) {
+ if (!Check(S, DecodeDPRRegisterClass(Inst, (Rn+i)%32, Address, Decoder)))
+ return MCDisassembler::Fail;
}
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
}
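+// Example (illustrative): the length field encodes the table size minus
+// one, so length == 0b10 decodes the three-register list Dn, D(n+1),
+// D(n+2); the op bit distinguishes VTBX, which also reads Dd.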
-// Helper function for special case handling of PLD (literal) and friends.
-// See A8.6.117 T1 & T2 and friends for why we morphed the opcode
-// before returning it.
-static unsigned T2Morph2PLDLiteral(unsigned Opcode) {
- switch (Opcode) {
- default:
- return Opcode; // Return unmorphed opcode.
-
- case ARM::t2PLDi8: case ARM::t2PLDs:
- case ARM::t2PLDWi12: case ARM::t2PLDWi8:
- case ARM::t2PLDWs:
- return ARM::t2PLDi12;
-
- case ARM::t2PLIi8: case ARM::t2PLIs:
- return ARM::t2PLIi12;
- }
-}
-
-/// decodeThumbSideEffect is a decorator function which can potentially twiddle
-/// the instruction or morph the returned opcode under Thumb2.
-///
-/// First it checks whether the insn is a NEON or VFP instr; if true, bit
-/// twiddling could be performed on insn to turn it into an ARM NEON/VFP
-/// equivalent instruction and decodeInstruction is called with the transformed
-/// insn.
-///
-/// Next, there is special handling for Load byte/halfword/word instruction by
-/// checking whether Rn=0b1111 and call T2Morph2LoadLiteral() on the decoded
-/// Thumb2 instruction. See comments below for further details.
-///
-/// Finally, one last check is made to see whether the insn is a NEON/VFP and
-/// decodeInstruction(insn) is invoked on the original insn.
-///
-/// Otherwise, decodeThumbInstruction is called with the original insn.
-static unsigned decodeThumbSideEffect(bool IsThumb2, unsigned &insn) {
- if (IsThumb2) {
- uint16_t op1 = slice(insn, 28, 27);
- uint16_t op2 = slice(insn, 26, 20);
-
- // A6.3 32-bit Thumb instruction encoding
- // Table A6-9 32-bit Thumb instruction encoding
-
- // The coprocessor instructions of interest are transformed to their ARM
- // equivalents.
-
- // --------- Transform Begin Marker ---------
- if ((op1 == 1 || op1 == 3) && slice(op2, 6, 4) == 7) {
- // A7.4 Advanced SIMD data-processing instructions
- // U bit of Thumb corresponds to Inst{24} of ARM.
- uint16_t U = slice(op1, 1, 1);
-
- // Inst{28-24} of ARM = {1,0,0,1,U};
- uint16_t bits28_24 = 9 << 1 | U;
- DEBUG(showBitVector(errs(), insn));
- setSlice(insn, 28, 24, bits28_24);
- return decodeInstruction(insn);
+static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned dst = fieldFromInstruction16(Insn, 8, 3);
+ unsigned imm = fieldFromInstruction16(Insn, 0, 8);
+
+ if (!Check(S, DecodetGPRRegisterClass(Inst, dst, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ switch(Inst.getOpcode()) {
+ default:
+ return MCDisassembler::Fail;
+ case ARM::tADR:
+ break; // tADR does not explicitly represent the PC as an operand.
+ case ARM::tADDrSPi:
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+ break;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ return S;
+}
+
+static DecodeStatus DecodeThumbBROperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<12>(Val << 1)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2BROperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<21>(Val)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbCmpBROperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<7>(Val << 1)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddrModeRR(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 0, 3);
+ unsigned Rm = fieldFromInstruction32(Val, 3, 3);
+
+ if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodetGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeThumbAddrModeIS(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 0, 3);
+ unsigned imm = fieldFromInstruction32(Val, 3, 5);
+
+ if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return S;
+}
+
+static DecodeStatus DecodeThumbAddrModePC(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ unsigned imm = Val << 2;
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ tryAddingPcLoadReferenceComment(Address, (Address & ~2u) + imm + 4, Decoder);
+
+ return MCDisassembler::Success;
+}
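+// Worked example (illustrative): Thumb PC-relative addressing uses
+// Align(PC, 4), hence (Address & ~2u) + imm + 4; at Address 0x1002 with
+// Val == 1 the referenced word is 0x1000 + 4 + 4 = 0x1008.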
+
+static DecodeStatus DecodeThumbAddrModeSP(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+ Inst.addOperand(MCOperand::CreateImm(Val));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2AddrModeSOReg(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 6, 4);
+ unsigned Rm = fieldFromInstruction32(Val, 2, 4);
+ unsigned imm = fieldFromInstruction32(Val, 0, 2);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return S;
+}
+
+static DecodeStatus DecodeT2LoadShift(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ switch (Inst.getOpcode()) {
+ case ARM::t2PLDs:
+ case ARM::t2PLDWs:
+ case ARM::t2PLIs:
+ break;
+ default: {
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
}
+ }
- if (op1 == 3 && slice(op2, 6, 4) == 1 && slice(op2, 0, 0) == 0) {
- // A7.7 Advanced SIMD element or structure load/store instructions
- // Inst{27-24} of Thumb = 0b1001
- // Inst{27-24} of ARM = 0b0100
- DEBUG(showBitVector(errs(), insn));
- setSlice(insn, 27, 24, 4);
- return decodeInstruction(insn);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ if (Rn == 0xF) {
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRBs:
+ Inst.setOpcode(ARM::t2LDRBpci);
+ break;
+ case ARM::t2LDRHs:
+ Inst.setOpcode(ARM::t2LDRHpci);
+ break;
+ case ARM::t2LDRSHs:
+ Inst.setOpcode(ARM::t2LDRSHpci);
+ break;
+ case ARM::t2LDRSBs:
+ Inst.setOpcode(ARM::t2LDRSBpci);
+ break;
+ case ARM::t2PLDs:
+ Inst.setOpcode(ARM::t2PLDi12);
+ Inst.addOperand(MCOperand::CreateReg(ARM::PC));
+ break;
+ default:
+ return MCDisassembler::Fail;
}
- // --------- Transform End Marker ---------
-
- unsigned unmorphed = decodeThumbInstruction(insn);
-
- // See, for example, A6.3.7 Load word: Table A6-18 Load word.
- // See A8.6.57 T3, T4 & A8.6.60 T2 and friends for why we morphed the opcode
- // before returning it to our caller.
- if (op1 == 3 && slice(op2, 6, 5) == 0 && slice(op2, 0, 0) == 1
- && slice(insn, 19, 16) == 15) {
- unsigned morphed = T2Morph2LoadLiteral(unmorphed);
- if (morphed != unmorphed)
- return morphed;
+
+ int imm = fieldFromInstruction32(Insn, 0, 12);
+ if (!fieldFromInstruction32(Insn, 23, 1)) imm *= -1;
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return S;
+ }
+
+ unsigned addrmode = fieldFromInstruction32(Insn, 4, 2);
+ addrmode |= fieldFromInstruction32(Insn, 0, 4) << 2;
+ addrmode |= fieldFromInstruction32(Insn, 16, 4) << 6;
+ if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeT2Imm8S4(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ int imm = Val & 0xFF;
+ if (!(Val & 0x100)) imm *= -1;
+ Inst.addOperand(MCOperand::CreateImm(imm << 2));
+
+ return MCDisassembler::Success;
+}
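+// Example (illustrative): Val packs U at bit 8 over an 8-bit magnitude
+// scaled by 4, so Val == 0x1FF decodes to +1020 and Val == 0x0FF to -1020.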
+
+static DecodeStatus DecodeT2AddrModeImm8s4(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 9, 4);
+ unsigned imm = fieldFromInstruction32(Val, 0, 9);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeT2Imm8S4(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm0_1020s4(llvm::MCInst &Inst,unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 8, 4);
+ unsigned imm = fieldFromInstruction32(Val, 0, 8);
+
+ if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return S;
+}
+
+static DecodeStatus DecodeT2Imm8(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ int imm = Val & 0xFF;
+ if (Val == 0)
+ imm = INT32_MIN;
+ else if (!(Val & 0x100))
+ imm *= -1;
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return MCDisassembler::Success;
+}
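+// Example (illustrative): as in DecodeAddrModeImm12Operand, Val == 0 is
+// kept as the INT32_MIN "#-0" sentinel; Val == 0x07F (U clear) decodes
+// to -127.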
+
+static DecodeStatus DecodeT2AddrModeImm8(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 9, 4);
+ unsigned imm = fieldFromInstruction32(Val, 0, 9);
+
+ // Some instructions always use an additive offset.
+ switch (Inst.getOpcode()) {
+ case ARM::t2LDRT:
+ case ARM::t2LDRBT:
+ case ARM::t2LDRHT:
+ case ARM::t2LDRSBT:
+ case ARM::t2LDRSHT:
+ case ARM::t2STRT:
+ case ARM::t2STRBT:
+ case ARM::t2STRHT:
+ imm |= 0x100;
+ break;
+ default:
+ break;
+ }
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeT2LdStPre(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned addr = fieldFromInstruction32(Insn, 0, 8);
+ addr |= fieldFromInstruction32(Insn, 9, 1) << 8;
+ addr |= Rn << 9;
+ unsigned load = fieldFromInstruction32(Insn, 20, 1);
+
+ if (!load) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (load) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ if (!Check(S, DecodeT2AddrModeImm8(Inst, addr, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeT2AddrModeImm12(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Val, 13, 4);
+ unsigned imm = fieldFromInstruction32(Val, 0, 12);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return S;
+}
+
+static DecodeStatus DecodeThumbAddSPImm(llvm::MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned imm = fieldFromInstruction16(Insn, 0, 7);
+
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+ Inst.addOperand(MCOperand::CreateImm(imm));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeThumbAddSPReg(llvm::MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ if (Inst.getOpcode() == ARM::tADDrSP) {
+ unsigned Rdm = fieldFromInstruction16(Insn, 0, 3);
+ Rdm |= fieldFromInstruction16(Insn, 7, 1) << 3;
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rdm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+ } else if (Inst.getOpcode() == ARM::tADDspr) {
+ unsigned Rm = fieldFromInstruction16(Insn, 3, 4);
+
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+ Inst.addOperand(MCOperand::CreateReg(ARM::SP));
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+
+ return S;
+}
+
+static DecodeStatus DecodeThumbCPS(llvm::MCInst &Inst, uint16_t Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned imod = fieldFromInstruction16(Insn, 4, 1) | 0x2;
+ unsigned flags = fieldFromInstruction16(Insn, 0, 3);
+
+ Inst.addOperand(MCOperand::CreateImm(imod));
+ Inst.addOperand(MCOperand::CreateImm(flags));
+
+ return MCDisassembler::Success;
+}
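+// Note (illustrative): OR-ing the single encoded bit with 0x2 maps it onto
+// the two-bit imod field, so bit 4 == 0 yields imod == 0b10 (CPSIE) and
+// bit 4 == 1 yields imod == 0b11 (CPSID).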
+
+static DecodeStatus DecodePostIdxReg(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned add = fieldFromInstruction32(Insn, 4, 1);
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(add));
+
+ return S;
+}
+
+static DecodeStatus DecodeThumbBLXOffset(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ if (!tryAddingSymbolicOperand(Address,
+ (Address & ~2u) + SignExtend32<22>(Val << 1) + 4,
+ true, 4, Inst, Decoder))
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeCoprocessor(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ if (Val == 0xA || Val == 0xB)
+ return MCDisassembler::Fail;
+
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus
+DecodeThumbTableBranch(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+  if (Rn == 13) S = MCDisassembler::SoftFail; // SP base is UNPREDICTABLE
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ return S;
+}
+
+static DecodeStatus
+DecodeThumb2BCCInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned pred = fieldFromInstruction32(Insn, 22, 4);
+ if (pred == 0xE || pred == 0xF) {
+ unsigned opc = fieldFromInstruction32(Insn, 4, 28);
+ switch (opc) {
+ default:
+ return MCDisassembler::Fail;
+ case 0xf3bf8f4:
+ Inst.setOpcode(ARM::t2DSB);
+ break;
+ case 0xf3bf8f5:
+ Inst.setOpcode(ARM::t2DMB);
+ break;
+ case 0xf3bf8f6:
+ Inst.setOpcode(ARM::t2ISB);
+ break;
}
- // See, for example, A8.6.117 PLD,PLDW (immediate) T1 & T2, and friends for
- // why we morphed the opcode before returning it to our caller.
- if (slice(insn, 31, 25) == 0x7C && slice(insn, 15, 12) == 0xF
- && slice(insn, 22, 22) == 0 && slice(insn, 20, 20) == 1
- && slice(insn, 19, 16) == 15) {
- unsigned morphed = T2Morph2PLDLiteral(unmorphed);
- if (morphed != unmorphed)
- return morphed;
+ unsigned imm = fieldFromInstruction32(Insn, 0, 4);
+ return DecodeMemBarrierOption(Inst, imm, Address, Decoder);
+ }
+
+ unsigned brtarget = fieldFromInstruction32(Insn, 0, 11) << 1;
+ brtarget |= fieldFromInstruction32(Insn, 11, 1) << 19;
+ brtarget |= fieldFromInstruction32(Insn, 13, 1) << 18;
+ brtarget |= fieldFromInstruction32(Insn, 16, 6) << 12;
+ brtarget |= fieldFromInstruction32(Insn, 26, 1) << 20;
+
+ if (!Check(S, DecodeT2BROperand(Inst, brtarget, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
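+// Note (illustrative): pred values 0xE/0xF route the barrier encodings
+// (opc 0xf3bf8f4/5/6 -> DSB/DMB/ISB) away from the branch path; otherwise
+// the scattered S, J1, J2, imm6 and imm11 fields are reassembled into the
+// 21-bit offset consumed by DecodeT2BROperand.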
+
+// Decode a shifted immediate operand. These consist of an 8-bit value and
+// a 4-bit directive that specifies either a splat operation or a rotation.
+static DecodeStatus DecodeT2SOImm(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ unsigned ctrl = fieldFromInstruction32(Val, 10, 2);
+ if (ctrl == 0) {
+ unsigned byte = fieldFromInstruction32(Val, 8, 2);
+ unsigned imm = fieldFromInstruction32(Val, 0, 8);
+ switch (byte) {
+ case 0:
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ break;
+ case 1:
+ Inst.addOperand(MCOperand::CreateImm((imm << 16) | imm));
+ break;
+ case 2:
+ Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 8)));
+ break;
+ case 3:
+ Inst.addOperand(MCOperand::CreateImm((imm << 24) | (imm << 16) |
+ (imm << 8) | imm));
+ break;
}
+ } else {
+ unsigned unrot = fieldFromInstruction32(Val, 0, 7) | 0x80;
+ unsigned rot = fieldFromInstruction32(Val, 7, 5);
+ unsigned imm = (unrot >> rot) | (unrot << ((32-rot)&31));
+ Inst.addOperand(MCOperand::CreateImm(imm));
+ }
- // One last check for NEON/VFP instructions.
- if ((op1 == 1 || op1 == 3) && slice(op2, 6, 6) == 1)
- return decodeInstruction(insn);
+ return MCDisassembler::Success;
+}
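+// Worked example (illustrative): ctrl == 0 splats the byte according to
+// the 2-bit selector, e.g. byte == 3 with imm == 0xAB gives 0xABABABAB;
+// ctrl != 0 forms 0x80 | imm7 and rotates right, so unrot == 0xD5 with
+// rot == 8 produces 0xD5000000.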
+
+static DecodeStatus
+DecodeThumbBCCTargetOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder){
+ Inst.addOperand(MCOperand::CreateImm(Val << 1));
+ return MCDisassembler::Success;
+}
- // Fall through.
+static DecodeStatus DecodeThumbBLTargetOperand(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder){
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<22>(Val << 1)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemBarrierOption(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ switch (Val) {
+ default:
+ return MCDisassembler::Fail;
+ case 0xF: // SY
+ case 0xE: // ST
+ case 0xB: // ISH
+ case 0xA: // ISHST
+ case 0x7: // NSH
+ case 0x6: // NSHST
+ case 0x3: // OSH
+ case 0x2: // OSHST
+ break;
}
- return decodeThumbInstruction(insn);
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
}
-//
-// Public interface for the disassembler
-//
+static DecodeStatus DecodeMSRMask(llvm::MCInst &Inst, unsigned Val,
+ uint64_t Address, const void *Decoder) {
+ if (!Val) return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return MCDisassembler::Success;
+}
-bool ARMDisassembler::getInstruction(MCInst &MI,
- uint64_t &Size,
- const MemoryObject &Region,
- uint64_t Address,
- raw_ostream &os) const {
- // The machine instruction.
- uint32_t insn;
- uint8_t bytes[4];
+static DecodeStatus DecodeDoubleRegLoad(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
- // We want to read exactly 4 bytes of data.
- if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1)
- return false;
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
- // Encoded as a small-endian 32-bit word in the stream.
- insn = (bytes[3] << 24) |
- (bytes[2] << 16) |
- (bytes[1] << 8) |
- (bytes[0] << 0);
-
- unsigned Opcode = decodeARMInstruction(insn);
- ARMFormat Format = ARMFormats[Opcode];
- Size = 4;
-
- DEBUG({
- errs() << "\nOpcode=" << Opcode << " Name=" <<ARMUtils::OpcodeName(Opcode)
- << " Format=" << stringForARMFormat(Format) << '(' << (int)Format
- << ")\n";
- showBitVector(errs(), insn);
- });
-
- OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format));
- if (!Builder)
- return false;
+ if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
- Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(),
- getDisInfoBlock(), getMCContext(),
- Address);
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
- if (!Builder->Build(MI, insn))
- return false;
+ return S;
+}
- return true;
+
+static DecodeStatus DecodeDoubleRegStore(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder){
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rt = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
+ if (Rd == Rn || Rd == Rt || Rd == Rt+1) return MCDisassembler::Fail;
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
}
-bool ThumbDisassembler::getInstruction(MCInst &MI,
- uint64_t &Size,
- const MemoryObject &Region,
- uint64_t Address,
- raw_ostream &os) const {
- // The Thumb instruction stream is a sequence of halfwords.
-
- // This represents the first halfword as well as the machine instruction
- // passed to decodeThumbInstruction(). For 16-bit Thumb instruction, the top
- // halfword of insn is 0x00 0x00; otherwise, the first halfword is moved to
- // the top half followed by the second halfword.
- unsigned insn = 0;
- // Possible second halfword.
- uint16_t insn1 = 0;
-
- // A6.1 Thumb instruction set encoding
- //
- // If bits [15:11] of the halfword being decoded take any of the following
- // values, the halfword is the first halfword of a 32-bit instruction:
- // o 0b11101
- // o 0b11110
- // o 0b11111.
- //
- // Otherwise, the halfword is a 16-bit instruction.
-
- // Read 2 bytes of data first.
- uint8_t bytes[2];
- if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1)
- return false;
+static DecodeStatus DecodeLDRPreImm(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+ imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+ imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+ if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
- // Encoded as a small-endian 16-bit halfword in the stream.
- insn = (bytes[1] << 8) | bytes[0];
- unsigned bits15_11 = slice(insn, 15, 11);
- bool IsThumb2 = false;
+static DecodeStatus DecodeLDRPreReg(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+ imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+ imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+
+ if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+ if (Rm == 0xF) S = MCDisassembler::SoftFail;
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
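+// Note (illustrative): SoftFail flags encodings that decode but are
+// UNPREDICTABLE in the architecture (e.g. Rn == PC, or Rn == Rt with
+// writeback); the instruction is still returned so the client can decide
+// whether to accept it.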
- // 32-bit instructions if the bits [15:11] of the halfword matches
- // { 0b11101 /* 0x1D */, 0b11110 /* 0x1E */, ob11111 /* 0x1F */ }.
- if (bits15_11 == 0x1D || bits15_11 == 0x1E || bits15_11 == 0x1F) {
- IsThumb2 = true;
- if (Region.readBytes(Address + 2, 2, (uint8_t*)bytes, NULL) == -1)
- return false;
- // Encoded as a small-endian 16-bit halfword in the stream.
- insn1 = (bytes[1] << 8) | bytes[0];
- insn = (insn << 16 | insn1);
- }
-
- // The insn could potentially be bit-twiddled in order to be decoded as an ARM
- // NEON/VFP opcode. In such case, the modified insn is later disassembled as
- // an ARM NEON/VFP instruction.
- //
- // This is a short term solution for lack of encoding bits specified for the
- // Thumb2 NEON/VFP instructions. The long term solution could be adding some
- // infrastructure to have each instruction support more than one encodings.
- // Which encoding is used would be based on which subtarget the compiler/
- // disassembler is working with at the time. This would allow the sharing of
- // the NEON patterns between ARM and Thumb2, as well as potential greater
- // sharing between the regular ARM instructions and the 32-bit wide Thumb2
- // instructions as well.
- unsigned Opcode = decodeThumbSideEffect(IsThumb2, insn);
-
- ARMFormat Format = ARMFormats[Opcode];
- Size = IsThumb2 ? 4 : 2;
-
- DEBUG({
- errs() << "Opcode=" << Opcode << " Name=" << ARMUtils::OpcodeName(Opcode)
- << " Format=" << stringForARMFormat(Format) << '(' << (int)Format
- << ")\n";
- showBitVector(errs(), insn);
- });
-
- OwningPtr<ARMBasicMCBuilder> Builder(CreateMCBuilder(Opcode, Format));
- if (!Builder)
- return false;
- Builder->SetSession(const_cast<Session *>(&SO));
+static DecodeStatus DecodeSTRPreImm(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
- Builder->setupBuilderForSymbolicDisassembly(getLLVMOpInfoCallback(),
- getDisInfoBlock(), getMCContext(),
- Address);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+ imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+ imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
- if (!Builder->Build(MI, insn))
- return false;
+ if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
- return true;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeAddrModeImm12Operand(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
}
-// A8.6.50
-// Valid return values are {1, 2, 3, 4}, with 0 signifying an error condition.
-static unsigned short CountITSize(unsigned ITMask) {
- // First count the trailing zeros of the IT mask.
- unsigned TZ = CountTrailingZeros_32(ITMask);
- if (TZ > 3) {
- DEBUG(errs() << "Encoding error: IT Mask '0000'");
- return 0;
+static DecodeStatus DecodeSTRPreReg(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned imm = fieldFromInstruction32(Insn, 0, 12);
+ imm |= fieldFromInstruction32(Insn, 16, 4) << 13;
+ imm |= fieldFromInstruction32(Insn, 23, 1) << 12;
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+
+ if (Rn == 0xF || Rn == Rt) S = MCDisassembler::SoftFail;
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeSORegMemOperand(Inst, imm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeVLD1LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 5, 3);
+ break;
+ case 1:
+ if (fieldFromInstruction32(Insn, 5, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 6, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 4, 2) != 0)
+ align = 4;
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
}
- return (4 - TZ);
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
+
+ return S;
}
-/// Init ITState. Note that at least one bit is always 1 in mask.
-bool Session::InitIT(unsigned short bits7_0) {
- ITCounter = CountITSize(slice(bits7_0, 3, 0));
- if (ITCounter == 0)
- return false;
+static DecodeStatus DecodeVST1LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
- // A8.6.50 IT
- unsigned short FirstCond = slice(bits7_0, 7, 4);
- if (FirstCond == 0xF) {
- DEBUG(errs() << "Encoding error: IT FirstCond '1111'");
- return false;
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 5, 3);
+ break;
+ case 1:
+ if (fieldFromInstruction32(Insn, 5, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 6, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 4, 2) != 0)
+ align = 4;
}
- if (FirstCond == 0xE && ITCounter != 1) {
- DEBUG(errs() << "Encoding error: IT FirstCond '1110' && Mask != '1000'");
- return false;
+
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
}
- ITState = bits7_0;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
- return true;
+ return S;
}
-/// Update ITState if necessary.
-void Session::UpdateIT() {
- assert(ITCounter);
- --ITCounter;
- if (ITCounter == 0)
- ITState = 0;
- else {
- unsigned short NewITState4_0 = slice(ITState, 4, 0) << 1;
- setSlice(ITState, 4, 0, NewITState4_0);
+
+static DecodeStatus DecodeVLD2LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ unsigned inc = 1;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ index = fieldFromInstruction32(Insn, 5, 3);
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 2;
+ break;
+ case 1:
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 4;
+ if (fieldFromInstruction32(Insn, 5, 1))
+ inc = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 5, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 4, 1) != 0)
+ align = 8;
+ if (fieldFromInstruction32(Insn, 6, 1))
+ inc = 2;
+ break;
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
}
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
+
+ return S;
}
-static MCDisassembler *createARMDisassembler(const Target &T) {
- return new ARMDisassembler;
+static DecodeStatus DecodeVST2LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ unsigned inc = 1;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ index = fieldFromInstruction32(Insn, 5, 3);
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 2;
+ break;
+ case 1:
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 4;
+ if (fieldFromInstruction32(Insn, 5, 1))
+ inc = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 5, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 4, 1) != 0)
+ align = 8;
+ if (fieldFromInstruction32(Insn, 6, 1))
+ inc = 2;
+ break;
+ }
+
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
+
+ return S;
}
-static MCDisassembler *createThumbDisassembler(const Target &T) {
- return new ThumbDisassembler;
+
+static DecodeStatus DecodeVLD3LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ unsigned inc = 1;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 5, 3);
+ break;
+ case 1:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 5, 1))
+ inc = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 4, 2))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 6, 1))
+ inc = 2;
+ break;
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
+
+ return S;
}
-extern "C" void LLVMInitializeARMDisassembler() {
- // Register the disassembler.
- TargetRegistry::RegisterMCDisassembler(TheARMTarget,
- createARMDisassembler);
- TargetRegistry::RegisterMCDisassembler(TheThumbTarget,
- createThumbDisassembler);
+static DecodeStatus DecodeVST3LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ unsigned inc = 1;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 5, 3);
+ break;
+ case 1:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 5, 1))
+ inc = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 4, 2))
+ return MCDisassembler::Fail; // UNDEFINED
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 6, 1))
+ inc = 2;
+ break;
+ }
+
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
+
+ return S;
}
-EDInstInfo *ARMDisassembler::getEDInfo() const {
- return instInfoARM;
+
+static DecodeStatus DecodeVLD4LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ unsigned inc = 1;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 4;
+ index = fieldFromInstruction32(Insn, 5, 3);
+ break;
+ case 1:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 8;
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 5, 1))
+ inc = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 4, 2))
+ align = 4 << fieldFromInstruction32(Insn, 4, 2);
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 6, 1))
+ inc = 2;
+ break;
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
+
+ return S;
}
-EDInstInfo *ThumbDisassembler::getEDInfo() const {
- return instInfoARM;
+static DecodeStatus DecodeVST4LN(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned Rd = fieldFromInstruction32(Insn, 12, 4);
+ Rd |= fieldFromInstruction32(Insn, 22, 1) << 4;
+ unsigned size = fieldFromInstruction32(Insn, 10, 2);
+
+ unsigned align = 0;
+ unsigned index = 0;
+ unsigned inc = 1;
+ switch (size) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 4;
+ index = fieldFromInstruction32(Insn, 5, 3);
+ break;
+ case 1:
+ if (fieldFromInstruction32(Insn, 4, 1))
+ align = 8;
+ index = fieldFromInstruction32(Insn, 6, 2);
+ if (fieldFromInstruction32(Insn, 5, 1))
+ inc = 2;
+ break;
+ case 2:
+ if (fieldFromInstruction32(Insn, 4, 2))
+ align = 4 << fieldFromInstruction32(Insn, 4, 2);
+ index = fieldFromInstruction32(Insn, 7, 1);
+ if (fieldFromInstruction32(Insn, 6, 1))
+ inc = 2;
+ break;
+ }
+
+ if (Rm != 0xF) { // Writeback
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ }
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(align));
+ if (Rm != 0xF) {
+ if (Rm != 0xD) {
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
+ return MCDisassembler::Fail;
+ } else
+ Inst.addOperand(MCOperand::CreateReg(0));
+ }
+
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+2*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeDPRRegisterClass(Inst, Rd+3*inc, Address, Decoder)))
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(index));
+
+ return S;
+}
+
+static DecodeStatus DecodeVMOVSRR(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  Rm = (Rm << 1) | fieldFromInstruction32(Insn, 5, 1); // Sm is encoded as Vm:M.
+
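+  // t == 15, t2 == 15 and m == 31 are UNPREDICTABLE; decode them anyway
+  // but flag the result as a soft failure.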
+ if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+ S = MCDisassembler::SoftFail;
+
+ if (!Check(S, DecodeSPRRegisterClass(Inst, Rm , Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt , Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rt2 = fieldFromInstruction32(Insn, 16, 4);
+ unsigned Rm = fieldFromInstruction32(Insn, 0, 4);
+ unsigned pred = fieldFromInstruction32(Insn, 28, 4);
+  Rm = (Rm << 1) | fieldFromInstruction32(Insn, 5, 1); // Sm is encoded as Vm:M.
+
+ if (Rt == 0xF || Rt2 == 0xF || Rm == 0x1F)
+ S = MCDisassembler::SoftFail;
+
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt , Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeGPRRegisterClass(Inst, Rt2 , Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeSPRRegisterClass(Inst, Rm , Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodeSPRRegisterClass(Inst, Rm+1, Address, Decoder)))
+ return MCDisassembler::Fail;
+ if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeIT(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+ unsigned pred = fieldFromInstruction16(Insn, 4, 4);
+ // The InstPrinter needs to have the low bit of the predicate in
+ // the mask operand to be able to print it properly.
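+  // For example, "ITTE EQ" encodes pred = 0b0000 and mask<3:0> = 0b0110,
+  // so the 5-bit field read below is 0b00110.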
+ unsigned mask = fieldFromInstruction16(Insn, 0, 5);
+
+ if (pred == 0xF) {
+ pred = 0xE;
+ S = MCDisassembler::SoftFail;
+ }
+
+ if ((mask & 0xF) == 0) {
+ // Preserve the high bit of the mask, which is the low bit of
+ // the predicate.
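+    // A mask<3:0> of zero is not a valid IT encoding; rewrite it as a
+    // single-instruction block and report a soft failure.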
+ mask &= 0x10;
+ mask |= 0x8;
+ S = MCDisassembler::SoftFail;
+ }
+
+ Inst.addOperand(MCOperand::CreateImm(pred));
+ Inst.addOperand(MCOperand::CreateImm(mask));
+ return S;
}
+
+static DecodeStatus
+DecodeT2LDRDPreInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rt2 = fieldFromInstruction32(Insn, 8, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned addr = fieldFromInstruction32(Insn, 0, 8);
+ unsigned W = fieldFromInstruction32(Insn, 21, 1);
+ unsigned U = fieldFromInstruction32(Insn, 23, 1);
+ unsigned P = fieldFromInstruction32(Insn, 24, 1);
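+  // Post-indexed addressing (P == 0) always writes back; pre-indexed
+  // addressing (P == 1) writes back only when W is set.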
+  bool writeback = (W == 1) || (P == 0);
+
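+  // Combine Rn, the U (add) bit and imm8 into the single imm8s4
+  // addressing-mode operand consumed by DecodeT2AddrModeImm8s4 below:
+  // addr<12:9> = Rn, addr<8> = U, addr<7:0> = imm8.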
+ addr |= (U << 8) | (Rn << 9);
+
+ if (writeback && (Rn == Rt || Rn == Rt2))
+ Check(S, MCDisassembler::SoftFail);
+ if (Rt == Rt2)
+ Check(S, MCDisassembler::SoftFail);
+
+ // Rt
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // Rt2
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // Writeback operand
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // addr
+ if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus
+DecodeT2STRDPreInstruction(llvm::MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ unsigned Rt = fieldFromInstruction32(Insn, 12, 4);
+ unsigned Rt2 = fieldFromInstruction32(Insn, 8, 4);
+ unsigned Rn = fieldFromInstruction32(Insn, 16, 4);
+ unsigned addr = fieldFromInstruction32(Insn, 0, 8);
+ unsigned W = fieldFromInstruction32(Insn, 21, 1);
+ unsigned U = fieldFromInstruction32(Insn, 23, 1);
+ unsigned P = fieldFromInstruction32(Insn, 24, 1);
+  bool writeback = (W == 1) || (P == 0);
+
+ addr |= (U << 8) | (Rn << 9);
+
+ if (writeback && (Rn == Rt || Rn == Rt2))
+ Check(S, MCDisassembler::SoftFail);
+
+ // Writeback operand
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // Rt
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // Rt2
+ if (!Check(S, DecoderGPRRegisterClass(Inst, Rt2, Address, Decoder)))
+ return MCDisassembler::Fail;
+ // addr
+ if (!Check(S, DecodeT2AddrModeImm8s4(Inst, addr, Address, Decoder)))
+ return MCDisassembler::Fail;
+
+ return S;
+}
+
+static DecodeStatus DecodeT2Adr(llvm::MCInst &Inst, uint32_t Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned sign1 = fieldFromInstruction32(Insn, 21, 1);
+ unsigned sign2 = fieldFromInstruction32(Insn, 23, 1);
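+  // The two copies of the sign bit must agree: 0/0 is the ADD (T3) form
+  // of ADR and 1/1 the SUB (T2) form; a mismatch is another instruction.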
+ if (sign1 != sign2) return MCDisassembler::Fail;
+
+ unsigned Val = fieldFromInstruction32(Insn, 0, 8);
+ Val |= fieldFromInstruction32(Insn, 12, 3) << 8;
+ Val |= fieldFromInstruction32(Insn, 26, 1) << 11;
+ Val |= sign1 << 12;
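+  // Val is now sign:i:imm3:imm8; sign-extend it so the SUB form becomes a
+  // negative offset.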
+ Inst.addOperand(MCOperand::CreateImm(SignExtend32<13>(Val)));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeT2ShifterImmOperand(llvm::MCInst &Inst, uint32_t Val,
+ uint64_t Address,
+ const void *Decoder) {
+ DecodeStatus S = MCDisassembler::Success;
+
+ // Shift of "asr #32" is not allowed in Thumb2 mode.
+ if (Val == 0x20) S = MCDisassembler::SoftFail;
+ Inst.addOperand(MCOperand::CreateImm(Val));
+ return S;
+}
+
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.h b/lib/Target/ARM/Disassembler/ARMDisassembler.h
deleted file mode 100644
index 0a74a38..0000000
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.h
+++ /dev/null
@@ -1,99 +0,0 @@
-//===- ARMDisassembler.h - Disassembler for ARM/Thumb ISA -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains the header for ARMDisassembler and ThumbDisassembler, both of
-// which are subclasses of MCDisassembler.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMDISASSEMBLER_H
-#define ARMDISASSEMBLER_H
-
-#include "llvm/MC/MCDisassembler.h"
-
-namespace llvm {
-
-class MCInst;
-class MemoryObject;
-class raw_ostream;
-
-struct EDInstInfo;
-
-/// ARMDisassembler - ARM disassembler for all ARM platforms.
-class ARMDisassembler : public MCDisassembler {
-public:
- /// Constructor - Initializes the disassembler.
- ///
- ARMDisassembler() :
- MCDisassembler() {
- }
-
- ~ARMDisassembler() {
- }
-
- /// getInstruction - See MCDisassembler.
- bool getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
- raw_ostream &vStream) const;
-
- /// getEDInfo - See MCDisassembler.
- EDInstInfo *getEDInfo() const;
-private:
-};
-
-// Forward declaration.
-class ARMBasicMCBuilder;
-
-/// Session - Keep track of the IT Block progression.
-class Session {
- friend class ARMBasicMCBuilder;
-public:
- Session() : ITCounter(0), ITState(0) {}
- ~Session() {}
- /// InitIT - Initializes ITCounter/ITState.
- bool InitIT(unsigned short bits7_0);
- /// UpdateIT - Updates ITCounter/ITState as IT Block progresses.
- void UpdateIT();
-
-private:
- unsigned ITCounter; // Possible values: 0, 1, 2, 3, 4.
- unsigned ITState; // A2.5.2 Consists of IT[7:5] and IT[4:0] initially.
-};
-
-/// ThumbDisassembler - Thumb disassembler for all ARM platforms.
-class ThumbDisassembler : public MCDisassembler {
-public:
- /// Constructor - Initializes the disassembler.
- ///
- ThumbDisassembler() :
- MCDisassembler(), SO() {
- }
-
- ~ThumbDisassembler() {
- }
-
- /// getInstruction - See MCDisassembler.
- bool getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
- raw_ostream &vStream) const;
-
- /// getEDInfo - See MCDisassembler.
- EDInstInfo *getEDInfo() const;
-private:
- Session SO;
-};
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
deleted file mode 100644
index d89c80a..0000000
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.cpp
+++ /dev/null
@@ -1,3818 +0,0 @@
-//===- ARMDisassemblerCore.cpp - ARM disassembler helpers -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains code for the core Builder and DisassembleFP concepts used to
-// disassemble an ARM instr.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm-disassembler"
-
-#include "ARMDisassemblerCore.h"
-#include "ARMAddressingModes.h"
-#include "ARMMCExpr.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-//#define DEBUG(X) do { X; } while (0)
-
-/// ARMGenInstrInfo.inc - ARMGenInstrInfo.inc contains the static const
-/// MCInstrDesc ARMInsts[] definition and the MCOperandInfo[]'s describing the
-/// operand info for each ARMInsts[i].
-///
-/// Together with an instruction's encoding format, we can take advantage of the
-/// NumOperands and the OpInfo fields of the target instruction description in
-/// the quest to build out the MCOperand list for an MCInst.
-///
-/// The general guideline is that with a known format, the number of dst and src
-/// operands is well-known. The dst is built first, followed by the src
-/// operand(s). The operands not yet used at this point are for the Implicit
-/// Uses and Defs by this instr. For the Uses part, the pred:$p operand is
-/// defined with two components:
-///
-/// def pred { // Operand PredicateOperand
-/// ValueType Type = OtherVT;
-/// string PrintMethod = "printPredicateOperand";
-/// string AsmOperandLowerMethod = ?;
-/// dag MIOperandInfo = (ops i32imm, CCR);
-/// AsmOperandClass ParserMatchClass = ImmAsmOperand;
-/// dag DefaultOps = (ops (i32 14), (i32 zero_reg));
-/// }
-///
-/// which is manifested by the MCOperandInfo[] of:
-///
-/// { 0, 0|(1<<MCOI::Predicate), 0 },
-/// { ARM::CCRRegClassID, 0|(1<<MCOI::Predicate), 0 }
-///
-/// So the first predicate MCOperand corresponds to the immediate part of the
-/// ARM condition field (Inst{31-28}), and the second predicate MCOperand
-/// corresponds to a register kind of ARM::CPSR.
-///
-/// For the Defs part, in the simple case of only cc_out:$s, we have:
-///
-/// def cc_out { // Operand OptionalDefOperand
-/// ValueType Type = OtherVT;
-/// string PrintMethod = "printSBitModifierOperand";
-/// string AsmOperandLowerMethod = ?;
-/// dag MIOperandInfo = (ops CCR);
-/// AsmOperandClass ParserMatchClass = ImmAsmOperand;
-/// dag DefaultOps = (ops (i32 zero_reg));
-/// }
-///
-/// which is manifested by the one MCOperandInfo of:
-///
-/// { ARM::CCRRegClassID, 0|(1<<MCOI::OptionalDef), 0 }
-///
-
-namespace llvm {
-extern MCInstrDesc ARMInsts[];
-}
-
-using namespace llvm;
-
-const char *ARMUtils::OpcodeName(unsigned Opcode) {
- return ARMInsts[Opcode].Name;
-}
-
-// Return the register enum based on RegClass and the raw register number.
-// FIXME: Auto-gened?
-static unsigned
-getRegisterEnum(BO B, unsigned RegClassID, unsigned RawRegister) {
- if (RegClassID == ARM::rGPRRegClassID) {
-    // Check for the register numbers 13 and 15, which are not permitted for
-    // many Thumb register specifiers.
- if (RawRegister == 13 || RawRegister == 15) {
- B->SetErr(-1);
- return 0;
- }
- // For this purpose, we can treat rGPR as if it were GPR.
- RegClassID = ARM::GPRRegClassID;
- }
-
- // See also decodeNEONRd(), decodeNEONRn(), decodeNEONRm().
- // A7.3 register encoding
- // Qd -> bit[12] == 0
- // Qn -> bit[16] == 0
- // Qm -> bit[0] == 0
- //
- // If one of these bits is 1, the instruction is UNDEFINED.
- if (RegClassID == ARM::QPRRegClassID && slice(RawRegister, 0, 0) == 1) {
- B->SetErr(-1);
- return 0;
- }
- unsigned RegNum =
- RegClassID == ARM::QPRRegClassID ? RawRegister >> 1 : RawRegister;
-
- switch (RegNum) {
- default:
- break;
- case 0:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R0;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D0;
- case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
- case ARM::QPR_VFP2RegClassID:
- return ARM::Q0;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S0;
- }
- break;
- case 1:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R1;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D1;
- case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
- case ARM::QPR_VFP2RegClassID:
- return ARM::Q1;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S1;
- }
- break;
- case 2:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R2;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D2;
- case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
- case ARM::QPR_VFP2RegClassID:
- return ARM::Q2;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S2;
- }
- break;
- case 3:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R3;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D3;
- case ARM::QPRRegClassID: case ARM::QPR_8RegClassID:
- case ARM::QPR_VFP2RegClassID:
- return ARM::Q3;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S3;
- }
- break;
- case 4:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R4;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D4;
- case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q4;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S4;
- }
- break;
- case 5:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R5;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D5;
- case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q5;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S5;
- }
- break;
- case 6:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R6;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D6;
- case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q6;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S6;
- }
- break;
- case 7:
- switch (RegClassID) {
- case ARM::GPRRegClassID: case ARM::tGPRRegClassID: return ARM::R7;
- case ARM::DPRRegClassID: case ARM::DPR_8RegClassID:
- case ARM::DPR_VFP2RegClassID:
- return ARM::D7;
- case ARM::QPRRegClassID: case ARM::QPR_VFP2RegClassID: return ARM::Q7;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S7;
- }
- break;
- case 8:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::R8;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D8;
- case ARM::QPRRegClassID: return ARM::Q8;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S8;
- }
- break;
- case 9:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::R9;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D9;
- case ARM::QPRRegClassID: return ARM::Q9;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S9;
- }
- break;
- case 10:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::R10;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D10;
- case ARM::QPRRegClassID: return ARM::Q10;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S10;
- }
- break;
- case 11:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::R11;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D11;
- case ARM::QPRRegClassID: return ARM::Q11;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S11;
- }
- break;
- case 12:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::R12;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D12;
- case ARM::QPRRegClassID: return ARM::Q12;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S12;
- }
- break;
- case 13:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::SP;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D13;
- case ARM::QPRRegClassID: return ARM::Q13;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S13;
- }
- break;
- case 14:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::LR;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D14;
- case ARM::QPRRegClassID: return ARM::Q14;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S14;
- }
- break;
- case 15:
- switch (RegClassID) {
- case ARM::GPRRegClassID: return ARM::PC;
- case ARM::DPRRegClassID: case ARM::DPR_VFP2RegClassID: return ARM::D15;
- case ARM::QPRRegClassID: return ARM::Q15;
- case ARM::SPRRegClassID: case ARM::SPR_8RegClassID: return ARM::S15;
- }
- break;
- case 16:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D16;
- case ARM::SPRRegClassID: return ARM::S16;
- }
- break;
- case 17:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D17;
- case ARM::SPRRegClassID: return ARM::S17;
- }
- break;
- case 18:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D18;
- case ARM::SPRRegClassID: return ARM::S18;
- }
- break;
- case 19:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D19;
- case ARM::SPRRegClassID: return ARM::S19;
- }
- break;
- case 20:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D20;
- case ARM::SPRRegClassID: return ARM::S20;
- }
- break;
- case 21:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D21;
- case ARM::SPRRegClassID: return ARM::S21;
- }
- break;
- case 22:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D22;
- case ARM::SPRRegClassID: return ARM::S22;
- }
- break;
- case 23:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D23;
- case ARM::SPRRegClassID: return ARM::S23;
- }
- break;
- case 24:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D24;
- case ARM::SPRRegClassID: return ARM::S24;
- }
- break;
- case 25:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D25;
- case ARM::SPRRegClassID: return ARM::S25;
- }
- break;
- case 26:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D26;
- case ARM::SPRRegClassID: return ARM::S26;
- }
- break;
- case 27:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D27;
- case ARM::SPRRegClassID: return ARM::S27;
- }
- break;
- case 28:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D28;
- case ARM::SPRRegClassID: return ARM::S28;
- }
- break;
- case 29:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D29;
- case ARM::SPRRegClassID: return ARM::S29;
- }
- break;
- case 30:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D30;
- case ARM::SPRRegClassID: return ARM::S30;
- }
- break;
- case 31:
- switch (RegClassID) {
- case ARM::DPRRegClassID: return ARM::D31;
- case ARM::SPRRegClassID: return ARM::S31;
- }
- break;
- }
- DEBUG(errs() << "Invalid (RegClassID, RawRegister) combination\n");
- // Encoding error. Mark the builder with error code != 0.
- B->SetErr(-1);
- return 0;
-}
-
-///////////////////////////////
-// //
-// Utility Functions //
-// //
-///////////////////////////////
-
-// Extract/Decode Rd: Inst{15-12}.
-static inline unsigned decodeRd(uint32_t insn) {
- return (insn >> ARMII::RegRdShift) & ARMII::GPRRegMask;
-}
-
-// Extract/Decode Rn: Inst{19-16}.
-static inline unsigned decodeRn(uint32_t insn) {
- return (insn >> ARMII::RegRnShift) & ARMII::GPRRegMask;
-}
-
-// Extract/Decode Rm: Inst{3-0}.
-static inline unsigned decodeRm(uint32_t insn) {
- return (insn & ARMII::GPRRegMask);
-}
-
-// Extract/Decode Rs: Inst{11-8}.
-static inline unsigned decodeRs(uint32_t insn) {
- return (insn >> ARMII::RegRsShift) & ARMII::GPRRegMask;
-}
-
-static inline unsigned getCondField(uint32_t insn) {
- return (insn >> ARMII::CondShift);
-}
-
-static inline unsigned getIBit(uint32_t insn) {
- return (insn >> ARMII::I_BitShift) & 1;
-}
-
-static inline unsigned getAM3IBit(uint32_t insn) {
- return (insn >> ARMII::AM3_I_BitShift) & 1;
-}
-
-static inline unsigned getPBit(uint32_t insn) {
- return (insn >> ARMII::P_BitShift) & 1;
-}
-
-static inline unsigned getUBit(uint32_t insn) {
- return (insn >> ARMII::U_BitShift) & 1;
-}
-
-static inline unsigned getPUBits(uint32_t insn) {
- return (insn >> ARMII::U_BitShift) & 3;
-}
-
-static inline unsigned getSBit(uint32_t insn) {
- return (insn >> ARMII::S_BitShift) & 1;
-}
-
-static inline unsigned getWBit(uint32_t insn) {
- return (insn >> ARMII::W_BitShift) & 1;
-}
-
-static inline unsigned getDBit(uint32_t insn) {
- return (insn >> ARMII::D_BitShift) & 1;
-}
-
-static inline unsigned getNBit(uint32_t insn) {
- return (insn >> ARMII::N_BitShift) & 1;
-}
-
-static inline unsigned getMBit(uint32_t insn) {
- return (insn >> ARMII::M_BitShift) & 1;
-}
-
-// See A8.4 Shifts applied to a register.
-// A8.4.2 Register controlled shifts.
-//
-// getShiftOpcForBits - getShiftOpcForBits translates from the ARM encoding bits
-// into llvm enums for the shift opcode. Callers must pass in a two-bit
-// encoded value; the assert catches incorrect API usage.
-//
-// A8-12: DecodeRegShift()
-static inline ARM_AM::ShiftOpc getShiftOpcForBits(unsigned bits) {
- switch (bits) {
- default: assert(0 && "No such value"); return ARM_AM::no_shift;
- case 0: return ARM_AM::lsl;
- case 1: return ARM_AM::lsr;
- case 2: return ARM_AM::asr;
- case 3: return ARM_AM::ror;
- }
-}
-
-// See A8.4 Shifts applied to a register.
-// A8.4.1 Constant shifts.
-//
-// getImmShiftSE - getImmShiftSE translates from the raw ShiftOpc and raw Imm5
-// encodings into the intended ShiftOpc and shift amount.
-//
-// A8-11: DecodeImmShift()
-static inline void getImmShiftSE(ARM_AM::ShiftOpc &ShOp, unsigned &ShImm) {
- if (ShImm != 0)
- return;
- switch (ShOp) {
- case ARM_AM::no_shift:
- case ARM_AM::rrx:
- break;
- case ARM_AM::lsl:
- ShOp = ARM_AM::no_shift;
- break;
- case ARM_AM::lsr:
- case ARM_AM::asr:
- ShImm = 32;
- break;
- case ARM_AM::ror:
- ShOp = ARM_AM::rrx;
- break;
- }
-}
-
-// getAMSubModeForBits - getAMSubModeForBits translates from the ARM encoding
-// bits Inst{24-23} (P(24) and U(23)) into llvm enums for AMSubMode. Callers
-// must pass in a two-bit encoded value; the assert catches incorrect API
-// usage.
-static inline ARM_AM::AMSubMode getAMSubModeForBits(unsigned bits) {
- switch (bits) {
- default: assert(0 && "No such value"); return ARM_AM::bad_am_submode;
- case 1: return ARM_AM::ia; // P=0 U=1
- case 3: return ARM_AM::ib; // P=1 U=1
- case 0: return ARM_AM::da; // P=0 U=0
- case 2: return ARM_AM::db; // P=1 U=0
- }
-}
-
-////////////////////////////////////////////
-// //
-// Disassemble function definitions //
-// //
-////////////////////////////////////////////
-
-/// There is a separate Disassemble*Frm function entry for disassembly of an ARM
-/// instr into a list of MCOperands in the appropriate order, with possible dst,
-/// followed by possible src(s).
-///
-/// The processing of the predicate, and the 'S' modifier bit, if MI modifies
-/// the CPSR, is factored into ARMBasicMCBuilder's method named
-/// TryPredicateAndSBitModifier.
-
-static bool DisassemblePseudo(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO) {
-
- assert(0 && "Unexpected pseudo instruction!");
- return false;
-}
-
-// A8.6.94 MLA
-// if d == 15 || n == 15 || m == 15 || a == 15 then UNPREDICTABLE;
-//
-// A8.6.105 MUL
-// if d == 15 || n == 15 || m == 15 then UNPREDICTABLE;
-//
-// A8.6.246 UMULL
-// if dLo == 15 || dHi == 15 || n == 15 || m == 15 then UNPREDICTABLE;
-// if dHi == dLo then UNPREDICTABLE;
-static bool BadRegsMulFrm(unsigned Opcode, uint32_t insn) {
- unsigned R19_16 = slice(insn, 19, 16);
- unsigned R15_12 = slice(insn, 15, 12);
- unsigned R11_8 = slice(insn, 11, 8);
- unsigned R3_0 = slice(insn, 3, 0);
- switch (Opcode) {
- default:
- // Did we miss an opcode?
- DEBUG(errs() << "BadRegsMulFrm: unexpected opcode!");
- return false;
- case ARM::MLA: case ARM::MLS: case ARM::SMLABB: case ARM::SMLABT:
- case ARM::SMLATB: case ARM::SMLATT: case ARM::SMLAWB: case ARM::SMLAWT:
- case ARM::SMMLA: case ARM::SMMLAR: case ARM::SMMLS: case ARM::SMMLSR:
- case ARM::USADA8:
- if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
- return true;
- return false;
- case ARM::MUL: case ARM::SMMUL: case ARM::SMMULR:
- case ARM::SMULBB: case ARM::SMULBT: case ARM::SMULTB: case ARM::SMULTT:
- case ARM::SMULWB: case ARM::SMULWT: case ARM::SMUAD: case ARM::SMUADX:
- // A8.6.167 SMLAD & A8.6.172 SMLSD
- case ARM::SMLAD: case ARM::SMLADX: case ARM::SMLSD: case ARM::SMLSDX:
- case ARM::USAD8:
- if (R19_16 == 15 || R11_8 == 15 || R3_0 == 15)
- return true;
- return false;
- case ARM::SMLAL: case ARM::SMULL: case ARM::UMAAL: case ARM::UMLAL:
- case ARM::UMULL:
- case ARM::SMLALBB: case ARM::SMLALBT: case ARM::SMLALTB: case ARM::SMLALTT:
- case ARM::SMLALD: case ARM::SMLALDX: case ARM::SMLSLD: case ARM::SMLSLDX:
- if (R19_16 == 15 || R15_12 == 15 || R11_8 == 15 || R3_0 == 15)
- return true;
- if (R19_16 == R15_12)
- return true;
-    return false;
- }
-}
-
-// Multiply Instructions.
-// MLA, MLS, SMLABB, SMLABT, SMLATB, SMLATT, SMLAWB, SMLAWT, SMMLA, SMMLAR,
-// SMMLS, SMMLSR, SMLAD, SMLADX, SMLSD, SMLSDX, and USADA8 (for convenience):
-// Rd{19-16} Rn{3-0} Rm{11-8} Ra{15-12}
-// But note that register checking for {SMLAD, SMLADX, SMLSD, SMLSDX} is
-// only for {d, n, m}.
-//
-// MUL, SMMUL, SMMULR, SMULBB, SMULBT, SMULTB, SMULTT, SMULWB, SMULWT, SMUAD,
-// SMUADX, and USAD8 (for convenience):
-// Rd{19-16} Rn{3-0} Rm{11-8}
-//
-// SMLAL, SMULL, UMAAL, UMLAL, UMULL, SMLALBB, SMLALBT, SMLALTB, SMLALTT,
-// SMLALD, SMLALDX, SMLSLD, SMLSLDX:
-// RdLo{15-12} RdHi{19-16} Rn{3-0} Rm{11-8}
-//
-// The mapping of the multiply registers to the "regular" ARM registers, where
-// there are convenience decoder functions, is:
-//
-// Inst{15-12} => Rd
-// Inst{19-16} => Rn
-// Inst{3-0} => Rm
-// Inst{11-8} => Rs
-static bool DisassembleMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- unsigned short NumDefs = MCID.getNumDefs();
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumDefs > 0 && "NumDefs should be greater than 0 for MulFrm");
- assert(NumOps >= 3
- && OpInfo[0].RegClass == ARM::GPRRegClassID
- && OpInfo[1].RegClass == ARM::GPRRegClassID
- && OpInfo[2].RegClass == ARM::GPRRegClassID
- && "Expect three register operands");
-
- // Sanity check for the register encodings.
- if (BadRegsMulFrm(Opcode, insn))
- return false;
-
- // Instructions with two destination registers have RdLo{15-12} first.
- if (NumDefs == 2) {
- assert(NumOps >= 4 && OpInfo[3].RegClass == ARM::GPRRegClassID &&
- "Expect 4th register operand");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
- }
-
- // The destination register: RdHi{19-16} or Rd{19-16}.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
-
-  // The two src registers: Rn{3-0}, then Rm{11-8}.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRs(insn))));
- OpIdx += 3;
-
- // Many multiply instructions (e.g., MLA) have three src registers.
- // The third register operand is Ra{15-12}.
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
- }
-
- return true;
-}
-
-// Helper routines for disassembly of coprocessor instructions.
-
-static bool LdStCopOpcode(unsigned Opcode) {
- if ((Opcode >= ARM::LDC2L_OFFSET && Opcode <= ARM::LDC_PRE) ||
- (Opcode >= ARM::STC2L_OFFSET && Opcode <= ARM::STC_PRE))
- return true;
- return false;
-}
-static bool CoprocessorOpcode(unsigned Opcode) {
- if (LdStCopOpcode(Opcode))
- return true;
-
- switch (Opcode) {
- default:
- return false;
- case ARM::CDP: case ARM::CDP2:
- case ARM::MCR: case ARM::MCR2: case ARM::MRC: case ARM::MRC2:
- case ARM::MCRR: case ARM::MCRR2: case ARM::MRRC: case ARM::MRRC2:
- return true;
- }
-}
-static inline unsigned GetCoprocessor(uint32_t insn) {
- return slice(insn, 11, 8);
-}
-static inline unsigned GetCopOpc1(uint32_t insn, bool CDP) {
- return CDP ? slice(insn, 23, 20) : slice(insn, 23, 21);
-}
-static inline unsigned GetCopOpc2(uint32_t insn) {
- return slice(insn, 7, 5);
-}
-static inline unsigned GetCopOpc(uint32_t insn) {
- return slice(insn, 7, 4);
-}
-// Most of the operands are in immediate forms, except Rd and Rn, which are ARM
-// core registers.
-//
-// CDP, CDP2: cop opc1 CRd CRn CRm opc2
-//
-// MCR, MCR2, MRC, MRC2: cop opc1 Rd CRn CRm opc2
-//
-// MCRR, MCRR2, MRRC, MRRC2: cop opc Rd Rn CRm
-//
-// LDC_OFFSET, LDC_PRE, LDC_POST: cop CRd Rn R0 [+/-]imm8:00
-// and friends
-// STC_OFFSET, STC_PRE, STC_POST: cop CRd Rn R0 [+/-]imm8:00
-// and friends
-// <-- addrmode2 -->
-//
-// LDC_OPTION: cop CRd Rn imm8
-// and friends
-// STC_OPTION: cop CRd Rn imm8
-// and friends
-//
-static bool DisassembleCoprocessor(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 4 && "Num of operands >= 4 for coprocessor instr");
-
- unsigned &OpIdx = NumOpsAdded;
- // A8.6.92
- // if coproc == '101x' then SEE "Advanced SIMD and VFP"
- // But since the special instructions have more explicit encoding bits
- // specified, if coproc == 10 or 11, we should reject it as invalid.
- unsigned coproc = GetCoprocessor(insn);
- if ((Opcode == ARM::MCR || Opcode == ARM::MCRR ||
- Opcode == ARM::MRC || Opcode == ARM::MRRC) &&
- (coproc == 10 || coproc == 11)) {
- DEBUG(errs() << "Encoding error: coproc == 10 or 11 for MCR[R]/MR[R]C\n");
- return false;
- }
-
- bool OneCopOpc = (Opcode == ARM::MCRR || Opcode == ARM::MCRR2 ||
- Opcode == ARM::MRRC || Opcode == ARM::MRRC2);
-
- // CDP/CDP2 has no GPR operand; the opc1 operand is also wider (Inst{23-20}).
- bool NoGPR = (Opcode == ARM::CDP || Opcode == ARM::CDP2);
- bool LdStCop = LdStCopOpcode(Opcode);
- bool RtOut = (Opcode == ARM::MRC || Opcode == ARM::MRC2);
-
- OpIdx = 0;
-
- if (RtOut) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
- }
- MI.addOperand(MCOperand::CreateImm(coproc));
- ++OpIdx;
-
- if (LdStCop) {
-    // Unindexed form if P:W == 0b00 --> _OPTION variant
- unsigned PW = getPBit(insn) << 1 | getWBit(insn);
-
- MI.addOperand(MCOperand::CreateImm(decodeRd(insn)));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- OpIdx += 2;
-
- if (PW) {
- MI.addOperand(MCOperand::CreateReg(0));
- ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- unsigned IndexMode =
- (MCID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
- unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, slice(insn, 7, 0) << 2,
- ARM_AM::no_shift, IndexMode);
- MI.addOperand(MCOperand::CreateImm(Offset));
- OpIdx += 2;
- } else {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 0)));
- ++OpIdx;
- }
- } else {
- MI.addOperand(MCOperand::CreateImm(OneCopOpc ? GetCopOpc(insn)
- : GetCopOpc1(insn, NoGPR)));
- ++OpIdx;
-
- if (!RtOut) {
- MI.addOperand(NoGPR ? MCOperand::CreateImm(decodeRd(insn))
- : MCOperand::CreateReg(
- getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
- }
-
- MI.addOperand(OneCopOpc ? MCOperand::CreateReg(
- getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn)))
- : MCOperand::CreateImm(decodeRn(insn)));
-
- MI.addOperand(MCOperand::CreateImm(decodeRm(insn)));
-
- OpIdx += 2;
-
- if (!OneCopOpc) {
- MI.addOperand(MCOperand::CreateImm(GetCopOpc2(insn)));
- ++OpIdx;
- }
- }
-
- return true;
-}
-
-// Branch Instructions.
-// BL: SignExtend(Imm24:'00', 32)
-// Bcc, BL_pred: SignExtend(Imm24:'00', 32) Pred0 Pred1
-// SMC: ZeroExtend(imm4, 32)
-// SVC: ZeroExtend(Imm24, 32)
-//
-// Various coprocessor instructions are assigned BrFrm arbitrarily.
-// Delegates to DisassembleCoprocessor() helper function.
-//
-// MRS/MRSsys: Rd
-// MSR/MSRsys: Rm mask=Inst{19-16}
-// BXJ: Rm
-// MSRi/MSRsysi: so_imm
-// SRSW/SRS: ldstm_mode:$amode mode_imm
-// RFEW/RFE: ldstm_mode:$amode Rn
-static bool DisassembleBrFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- if (CoprocessorOpcode(Opcode))
- return DisassembleCoprocessor(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- // MRS and MRSsys take one GPR reg Rd.
- if (Opcode == ARM::MRS || Opcode == ARM::MRSsys) {
- assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- NumOpsAdded = 1;
- return true;
- }
- // BXJ takes one GPR reg Rm.
- if (Opcode == ARM::BXJ) {
- assert(NumOps >= 1 && OpInfo[0].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- NumOpsAdded = 1;
- return true;
- }
-  // MSR takes a mask, followed by one GPR reg Rm. The mask contains the R Bit
-  // in bit 4, and the special register fields in bits 3-0.
- if (Opcode == ARM::MSR) {
- assert(NumOps >= 1 && OpInfo[1].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ |
- slice(insn, 19, 16) /* Special Reg */ ));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- NumOpsAdded = 2;
- return true;
- }
-  // MSRi takes a mask, followed by one so_imm operand. The mask contains the
- // R Bit in bit 4, and the special register fields in bits 3-0.
- if (Opcode == ARM::MSRi) {
- // A5.2.11 MSR (immediate), and hints & B6.1.6 MSR (immediate)
- // The hints instructions have more specific encodings, so if mask == 0,
- // we should reject this as an invalid instruction.
- if (slice(insn, 19, 16) == 0)
- return false;
- MI.addOperand(MCOperand::CreateImm(slice(insn, 22, 22) << 4 /* R Bit */ |
- slice(insn, 19, 16) /* Special Reg */ ));
- // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0.
- // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}.
- // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot().
- unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF;
- unsigned Imm = insn & 0xFF;
- MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot)));
- NumOpsAdded = 2;
- return true;
- }
- if (Opcode == ARM::SRSW || Opcode == ARM::SRS ||
- Opcode == ARM::RFEW || Opcode == ARM::RFE) {
- ARM_AM::AMSubMode SubMode = getAMSubModeForBits(getPUBits(insn));
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM4ModeImm(SubMode)));
-
- if (Opcode == ARM::SRSW || Opcode == ARM::SRS)
- MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));
- else
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- NumOpsAdded = 3;
- return true;
- }
-
- assert((Opcode == ARM::Bcc || Opcode == ARM::BL || Opcode == ARM::BL_pred
- || Opcode == ARM::SMC || Opcode == ARM::SVC) &&
- "Unexpected Opcode");
-
- assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected");
-
- int Imm32 = 0;
- if (Opcode == ARM::SMC) {
- // ZeroExtend(imm4, 32) where imm24 = Inst{3-0}.
- Imm32 = slice(insn, 3, 0);
- } else if (Opcode == ARM::SVC) {
- // ZeroExtend(imm24, 32) where imm24 = Inst{23-0}.
- Imm32 = slice(insn, 23, 0);
- } else {
- // SignExtend(imm24:'00', 32) where imm24 = Inst{23-0}.
- unsigned Imm26 = slice(insn, 23, 0) << 2;
- //Imm32 = signextend<signed int, 26>(Imm26);
- Imm32 = SignExtend32<26>(Imm26);
- }
-
- MI.addOperand(MCOperand::CreateImm(Imm32));
- NumOpsAdded = 1;
-
- return true;
-}
-
-// Misc. Branch Instructions.
-// BX_RET, MOVPCLR
-// BLX, BLX_pred, BX, BX_pred
-// BLXi
-static bool DisassembleBrMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- // BX_RET and MOVPCLR have only two predicate operands; do an early return.
- if (Opcode == ARM::BX_RET || Opcode == ARM::MOVPCLR)
- return true;
-
- // BLX and BX take one GPR reg.
- if (Opcode == ARM::BLX || Opcode == ARM::BLX_pred ||
- Opcode == ARM::BX || Opcode == ARM::BX_pred) {
- assert(NumOps >= 1 && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- OpIdx = 1;
- return true;
- }
-
- // BLXi takes imm32 (the PC offset).
- if (Opcode == ARM::BLXi) {
- assert(NumOps >= 1 && OpInfo[0].RegClass < 0 && "Imm operand expected");
- // SignExtend(imm24:H:'0', 32) where imm24 = Inst{23-0} and H = Inst{24}.
- unsigned Imm26 = slice(insn, 23, 0) << 2 | slice(insn, 24, 24) << 1;
- int Imm32 = SignExtend32<26>(Imm26);
- MI.addOperand(MCOperand::CreateImm(Imm32));
- OpIdx = 1;
- return true;
- }
-
- return false;
-}
-
-static inline bool getBFCInvMask(uint32_t insn, uint32_t &mask) {
- uint32_t lsb = slice(insn, 11, 7);
- uint32_t msb = slice(insn, 20, 16);
- uint32_t Val = 0;
- if (msb < lsb) {
- DEBUG(errs() << "Encoding error: msb < lsb\n");
- return false;
- }
-
- for (uint32_t i = lsb; i <= msb; ++i)
- Val |= (1 << i);
- mask = ~Val;
- return true;
-}
-
-// Standard data-processing instructions allow PC as a register specifier,
-// but we should reject other DPFrm instructions that use PC as a register.
-static bool BadRegsDPFrm(unsigned Opcode, uint32_t insn) {
- switch (Opcode) {
- default:
- // Did we miss an opcode?
- if (decodeRd(insn) == 15 || decodeRn(insn) == 15 || decodeRm(insn) == 15) {
- DEBUG(errs() << "DPFrm with bad reg specifier(s)\n");
- return true;
- }
- case ARM::ADCrr: case ARM::ADDSrr: case ARM::ADDrr: case ARM::ANDrr:
- case ARM::BICrr: case ARM::CMNzrr: case ARM::CMPrr: case ARM::EORrr:
- case ARM::ORRrr: case ARM::RSBrr: case ARM::RSCrr: case ARM::SBCrr:
- case ARM::SUBSrr: case ARM::SUBrr: case ARM::TEQrr: case ARM::TSTrr:
- return false;
- }
-}
-
-// A major complication is that some of the saturating add/subtract
-// operations have Rd Rm Rn, instead of the "normal" Rd Rn Rm.
-// They are QADD, QDADD, QDSUB, and QSUB.
-static bool DisassembleDPFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- unsigned short NumDefs = MCID.getNumDefs();
- bool isUnary = isUnaryDP(MCID.TSFlags);
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- // Disassemble register def if there is one.
- if (NumDefs && (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID)) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
- }
-
- // Now disassemble the src operands.
- if (OpIdx >= NumOps)
- return false;
-
- // Special-case handling of BFC/BFI/SBFX/UBFX.
- if (Opcode == ARM::BFC || Opcode == ARM::BFI) {
- // A8.6.17 BFC & A8.6.18 BFI
- // Sanity check Rd.
- if (decodeRd(insn) == 15)
- return false;
- MI.addOperand(MCOperand::CreateReg(0));
- if (Opcode == ARM::BFI) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- ++OpIdx;
- }
- uint32_t mask = 0;
- if (!getBFCInvMask(insn, mask))
- return false;
-
- MI.addOperand(MCOperand::CreateImm(mask));
- OpIdx += 2;
- return true;
- }
- if (Opcode == ARM::SBFX || Opcode == ARM::UBFX) {
- // Sanity check Rd and Rm.
- if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
- return false;
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- MI.addOperand(MCOperand::CreateImm(slice(insn, 11, 7)));
- MI.addOperand(MCOperand::CreateImm(slice(insn, 20, 16) + 1));
- OpIdx += 3;
- return true;
- }
-
- bool RmRn = (Opcode == ARM::QADD || Opcode == ARM::QDADD ||
- Opcode == ARM::QDSUB || Opcode == ARM::QSUB);
-
- // BinaryDP has an Rn operand.
- if (!isUnary) {
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::GPRRegClassID,
- RmRn ? decodeRm(insn) : decodeRn(insn))));
- ++OpIdx;
- }
-
- // If this is a two-address operand, skip it, e.g., MOVCCr operand 1.
- if (isUnary && (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)) {
- MI.addOperand(MCOperand::CreateReg(0));
- ++OpIdx;
- }
-
- // Now disassemble operand 2.
- if (OpIdx >= NumOps)
- return false;
-
- if (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) {
- // We have a reg/reg form.
- // Assert disabled because saturating operations, e.g., A8.6.127 QASX, are
- // routed here as well.
- // assert(getIBit(insn) == 0 && "I_Bit != '0' reg/reg form");
- if (BadRegsDPFrm(Opcode, insn))
- return false;
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::GPRRegClassID,
- RmRn? decodeRn(insn) : decodeRm(insn))));
- ++OpIdx;
- } else if (Opcode == ARM::MOVi16 || Opcode == ARM::MOVTi16) {
- // These two instructions don't allow d as 15.
- if (decodeRd(insn) == 15)
- return false;
- // We have an imm16 = imm4:imm12 (imm4=Inst{19:16}, imm12 = Inst{11:0}).
- assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form");
- unsigned Imm16 = slice(insn, 19, 16) << 12 | slice(insn, 11, 0);
- if (!B->tryAddingSymbolicOperand(Imm16, 4, MI))
- MI.addOperand(MCOperand::CreateImm(Imm16));
- ++OpIdx;
- } else {
- // We have a reg/imm form.
- // SOImm is 4-bit rotate amount in bits 11-8 with 8-bit imm in bits 7-0.
- // A5.2.4 Rotate amount is twice the numeric value of Inst{11-8}.
- // See also ARMAddressingModes.h: getSOImmValImm() and getSOImmValRot().
- assert(getIBit(insn) == 1 && "I_Bit != '1' reg/imm form");
- unsigned Rot = (insn >> ARMII::SoRotImmShift) & 0xF;
- unsigned Imm = insn & 0xFF;
- MI.addOperand(MCOperand::CreateImm(ARM_AM::rotr32(Imm, 2*Rot)));
- ++OpIdx;
- }
-
- return true;
-}
-
-static bool DisassembleDPSoRegFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- unsigned short NumDefs = MCID.getNumDefs();
- bool isUnary = isUnaryDP(MCID.TSFlags);
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- // Disassemble register def if there is one.
- if (NumDefs && (OpInfo[OpIdx].RegClass == ARM::GPRRegClassID)) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
- }
-
- // Disassemble the src operands.
- if (OpIdx >= NumOps)
- return false;
-
- // BinaryDP has an Rn operand.
- if (!isUnary) {
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- // If this is a two-address operand, skip it, e.g., MOVCCs operand 1.
- if (isUnary && (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)) {
- MI.addOperand(MCOperand::CreateReg(0));
- ++OpIdx;
- }
-
- // Disassemble operand 2, which consists of three components.
- if (OpIdx + 2 >= NumOps)
- return false;
-
- assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) &&
- (OpInfo[OpIdx+1].RegClass == ARM::GPRRegClassID) &&
- (OpInfo[OpIdx+2].RegClass < 0) &&
- "Expect 3 reg operands");
-
- // Register-controlled shifts have Inst{7} = 0 and Inst{4} = 1.
- unsigned Rs = slice(insn, 4, 4);
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- if (Rs) {
- // If Inst{7} != 0, we should reject this insn as an invalid encoding.
- if (slice(insn, 7, 7))
- return false;
-
- // A8.6.3 ADC (register-shifted register)
- // if d == 15 || n == 15 || m == 15 || s == 15 then UNPREDICTABLE;
- //
- // This also accounts for shift instructions (register) where, fortunately,
- // Inst{19-16} = 0b0000.
- // A8.6.89 LSL (register)
- // if d == 15 || n == 15 || m == 15 then UNPREDICTABLE;
- if (decodeRd(insn) == 15 || decodeRn(insn) == 15 ||
- decodeRm(insn) == 15 || decodeRs(insn) == 15)
- return false;
-
- // Register-controlled shifts: [Rm, Rs, shift].
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRs(insn))));
- // Inst{6-5} encodes the shift opcode.
- ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, 0)));
- } else {
- // Constant shifts: [Rm, reg0, shift_imm].
- MI.addOperand(MCOperand::CreateReg(0)); // NoRegister
- // Inst{6-5} encodes the shift opcode.
- ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
- // Inst{11-7} encodes the imm5 shift amount.
- unsigned ShImm = slice(insn, 11, 7);
-
- // A8.4.1. Possible rrx or shift amount of 32...
- getImmShiftSE(ShOp, ShImm);
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShImm)));
- }
- OpIdx += 3;
-
- return true;
-}
-
-static bool BadRegsLdStFrm(unsigned Opcode, uint32_t insn, bool Store, bool WBack,
- bool Imm) {
- const StringRef Name = ARMInsts[Opcode].Name;
- unsigned Rt = decodeRd(insn);
- unsigned Rn = decodeRn(insn);
- unsigned Rm = decodeRm(insn);
- unsigned P = getPBit(insn);
- unsigned W = getWBit(insn);
-
- if (Store) {
-    // STRB does not allow PC as the source (only STR does).
- if (Name.startswith("STRB") && Rt == 15) {
- DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
- return true;
- }
- if (WBack && (Rn == 15 || Rn == Rt)) {
- DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n");
- return true;
- }
- if (!Imm && Rm == 15) {
- DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n");
- return true;
- }
- } else {
-    // LDRB does not allow PC as the destination (only LDR does).
- if (Name.startswith("LDRB") && Rt == 15) {
- DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
- return true;
- }
- if (Imm) {
- // Immediate
- if (Rn == 15) {
- // The literal form must be in offset mode; it's an encoding error
- // otherwise.
- if (!(P == 1 && W == 0)) {
- DEBUG(errs() << "Ld literal form with !(P == 1 && W == 0)\n");
- return true;
- }
- // LDRB (literal) does not allow PC as the destination.
- if (Opcode != ARM::LDRi12 && Rt == 15) {
- DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
- return true;
- }
- } else {
- // Write back while Rn == Rt does not make sense.
- if (WBack && (Rn == Rt)) {
- DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n");
- return true;
- }
- }
- } else {
- // Register
- if (Rm == 15) {
- DEBUG(errs() << "if m == 15 then UNPREDICTABLE\n");
- return true;
- }
- if (WBack && (Rn == 15 || Rn == Rt)) {
- DEBUG(errs() << "if wback && (n == 15 || n == t) then UNPREDICTABLE\n");
- return true;
- }
- }
- }
- return false;
-}
-
-static bool DisassembleLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- bool isPrePost = isPrePostLdSt(MCID.TSFlags);
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- if (!OpInfo) return false;
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(((!isStore && MCID.getNumDefs() > 0) ||
- (isStore && (MCID.getNumDefs() == 0 || isPrePost)))
- && "Invalid arguments");
-
-  // Operand 0 of a pre- or post-indexed store is the address base writeback.
- if (isPrePost && isStore) {
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- // Disassemble the dst/src operand.
- if (OpIdx >= NumOps)
- return false;
-
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
-
-  // After the dst of a pre- or post-indexed load comes the base writeback.
- if (isPrePost && !isStore) {
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- // Disassemble the base operand.
- if (OpIdx >= NumOps)
- return false;
-
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- assert((!isPrePost || (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1))
- && "Index mode or tied_to operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
-
-  // For the reg/reg form, the base reg is followed by +/- reg, shift op, imm.
-  // For the immediate form, it is followed by +/- imm12.
- // See also ARMAddressingModes.h (Addressing Mode #2).
- if (OpIdx + 1 >= NumOps)
- return false;
-
- if (BadRegsLdStFrm(Opcode, insn, isStore, isPrePost, getIBit(insn)==0))
- return false;
-
- ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
- unsigned IndexMode =
- (MCID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
- if (getIBit(insn) == 0) {
- // For pre- and post-indexed case, add a reg0 operand (Addressing Mode #2).
- // Otherwise, skip the reg operand since for addrmode_imm12, Rn has already
- // been populated.
- if (isPrePost) {
- MI.addOperand(MCOperand::CreateReg(0));
- OpIdx += 1;
- }
-
- unsigned Imm12 = slice(insn, 11, 0);
- if (Opcode == ARM::LDRBi12 || Opcode == ARM::LDRi12 ||
- Opcode == ARM::STRBi12 || Opcode == ARM::STRi12) {
- // Disassemble the 12-bit immediate offset, which is the second operand in
- // $addrmode_imm12 => (ops GPR:$base, i32imm:$offsimm).
- int Offset = AddrOpcode == ARM_AM::add ? 1 * Imm12 : -1 * Imm12;
- MI.addOperand(MCOperand::CreateImm(Offset));
- } else {
- // Disassemble the 12-bit immediate offset, which is the second operand in
- // $am2offset => (ops GPR, i32imm).
- unsigned Offset = ARM_AM::getAM2Opc(AddrOpcode, Imm12, ARM_AM::no_shift,
- IndexMode);
- MI.addOperand(MCOperand::CreateImm(Offset));
- }
- OpIdx += 1;
- } else {
- // If Inst{25} = 1 and Inst{4} != 0, we should reject this as invalid.
- if (slice(insn,4,4) == 1)
- return false;
-
- // Disassemble the offset reg (Rm), shift type, and immediate shift length.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- // Inst{6-5} encodes the shift opcode.
- ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
- // Inst{11-7} encodes the imm5 shift amount.
- unsigned ShImm = slice(insn, 11, 7);
-
- // A8.4.1. Possible rrx or shift amount of 32...
- getImmShiftSE(ShOp, ShImm);
- MI.addOperand(MCOperand::CreateImm(
- ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp, IndexMode)));
- OpIdx += 2;
- }
-
- return true;
-}
-
-static bool DisassembleLdFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
- return DisassembleLdStFrm(MI, Opcode, insn, NumOps, NumOpsAdded, false, B);
-}
-
-static bool DisassembleStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
- return DisassembleLdStFrm(MI, Opcode, insn, NumOps, NumOpsAdded, true, B);
-}
-
-static bool HasDualReg(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
- case ARM::LDRD: case ARM::LDRD_PRE: case ARM::LDRD_POST:
- case ARM::STRD: case ARM::STRD_PRE: case ARM::STRD_POST:
- return true;
- }
-}
-
-static bool DisassembleLdStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, bool isStore, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- bool isPrePost = isPrePostLdSt(MCID.TSFlags);
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- if (!OpInfo) return false;
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(((!isStore && MCID.getNumDefs() > 0) ||
- (isStore && (MCID.getNumDefs() == 0 || isPrePost)))
- && "Invalid arguments");
-
-  // Operand 0 of a pre- or post-indexed store is the address base writeback.
- if (isPrePost && isStore) {
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- // Disassemble the dst/src operand.
- if (OpIdx >= NumOps)
- return false;
-
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
-
-  // Fill in the second register operand (Rt2) of LDRD and STRD.
- if (HasDualReg(Opcode)) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn) + 1)));
- ++OpIdx;
- }
-
-  // After the dst of a pre- or post-indexed load comes the base writeback.
- if (isPrePost && !isStore) {
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- // Disassemble the base operand.
- if (OpIdx >= NumOps)
- return false;
-
- assert(OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
- assert((!isPrePost || (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1))
- && "Offset mode or tied_to operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
-
- // For reg/reg form, base reg is followed by +/- reg.
- // For immediate form, it is followed by +/- imm8.
- // See also ARMAddressingModes.h (Addressing Mode #3).
- if (OpIdx + 1 >= NumOps)
- return false;
-
- assert((OpInfo[OpIdx].RegClass == ARM::GPRRegClassID) &&
- (OpInfo[OpIdx+1].RegClass < 0) &&
- "Expect 1 reg operand followed by 1 imm operand");
-
- ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
- unsigned IndexMode =
- (MCID.TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
- if (getAM3IBit(insn) == 1) {
- MI.addOperand(MCOperand::CreateReg(0));
-
- // Disassemble the 8-bit immediate offset.
- unsigned Imm4H = (insn >> ARMII::ImmHiShift) & 0xF;
- unsigned Imm4L = insn & 0xF;
- unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, (Imm4H << 4) | Imm4L,
- IndexMode);
- MI.addOperand(MCOperand::CreateImm(Offset));
- } else {
- // Disassemble the offset reg (Rm).
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- unsigned Offset = ARM_AM::getAM3Opc(AddrOpcode, 0, IndexMode);
- MI.addOperand(MCOperand::CreateImm(Offset));
- }
- OpIdx += 2;
-
- return true;
-}
-
-static bool DisassembleLdMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
- return DisassembleLdStMiscFrm(MI, Opcode, insn, NumOps, NumOpsAdded, false,
- B);
-}
-
-static bool DisassembleStMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
- return DisassembleLdStMiscFrm(MI, Opcode, insn, NumOps, NumOpsAdded, true, B);
-}
-
-// The algorithm for disassembly of LdStMulFrm is different from others because
-// it explicitly populates the two predicate operands after the base register.
-// After that, we need to populate the reglist with each affected register
-// encoded as an MCOperand.
-static bool DisassembleLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 4 && "LdStMulFrm expects NumOps >= 4");
- NumOpsAdded = 0;
-
- unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
-
- // Writeback to base, if necessary.
- if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::STMIA_UPD ||
- Opcode == ARM::LDMDA_UPD || Opcode == ARM::STMDA_UPD ||
- Opcode == ARM::LDMDB_UPD || Opcode == ARM::STMDB_UPD ||
- Opcode == ARM::LDMIB_UPD || Opcode == ARM::STMIB_UPD) {
- MI.addOperand(MCOperand::CreateReg(Base));
- ++NumOpsAdded;
- }
-
- // Add the base register operand.
- MI.addOperand(MCOperand::CreateReg(Base));
-
- // Handling the two predicate operands before the reglist.
- int64_t CondVal = getCondField(insn);
- if (CondVal == 0xF)
- return false;
- MI.addOperand(MCOperand::CreateImm(CondVal));
- MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
-
- NumOpsAdded += 3;
-
- // Fill the variadic part of reglist.
- unsigned RegListBits = insn & ((1 << 16) - 1);
- for (unsigned i = 0; i < 16; ++i) {
- if ((RegListBits >> i) & 1) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- i)));
- ++NumOpsAdded;
- }
- }
-
- return true;
-}
-
-// LDREX, LDREXB, LDREXH: Rd Rn
-// LDREXD: Rd Rd+1 Rn
-// STREX, STREXB, STREXH: Rd Rm Rn
-// STREXD: Rd Rm Rm+1 Rn
-//
-// SWP, SWPB: Rd Rm Rn
-static bool DisassembleLdStExFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2
- && OpInfo[0].RegClass == ARM::GPRRegClassID
- && OpInfo[1].RegClass == ARM::GPRRegClassID
- && "Expect 2 reg operands");
-
- bool isStore = slice(insn, 20, 20) == 0;
- bool isDW = (Opcode == ARM::LDREXD || Opcode == ARM::STREXD);
-
- // Add the destination operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
-
- // Store register Exclusive needs a source operand.
- if (isStore) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- ++OpIdx;
-
- if (isDW) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn)+1)));
- ++OpIdx;
- }
- } else if (isDW) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn)+1)));
- ++OpIdx;
- }
-
- // Finally add the pointer operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
-
- return true;
-}
-
-// Misc. Arithmetic Instructions.
-// CLZ: Rd Rm
-// PKHBT, PKHTB: Rd Rn Rm, LSL/ASR #imm5
-// RBIT, REV, REV16, REVSH: Rd Rm
-static bool DisassembleArithMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2
- && OpInfo[0].RegClass == ARM::GPRRegClassID
- && OpInfo[1].RegClass == ARM::GPRRegClassID
- && "Expect 2 reg operands");
-
- bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID;
-
- // Sanity check the registers, which should not be 15.
- if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
- return false;
- if (ThreeReg && decodeRn(insn) == 15)
- return false;
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
-
- if (ThreeReg) {
- assert(NumOps >= 4 && "Expect >= 4 operands");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- ++OpIdx;
-
- // If there is still an operand info left which is an immediate operand, add
- // an additional imm5 LSL/ASR operand.
- if (ThreeReg && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- // Extract the 5-bit immediate field Inst{11-7}.
- unsigned ShiftAmt = (insn >> ARMII::ShiftShift) & 0x1F;
- ARM_AM::ShiftOpc Opc = ARM_AM::no_shift;
- if (Opcode == ARM::PKHBT)
- Opc = ARM_AM::lsl;
- else if (Opcode == ARM::PKHTB)
- Opc = ARM_AM::asr;
- getImmShiftSE(Opc, ShiftAmt);
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShiftAmt)));
- ++OpIdx;
- }
-
- return true;
-}
-
-/// DisassembleSatFrm - Disassemble saturate instructions:
-/// SSAT, SSAT16, USAT, and USAT16.
-static bool DisassembleSatFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- // A8.6.183 SSAT
- // if d == 15 || n == 15 then UNPREDICTABLE;
- if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
- return false;
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- NumOpsAdded = MCID.getNumOperands() - 2; // ignore predicate operands
-
- // Disassemble register def.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
-
- unsigned Pos = slice(insn, 20, 16);
- if (Opcode == ARM::SSAT || Opcode == ARM::SSAT16)
- Pos += 1;
- MI.addOperand(MCOperand::CreateImm(Pos));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
-
- if (NumOpsAdded == 4) {
- ARM_AM::ShiftOpc Opc = (slice(insn, 6, 6) != 0 ? ARM_AM::asr : ARM_AM::lsl);
- // Inst{11-7} encodes the imm5 shift amount.
- unsigned ShAmt = slice(insn, 11, 7);
- if (ShAmt == 0) {
- // A8.6.183. Possible ASR shift amount of 32...
- if (Opc == ARM_AM::asr)
- ShAmt = 32;
- else
- Opc = ARM_AM::no_shift;
- }
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShAmt)));
- }
- return true;
-}
-
-// Extend instructions.
-// SXT* and UXT*: Rd [Rn] Rm [rot_imm].
-// The 2nd operand register is Rn and the 3rd operand register is Rm for the
-// three register operand form. Otherwise, Rn=0b1111 and only Rm is used.
-static bool DisassembleExtFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- // A8.6.220 SXTAB
- // if d == 15 || m == 15 then UNPREDICTABLE;
- if (decodeRd(insn) == 15 || decodeRm(insn) == 15)
- return false;
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2
- && OpInfo[0].RegClass == ARM::GPRRegClassID
- && OpInfo[1].RegClass == ARM::GPRRegClassID
- && "Expect 2 reg operands");
-
- bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass == ARM::GPRRegClassID;
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- ++OpIdx;
-
- if (ThreeReg) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- ++OpIdx;
-
- // If there is still an operand info left which is an immediate operand, add
- // an additional rotate immediate operand.
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- // Extract the 2-bit rotate field Inst{11-10}.
- unsigned rot = (insn >> ARMII::ExtRotImmShift) & 3;
- // Rotation by 8, 16, or 24 bits.
- MI.addOperand(MCOperand::CreateImm(rot << 3));
- ++OpIdx;
- }
-
- return true;
-}
-
-/////////////////////////////////////
-// //
-// Utility Functions For VFP //
-// //
-/////////////////////////////////////
-
-// Extract/Decode Dd/Sd:
-//
-// SP => d = UInt(Vd:D)
-// DP => d = UInt(D:Vd)
-static unsigned decodeVFPRd(uint32_t insn, bool isSPVFP) {
- return isSPVFP ? (decodeRd(insn) << 1 | getDBit(insn))
- : (decodeRd(insn) | getDBit(insn) << 4);
-}
-
-// Extract/Decode Dn/Sn:
-//
-// SP => n = UInt(Vn:N)
-// DP => n = UInt(N:Vn)
-static unsigned decodeVFPRn(uint32_t insn, bool isSPVFP) {
- return isSPVFP ? (decodeRn(insn) << 1 | getNBit(insn))
- : (decodeRn(insn) | getNBit(insn) << 4);
-}
-
-// Extract/Decode Dm/Sm:
-//
-// SP => m = UInt(Vm:M)
-// DP => m = UInt(M:Vm)
-static unsigned decodeVFPRm(uint32_t insn, bool isSPVFP) {
- return isSPVFP ? (decodeRm(insn) << 1 | getMBit(insn))
- : (decodeRm(insn) | getMBit(insn) << 4);
-}
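-
-// Worked example (editorial, not in the original source): with Vm = 0b0101
-// and M = 1, decodeVFPRm(insn, true) == (5 << 1) | 1 == 11 (S11), while
-// decodeVFPRm(insn, false) == 5 | (1 << 4) == 21 (D21).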
-
-// A7.5.1
-static APInt VFPExpandImm(unsigned char byte, unsigned N) {
- assert(N == 32 || N == 64);
-
- uint64_t Result;
- unsigned bit6 = slice(byte, 6, 6);
- if (N == 32) {
- Result = slice(byte, 7, 7) << 31 | slice(byte, 5, 0) << 19;
- if (bit6)
- Result |= 0x1f << 25;
- else
- Result |= 0x1 << 30;
- } else {
- Result = (uint64_t)slice(byte, 7, 7) << 63 |
- (uint64_t)slice(byte, 5, 0) << 48;
- if (bit6)
- Result |= 0xffULL << 54;
- else
- Result |= 0x1ULL << 62;
- }
- return APInt(N, Result);
-}
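-
-// Worked example (editorial, not in the original source):
-// VFPExpandImm(0x70, 32) has sign = 0 and bit6 = 1, so bits 29-25 become
-// 11111 and bits 24-19 become 110000: exponent field 0x7F with a zero
-// fraction, i.e. the single-precision value 1.0f (FCONSTS #1.0).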
-
-// VFP Unary Format Instructions:
-//
-// VCMP[E]ZD, VCMP[E]ZS: compare one floating-point register with zero
-// VCVTDS, VCVTSD: convert between double-precision and single-precision
-// The rest of the instructions have homogeneous [VFP]Rd and [VFP]Rm registers.
-static bool DisassembleVFPUnaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 1 && "VFPUnaryFrm expects NumOps >= 1");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- unsigned RegClass = OpInfo[OpIdx].RegClass;
- assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) &&
- "Reg operand expected");
- bool isSP = (RegClass == ARM::SPRRegClassID);
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass, decodeVFPRd(insn, isSP))));
- ++OpIdx;
-
- // Early return for compare with zero instructions.
- if (Opcode == ARM::VCMPEZD || Opcode == ARM::VCMPEZS
- || Opcode == ARM::VCMPZD || Opcode == ARM::VCMPZS)
- return true;
-
- RegClass = OpInfo[OpIdx].RegClass;
- assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) &&
- "Reg operand expected");
- isSP = (RegClass == ARM::SPRRegClassID);
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass, decodeVFPRm(insn, isSP))));
- ++OpIdx;
-
- return true;
-}
-
-// All the instructions have homogeneous [VFP]Rd, [VFP]Rn, and [VFP]Rm regs.
-// Some of them have operand constraints which tie the first operand in the
-// InOperandList to that of the dst. As far as asm printing is concerned, this
-// tied_to operand is simply skipped.
-static bool DisassembleVFPBinaryFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 3 && "VFPBinaryFrm expects NumOps >= 3");
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- unsigned RegClass = OpInfo[OpIdx].RegClass;
- assert((RegClass == ARM::SPRRegClassID || RegClass == ARM::DPRRegClassID) &&
- "Reg operand expected");
- bool isSP = (RegClass == ARM::SPRRegClassID);
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass, decodeVFPRd(insn, isSP))));
- ++OpIdx;
-
- // Skip tied_to operand constraint.
- if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
- assert(NumOps >= 4 && "Expect >=4 operands");
- MI.addOperand(MCOperand::CreateReg(0));
- ++OpIdx;
- }
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass, decodeVFPRn(insn, isSP))));
- ++OpIdx;
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass, decodeVFPRm(insn, isSP))));
- ++OpIdx;
-
- return true;
-}
-
-// A8.6.295 vcvt (floating-point <-> integer)
-// Int to FP: VSITOD, VSITOS, VUITOD, VUITOS
-// FP to Int: VTOSI[Z|R]D, VTOSI[Z|R]S, VTOUI[Z|R]D, VTOUI[Z|R]S
-//
-// A8.6.297 vcvt (floating-point and fixed-point)
-// Dd|Sd Dd|Sd(TIED_TO) #fbits(= 16|32 - UInt(imm4:i))
-static bool DisassembleVFPConv1Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 2 && "VFPConv1Frm expects NumOps >= 2");
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- if (!OpInfo) return false;
-
- bool SP = slice(insn, 8, 8) == 0; // A8.6.295 & A8.6.297
- bool fixed_point = slice(insn, 17, 17) == 1; // A8.6.297
- unsigned RegClassID = SP ? ARM::SPRRegClassID : ARM::DPRRegClassID;
-
- if (fixed_point) {
- // A8.6.297
- assert(NumOps >= 3 && "Expect >= 3 operands");
- int size = slice(insn, 7, 7) == 0 ? 16 : 32;
- int fbits = size - (slice(insn,3,0) << 1 | slice(insn,5,5));
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClassID,
- decodeVFPRd(insn, SP))));
-
- assert(MCID.getOperandConstraint(1, MCOI::TIED_TO) != -1 &&
- "Tied to operand expected");
- MI.addOperand(MI.getOperand(0));
-
- assert(OpInfo[2].RegClass < 0 && !OpInfo[2].isPredicate() &&
- !OpInfo[2].isOptionalDef() && "Imm operand expected");
- MI.addOperand(MCOperand::CreateImm(fbits));
-
- NumOpsAdded = 3;
- } else {
- // A8.6.295
- // The Rd (destination) and Rm (source) bits have different interpretations
-    // depending on whether the operand is single precision.
- unsigned d, m;
- if (slice(insn, 18, 18) == 1) { // to_integer operation
- d = decodeVFPRd(insn, true /* Is Single Precision */);
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::SPRRegClassID, d)));
- m = decodeVFPRm(insn, SP);
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, m)));
- } else {
- d = decodeVFPRd(insn, SP);
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, d)));
- m = decodeVFPRm(insn, true /* Is Single Precision */);
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::SPRRegClassID, m)));
- }
- NumOpsAdded = 2;
- }
-
- return true;
-}
-
-// VMOVRS - A8.6.330
-// Rt => Rd; Sn => UInt(Vn:N)
-static bool DisassembleVFPConv2Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 2 && "VFPConv2Frm expects NumOps >= 2");
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
- decodeVFPRn(insn, true))));
- NumOpsAdded = 2;
- return true;
-}
-
-// VMOVRRD - A8.6.332
-// Rt => Rd; Rt2 => Rn; Dm => UInt(M:Vm)
-//
-// VMOVRRS - A8.6.331
-// Rt => Rd; Rt2 => Rn; Sm => UInt(Vm:M); Sm1 = Sm+1
-static bool DisassembleVFPConv3Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 3 && "VFPConv3Frm expects NumOps >= 3");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- OpIdx = 2;
-
- if (OpInfo[OpIdx].RegClass == ARM::SPRRegClassID) {
- unsigned Sm = decodeVFPRm(insn, true);
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
- Sm)));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
- Sm+1)));
- OpIdx += 2;
- } else {
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::DPRRegClassID,
- decodeVFPRm(insn, false))));
- ++OpIdx;
- }
- return true;
-}
-
-// VMOVSR - A8.6.330
-// Rt => Rd; Sn => UInt(Vn:N)
-static bool DisassembleVFPConv4Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 2 && "VFPConv4Frm expects NumOps >= 2");
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
- decodeVFPRn(insn, true))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- NumOpsAdded = 2;
- return true;
-}
-
-// VMOVDRR - A8.6.332
-// Rt => Rd; Rt2 => Rn; Dm => UInt(M:Vm)
-//
-// VMOVRRS - A8.6.331
-// Rt => Rd; Rt2 => Rn; Sm => UInt(Vm:M); Sm1 = Sm+1
-static bool DisassembleVFPConv5Frm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 3 && "VFPConv5Frm expects NumOps >= 3");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- if (OpInfo[OpIdx].RegClass == ARM::SPRRegClassID) {
- unsigned Sm = decodeVFPRm(insn, true);
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
- Sm)));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::SPRRegClassID,
- Sm+1)));
- OpIdx += 2;
- } else {
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::DPRRegClassID,
- decodeVFPRm(insn, false))));
- ++OpIdx;
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- OpIdx += 2;
- return true;
-}
-
-// VFP Load/Store Instructions.
-// VLDRD, VLDRS, VSTRD, VSTRS
-static bool DisassembleVFPLdStFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 3 && "VFPLdStFrm expects NumOps >= 3");
-
- bool isSPVFP = (Opcode == ARM::VLDRS || Opcode == ARM::VSTRS);
- unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID;
-
- // Extract Dd/Sd for operand 0.
- unsigned RegD = decodeVFPRd(insn, isSPVFP);
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID, RegD)));
-
- unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
- MI.addOperand(MCOperand::CreateReg(Base));
-
- // Next comes the AM5 Opcode.
- ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
- unsigned char Imm8 = insn & 0xFF;
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getAM5Opc(AddrOpcode, Imm8)));
-
- NumOpsAdded = 3;
-
- return true;
-}
-
-// VFP Load/Store Multiple Instructions.
-// We have an optional write back reg, the base, and two predicate operands.
-// It is then followed by a reglist of either DPR(s) or SPR(s).
-//
-// VLDMD[_UPD], VLDMS[_UPD], VSTMD[_UPD], VSTMS[_UPD]
-static bool DisassembleVFPLdStMulFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 4 && "VFPLdStMulFrm expects NumOps >= 4");
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
-
- // Writeback to base, if necessary.
- if (Opcode == ARM::VLDMDIA_UPD || Opcode == ARM::VLDMSIA_UPD ||
- Opcode == ARM::VLDMDDB_UPD || Opcode == ARM::VLDMSDB_UPD ||
- Opcode == ARM::VSTMDIA_UPD || Opcode == ARM::VSTMSIA_UPD ||
- Opcode == ARM::VSTMDDB_UPD || Opcode == ARM::VSTMSDB_UPD) {
- MI.addOperand(MCOperand::CreateReg(Base));
- ++OpIdx;
- }
-
- MI.addOperand(MCOperand::CreateReg(Base));
-
- // Handling the two predicate operands before the reglist.
- int64_t CondVal = getCondField(insn);
- if (CondVal == 0xF)
- return false;
- MI.addOperand(MCOperand::CreateImm(CondVal));
- MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
-
- OpIdx += 3;
-
- bool isSPVFP = (Opcode == ARM::VLDMSIA ||
- Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMSDB_UPD ||
- Opcode == ARM::VSTMSIA ||
- Opcode == ARM::VSTMSIA_UPD || Opcode == ARM::VSTMSDB_UPD);
- unsigned RegClassID = isSPVFP ? ARM::SPRRegClassID : ARM::DPRRegClassID;
-
- // Extract Dd/Sd.
- unsigned RegD = decodeVFPRd(insn, isSPVFP);
-
- // Fill the variadic part of reglist.
- unsigned char Imm8 = insn & 0xFF;
- unsigned Regs = isSPVFP ? Imm8 : Imm8/2;
-
- // Apply some sanity checks before proceeding.
- if (Regs == 0 || (RegD + Regs) > 32 || (!isSPVFP && Regs > 16))
- return false;
-
- for (unsigned i = 0; i < Regs; ++i) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassID,
- RegD + i)));
- ++OpIdx;
- }
-
- return true;
-}
-
-// Misc. VFP Instructions.
-// FMSTAT (vmrs with Rt=0b1111, i.e., to apsr_nzcv and no register operand)
-// FCONSTD (DPR and a VFPf64Imm operand)
-// FCONSTS (SPR and a VFPf32Imm operand)
-// VMRS/VMSR (GPR operand)
-static bool DisassembleVFPMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- if (Opcode == ARM::FMSTAT)
- return true;
-
- assert(NumOps >= 2 && "VFPMiscFrm expects >=2 operands");
-
- unsigned RegEnum = 0;
- switch (OpInfo[0].RegClass) {
- case ARM::DPRRegClassID:
- RegEnum = getRegisterEnum(B, ARM::DPRRegClassID, decodeVFPRd(insn, false));
- break;
- case ARM::SPRRegClassID:
- RegEnum = getRegisterEnum(B, ARM::SPRRegClassID, decodeVFPRd(insn, true));
- break;
- case ARM::GPRRegClassID:
- RegEnum = getRegisterEnum(B, ARM::GPRRegClassID, decodeRd(insn));
- break;
- default:
- assert(0 && "Invalid reg class id");
- return false;
- }
-
- MI.addOperand(MCOperand::CreateReg(RegEnum));
- ++OpIdx;
-
- // Extract/decode the f64/f32 immediate.
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- // The asm syntax specifies the floating point value, not the 8-bit literal.
- APInt immRaw = VFPExpandImm(slice(insn,19,16) << 4 | slice(insn, 3, 0),
- Opcode == ARM::FCONSTD ? 64 : 32);
- APFloat immFP = APFloat(immRaw, true);
- double imm = Opcode == ARM::FCONSTD ? immFP.convertToDouble() :
- immFP.convertToFloat();
- MI.addOperand(MCOperand::CreateFPImm(imm));
-
- ++OpIdx;
- }
-
- return true;
-}
-
-// DisassembleThumbFrm() is defined in ThumbDisassemblerCore.h file.
-#include "ThumbDisassemblerCore.h"
-
-/////////////////////////////////////////////////////
-// //
-// Utility Functions For ARM Advanced SIMD //
-// //
-/////////////////////////////////////////////////////
-
-// The following NEON namings are based on A8.6.266 VABA, VABAL. Notice that
-// A8.6.303 VDUP (ARM core register)'s D/Vd pair is the N/Vn pair of VABA/VABAL.
-
-// A7.3 Register encoding
-
-// Extract/Decode NEON D/Vd:
-//
-// Note that for quadword, Qd = UInt(D:Vd<3:1>) = Inst{22:15-13}, whereas for
-// doubleword, Dd = UInt(D:Vd). We compensate for this difference by
-// handling it in the getRegisterEnum() utility function.
-// D = Inst{22}, Vd = Inst{15-12}
-static unsigned decodeNEONRd(uint32_t insn) {
- return ((insn >> ARMII::NEON_D_BitShift) & 1) << 4
- | ((insn >> ARMII::NEON_RegRdShift) & ARMII::NEONRegMask);
-}
-
-// Extract/Decode NEON N/Vn:
-//
-// Note that for quadword, Qn = UInt(N:Vn<3:1>) = Inst{7:19-17}, whereas for
-// doubleword, Dn = UInt(N:Vn). We compensate for this difference by
-// handling it in the getRegisterEnum() utility function.
-// N = Inst{7}, Vn = Inst{19-16}
-static unsigned decodeNEONRn(uint32_t insn) {
- return ((insn >> ARMII::NEON_N_BitShift) & 1) << 4
- | ((insn >> ARMII::NEON_RegRnShift) & ARMII::NEONRegMask);
-}
-
-// Extract/Decode NEON M/Vm:
-//
-// Note that for quadword, Qm = UInt(M:Vm<3:1>) = Inst{5:3-1}, whereas for
-// doubleword, Dm = UInt(M:Vm). We compensate for this difference by
-// handling it in the getRegisterEnum() utility function.
-// M = Inst{5}, Vm = Inst{3-0}
-static unsigned decodeNEONRm(uint32_t insn) {
- return ((insn >> ARMII::NEON_M_BitShift) & 1) << 4
- | ((insn >> ARMII::NEON_RegRmShift) & ARMII::NEONRegMask);
-}
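-
-// Worked example (editorial, not in the original source): with D = 1 and
-// Vd = 0b0110, decodeNEONRd() returns (1 << 4) | 6 == 22, i.e. D22 for a
-// doubleword operand; for a quadword operand the same encoding denotes
-// Q11 (= 22/2), which getRegisterEnum() compensates for as noted above.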
-
-namespace {
-enum ElemSize {
- ESizeNA = 0,
- ESize8 = 8,
- ESize16 = 16,
- ESize32 = 32,
- ESize64 = 64
-};
-} // End of unnamed namespace
-
-// size field -> Inst{11-10}
-// index_align field -> Inst{7-4}
-//
-// The Lane Index interpretation depends on the Data Size:
-// 8 (encoded as size = 0b00) -> Index = index_align[3:1]
-// 16 (encoded as size = 0b01) -> Index = index_align[3:2]
-// 32 (encoded as size = 0b10) -> Index = index_align[3]
-//
-// Ref: A8.6.317 VLD4 (single 4-element structure to one lane).
-static unsigned decodeLaneIndex(uint32_t insn) {
- unsigned size = insn >> 10 & 3;
- assert((size == 0 || size == 1 || size == 2) &&
- "Encoding error: size should be either 0, 1, or 2");
-
- unsigned index_align = insn >> 4 & 0xF;
- return (index_align >> 1) >> size;
-}
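-
-// Worked example (editorial, not in the original source): for a 16-bit
-// element (size = 0b01) with index_align = 0b0100, the lane index is
-// (0b0100 >> 1) >> 1 == 1, matching Index = index_align[3:2] above.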
-
-// imm64 = AdvSIMDExpandImm(op, cmode, i:imm3:imm4)
-// op = Inst{5}, cmode = Inst{11-8}
-// i = Inst{24} (ARM architecture)
-// imm3 = Inst{18-16}, imm4 = Inst{3-0}
-// Ref: Table A7-15 Modified immediate values for Advanced SIMD instructions.
-static uint64_t decodeN1VImm(uint32_t insn, ElemSize esize) {
- unsigned char op = (insn >> 5) & 1;
- unsigned char cmode = (insn >> 8) & 0xF;
- unsigned char Imm8 = ((insn >> 24) & 1) << 7 |
- ((insn >> 16) & 7) << 4 |
- (insn & 0xF);
- return (op << 12) | (cmode << 8) | Imm8;
-}
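-
-// Editor's note (not in the original source): decodeN1VImm() does not expand
-// the immediate itself (the esize argument is unused); it packs op:cmode:imm8
-// into a 13-bit value and leaves the Table A7-15 expansion to the consumer.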
-
-// A8.6.339 VMUL, VMULL (by scalar)
-// ESize16 => m = Inst{2-0} (Vm<2:0>) D0-D7
-// ESize32 => m = Inst{3-0} (Vm<3:0>) D0-D15
-static unsigned decodeRestrictedDm(uint32_t insn, ElemSize esize) {
- switch (esize) {
- case ESize16:
- return insn & 7;
- case ESize32:
- return insn & 0xF;
- default:
- assert(0 && "Unreachable code!");
- return 0;
- }
-}
-
-// A8.6.339 VMUL, VMULL (by scalar)
-// ESize16 => index = Inst{5:3} (M:Vm<3>) D0-D7
-// ESize32 => index = Inst{5} (M) D0-D15
-static unsigned decodeRestrictedDmIndex(uint32_t insn, ElemSize esize) {
- switch (esize) {
- case ESize16:
- return (((insn >> 5) & 1) << 1) | ((insn >> 3) & 1);
- case ESize32:
- return (insn >> 5) & 1;
- default:
- assert(0 && "Unreachable code!");
- return 0;
- }
-}
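-
-// Worked example (editorial, not in the original source): for ESize16 with
-// M = 1 and Vm = 0b0101, decodeRestrictedDm() yields Dm = Vm<2:0> = 5 (D5)
-// and decodeRestrictedDmIndex() yields index = (1 << 1) | Vm<3> = 2.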
-
-// A8.6.296 VCVT (between floating-point and fixed-point, Advanced SIMD)
-// (64 - <fbits>) is encoded as imm6, i.e., Inst{21-16}.
-static unsigned decodeVCVTFractionBits(uint32_t insn) {
- return 64 - ((insn >> 16) & 0x3F);
-}
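-
-// E.g. (editorial, not in the original source): imm6 = Inst{21-16} = 0b111000
-// (56) encodes <fbits> = 64 - 56 = 8.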
-
-// A8.6.302 VDUP (scalar)
-// ESize8 => index = Inst{19-17}
-// ESize16 => index = Inst{19-18}
-// ESize32 => index = Inst{19}
-static unsigned decodeNVLaneDupIndex(uint32_t insn, ElemSize esize) {
- switch (esize) {
- case ESize8:
- return (insn >> 17) & 7;
- case ESize16:
- return (insn >> 18) & 3;
- case ESize32:
- return (insn >> 19) & 1;
- default:
- assert(0 && "Unspecified element size!");
- return 0;
- }
-}
-
-// A8.6.328 VMOV (ARM core register to scalar)
-// A8.6.329 VMOV (scalar to ARM core register)
-// ESize8 => index = Inst{21:6-5}
-// ESize16 => index = Inst{21:6}
-// ESize32 => index = Inst{21}
-static unsigned decodeNVLaneOpIndex(uint32_t insn, ElemSize esize) {
- switch (esize) {
- case ESize8:
- return ((insn >> 21) & 1) << 2 | ((insn >> 5) & 3);
- case ESize16:
- return ((insn >> 21) & 1) << 1 | ((insn >> 6) & 1);
- case ESize32:
- return ((insn >> 21) & 1);
- default:
- assert(0 && "Unspecified element size!");
- return 0;
- }
-}
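-
-// Worked example (editorial, not in the original source): for ESize8 with
-// Inst{21} = 1 and Inst{6-5} = 0b10, the index is (1 << 2) | 0b10 == 6.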
-
-// Imm6 = Inst{21-16}, L = Inst{7}
-//
-// LeftShift == true (A8.6.367 VQSHL, A8.6.387 VSLI):
-// case L:imm6 of
-// '0001xxx' => esize = 8; shift_amount = imm6 - 8
-// '001xxxx' => esize = 16; shift_amount = imm6 - 16
-// '01xxxxx' => esize = 32; shift_amount = imm6 - 32
-// '1xxxxxx' => esize = 64; shift_amount = imm6
-//
-// LeftShift == false (A8.6.376 VRSHR, A8.6.368 VQSHRN):
-// case L:imm6 of
-// '0001xxx' => esize = 8; shift_amount = 16 - imm6
-// '001xxxx' => esize = 16; shift_amount = 32 - imm6
-// '01xxxxx' => esize = 32; shift_amount = 64 - imm6
-// '1xxxxxx' => esize = 64; shift_amount = 64 - imm6
-//
-static unsigned decodeNVSAmt(uint32_t insn, bool LeftShift) {
- ElemSize esize = ESizeNA;
- unsigned L = (insn >> 7) & 1;
- unsigned imm6 = (insn >> 16) & 0x3F;
- if (L == 0) {
- if (imm6 >> 3 == 1)
- esize = ESize8;
- else if (imm6 >> 4 == 1)
- esize = ESize16;
- else if (imm6 >> 5 == 1)
- esize = ESize32;
- else
- assert(0 && "Wrong encoding of Inst{7:21-16}!");
- } else
- esize = ESize64;
-
- if (LeftShift)
- return esize == ESize64 ? imm6 : (imm6 - esize);
- else
- return esize == ESize64 ? (esize - imm6) : (2*esize - imm6);
-}
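-
-// Worked example (editorial, not in the original source): L = 0 with
-// imm6 = 0b001010 (10) selects esize = 8; a left shift returns 10 - 8 == 2
-// and a right shift returns 2*8 - 10 == 6, matching the '0001xxx' rows above.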
-
-// A8.6.305 VEXT
-// Imm4 = Inst{11-8}
-static unsigned decodeN3VImm(uint32_t insn) {
- return (insn >> 8) & 0xF;
-}
-
-// VLD*
-// D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm]
-// VLD*LN*
-// D[d] D[d2] ... Rn [TIED_TO Rn] align [Rm] TIED_TO ... imm(idx)
-// VST*
-// Rn [TIED_TO Rn] align [Rm] D[d] D[d2] ...
-// VST*LN*
-// Rn [TIED_TO Rn] align [Rm] D[d] D[d2] ... [imm(idx)]
-//
-// Correctly set VLD*/VST*'s TIED_TO GPR, as the asm printer needs it.
-static bool DisassembleNLdSt0(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, bool Store, bool DblSpaced,
- unsigned alignment, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
-
- // At least one DPR register plus addressing mode #6.
- assert(NumOps >= 3 && "Expect >= 3 operands");
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- // We have homogeneous NEON registers for Load/Store.
- unsigned RegClass = 0;
-
- // Double-spaced registers have increments of 2.
- unsigned Inc = DblSpaced ? 2 : 1;
-
- unsigned Rn = decodeRn(insn);
- unsigned Rm = decodeRm(insn);
- unsigned Rd = decodeNEONRd(insn);
-
- // A7.7.1 Advanced SIMD addressing mode.
- bool WB = Rm != 15;
-
- // LLVM Addressing Mode #6.
- unsigned RmEnum = 0;
- if (WB && Rm != 13)
- RmEnum = getRegisterEnum(B, ARM::GPRRegClassID, Rm);
-
- if (Store) {
- // Consume possible WB, AddrMode6, possible increment reg, the DPR/QPR's,
- // then possible lane index.
- assert(OpIdx < NumOps && OpInfo[0].RegClass == ARM::GPRRegClassID &&
- "Reg operand expected");
-
- if (WB) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- Rn)));
- ++OpIdx;
- }
-
- assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected");
- // addrmode6 := (ops GPR:$addr, i32imm)
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- Rn)));
- MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment
- OpIdx += 2;
-
- if (WB) {
- MI.addOperand(MCOperand::CreateReg(RmEnum));
- ++OpIdx;
- }
-
- assert(OpIdx < NumOps &&
- (OpInfo[OpIdx].RegClass == ARM::DPRRegClassID ||
- OpInfo[OpIdx].RegClass == ARM::QPRRegClassID) &&
- "Reg operand expected");
-
- RegClass = OpInfo[OpIdx].RegClass;
- while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass, Rd)));
- Rd += Inc;
- ++OpIdx;
- }
-
- // Handle possible lane index.
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn)));
- ++OpIdx;
- }
-
- } else {
-    // Consume the DPR/QPR's, possible WB, AddrMode6, possible increment reg,
- // possible TIED_TO DPR/QPR's (ignored), then possible lane index.
- RegClass = OpInfo[0].RegClass;
-
- while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass, Rd)));
- Rd += Inc;
- ++OpIdx;
- }
-
- if (WB) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- Rn)));
- ++OpIdx;
- }
-
- assert((OpIdx+1) < NumOps && OpInfo[OpIdx].RegClass == ARM::GPRRegClassID &&
- OpInfo[OpIdx + 1].RegClass < 0 && "Addrmode #6 Operands expected");
- // addrmode6 := (ops GPR:$addr, i32imm)
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- Rn)));
- MI.addOperand(MCOperand::CreateImm(alignment)); // Alignment
- OpIdx += 2;
-
- if (WB) {
- MI.addOperand(MCOperand::CreateReg(RmEnum));
- ++OpIdx;
- }
-
- while (OpIdx < NumOps && (unsigned)OpInfo[OpIdx].RegClass == RegClass) {
- assert(MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1 &&
- "Tied to operand expected");
- MI.addOperand(MCOperand::CreateReg(0));
- ++OpIdx;
- }
-
- // Handle possible lane index.
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- MI.addOperand(MCOperand::CreateImm(decodeLaneIndex(insn)));
- ++OpIdx;
- }
- }
-
- // Accessing registers past the end of the NEON register file is not
- // defined.
- if (Rd > 32)
- return false;
-
- return true;
-}
-
-// A8.6.308, A8.6.311, A8.6.314, A8.6.317.
-static bool Align4OneLaneInst(unsigned elem, unsigned size,
-                              unsigned index_align, unsigned &alignment) {
- unsigned bits = 0;
- switch (elem) {
- default:
- return false;
- case 1:
- // A8.6.308
- if (size == 0)
- return slice(index_align, 0, 0) == 0;
- else if (size == 1) {
- bits = slice(index_align, 1, 0);
- if (bits != 0 && bits != 1)
- return false;
- if (bits == 1)
- alignment = 16;
- return true;
- } else if (size == 2) {
- bits = slice(index_align, 2, 0);
- if (bits != 0 && bits != 3)
- return false;
- if (bits == 3)
- alignment = 32;
-      return true;
- }
- return true;
- case 2:
- // A8.6.311
- if (size == 0) {
- if (slice(index_align, 0, 0) == 1)
- alignment = 16;
- return true;
-    } else if (size == 1) {
- if (slice(index_align, 0, 0) == 1)
- alignment = 32;
- return true;
- } else if (size == 2) {
- if (slice(index_align, 1, 1) != 0)
- return false;
- if (slice(index_align, 0, 0) == 1)
- alignment = 64;
-      return true;
- }
- return true;
- case 3:
- // A8.6.314
- if (size == 0) {
- if (slice(index_align, 0, 0) != 0)
- return false;
- return true;
-    } else if (size == 1) {
-      if (slice(index_align, 0, 0) != 0)
-        return false;
-      return true;
- } else if (size == 2) {
- if (slice(index_align, 1, 0) != 0)
- return false;
-      return true;
- }
- return true;
- case 4:
- // A8.6.317
- if (size == 0) {
- if (slice(index_align, 0, 0) == 1)
- alignment = 32;
- return true;
-    } else if (size == 1) {
- if (slice(index_align, 0, 0) == 1)
- alignment = 64;
- return true;
- } else if (size == 2) {
- bits = slice(index_align, 1, 0);
- if (bits == 3)
- return false;
- if (bits == 1)
- alignment = 64;
- else if (bits == 2)
- alignment = 128;
-      return true;
- }
- return true;
- }
-}
-
-// A7.7
-// If L (Inst{21}) == 0, store instructions.
-// Find out about double-spaced-ness of the Opcode and pass it on to
-// DisassembleNLdSt0().
-static bool DisassembleNLdSt(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const StringRef Name = ARMInsts[Opcode].Name;
- bool DblSpaced = false;
- // 0 represents standard alignment, i.e., unaligned data access.
- unsigned alignment = 0;
-
- unsigned elem = 0; // legal values: {1, 2, 3, 4}
- if (Name.startswith("VST1") || Name.startswith("VLD1"))
- elem = 1;
-
- if (Name.startswith("VST2") || Name.startswith("VLD2"))
- elem = 2;
-
- if (Name.startswith("VST3") || Name.startswith("VLD3"))
- elem = 3;
-
- if (Name.startswith("VST4") || Name.startswith("VLD4"))
- elem = 4;
-
- if (Name.find("LN") != std::string::npos) {
- // To one lane instructions.
- // See, for example, 8.6.317 VLD4 (single 4-element structure to one lane).
-
- // Utility function takes number of elements, size, and index_align.
- if (!Align4OneLaneInst(elem,
- slice(insn, 11, 10),
- slice(insn, 7, 4),
- alignment))
- return false;
-
- // <size> == 16 && Inst{5} == 1 --> DblSpaced = true
- if (Name.endswith("16") || Name.endswith("16_UPD"))
- DblSpaced = slice(insn, 5, 5) == 1;
-
- // <size> == 32 && Inst{6} == 1 --> DblSpaced = true
- if (Name.endswith("32") || Name.endswith("32_UPD"))
- DblSpaced = slice(insn, 6, 6) == 1;
- } else if (Name.find("DUP") != std::string::npos) {
- // Single element (or structure) to all lanes.
- // Inst{9-8} encodes the number of element(s) in the structure, with:
-    // 0b00 (VLD1DUP) (for this, the a bit makes sense only for data sizes 16
-    //                 and 32)
-    // 0b01 (VLD2DUP)
-    // 0b10 (VLD3DUP) (for this, the a bit must be encoded as 0)
- // 0b11 (VLD4DUP)
- //
- // Inst{7-6} encodes the data size, with:
- // 0b00 => 8, 0b01 => 16, 0b10 => 32
- //
- // Inst{4} (the a bit) encodes the align action (0: standard alignment)
- unsigned elem = slice(insn, 9, 8) + 1;
- unsigned a = slice(insn, 4, 4);
- if (elem != 3) {
- // 0b11 is not a valid encoding for Inst{7-6}.
- if (slice(insn, 7, 6) == 3)
- return false;
- unsigned data_size = 8 << slice(insn, 7, 6);
-      // For VLD1DUP, the a bit makes sense only for data sizes of 16 and 32.
- if (a && data_size == 8)
- return false;
-
- // Now we can calculate the alignment!
- if (a)
- alignment = elem * data_size;
- } else {
- if (a) {
- // A8.6.315 VLD3 (single 3-element structure to all lanes)
- // The a bit must be encoded as 0.
- return false;
- }
- }
- } else {
- // Multiple n-element structures with type encoded as Inst{11-8}.
- // See, for example, A8.6.316 VLD4 (multiple 4-element structures).
-
- // Inst{5-4} encodes alignment.
- unsigned align = slice(insn, 5, 4);
- switch (align) {
- default:
- break;
- case 1:
- alignment = 64; break;
- case 2:
- alignment = 128; break;
- case 3:
- alignment = 256; break;
- }
-
- unsigned type = slice(insn, 11, 8);
- // Reject UNDEFINED instructions based on type and align.
- // Plus set DblSpaced flag where appropriate.
- switch (elem) {
- default:
- break;
- case 1:
- // n == 1
- // A8.6.307 & A8.6.391
- if ((type == 7 && slice(align, 1, 1) == 1) ||
- (type == 10 && align == 3) ||
- (type == 6 && slice(align, 1, 1) == 1))
- return false;
- break;
- case 2:
- // n == 2 && type == 0b1001 -> DblSpaced = true
- // A8.6.310 & A8.6.393
- if ((type == 8 || type == 9) && align == 3)
- return false;
- DblSpaced = (type == 9);
- break;
- case 3:
- // n == 3 && type == 0b0101 -> DblSpaced = true
- // A8.6.313 & A8.6.395
- if (slice(insn, 7, 6) == 3 || slice(align, 1, 1) == 1)
- return false;
- DblSpaced = (type == 5);
- break;
- case 4:
- // n == 4 && type == 0b0001 -> DblSpaced = true
- // A8.6.316 & A8.6.397
- if (slice(insn, 7, 6) == 3)
- return false;
- DblSpaced = (type == 1);
- break;
- }
- }
- return DisassembleNLdSt0(MI, Opcode, insn, NumOps, NumOpsAdded,
- slice(insn, 21, 21) == 0, DblSpaced, alignment/8, B);
-}
-
-// VMOV (immediate)
-// Qd/Dd imm
-// VBIC (immediate)
-// VORR (immediate)
-// Qd/Dd imm src(=Qd/Dd)
-static bool DisassembleN1RegModImmFrm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
-
- assert(NumOps >= 2 &&
- (OpInfo[0].RegClass == ARM::DPRRegClassID ||
- OpInfo[0].RegClass == ARM::QPRRegClassID) &&
- (OpInfo[1].RegClass < 0) &&
- "Expect 1 reg operand followed by 1 imm operand");
-
- // Qd/Dd = Inst{22:15-12} => NEON Rd
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[0].RegClass,
- decodeNEONRd(insn))));
-
- ElemSize esize = ESizeNA;
- switch (Opcode) {
- case ARM::VMOVv8i8:
- case ARM::VMOVv16i8:
- esize = ESize8;
- break;
- case ARM::VMOVv4i16:
- case ARM::VMOVv8i16:
- case ARM::VMVNv4i16:
- case ARM::VMVNv8i16:
- case ARM::VBICiv4i16:
- case ARM::VBICiv8i16:
- case ARM::VORRiv4i16:
- case ARM::VORRiv8i16:
- esize = ESize16;
- break;
- case ARM::VMOVv2i32:
- case ARM::VMOVv4i32:
- case ARM::VMVNv2i32:
- case ARM::VMVNv4i32:
- case ARM::VBICiv2i32:
- case ARM::VBICiv4i32:
- case ARM::VORRiv2i32:
- case ARM::VORRiv4i32:
- esize = ESize32;
- break;
- case ARM::VMOVv1i64:
- case ARM::VMOVv2i64:
- esize = ESize64;
- break;
- default:
- assert(0 && "Unexpected opcode!");
- return false;
- }
-
- // One register and a modified immediate value.
- // Add the imm operand.
- MI.addOperand(MCOperand::CreateImm(decodeN1VImm(insn, esize)));
-
- NumOpsAdded = 2;
-
- // VBIC/VORRiv*i* variants have an extra $src = $Vd to be filled in.
- if (NumOps >= 3 &&
- (OpInfo[2].RegClass == ARM::DPRRegClassID ||
- OpInfo[2].RegClass == ARM::QPRRegClassID)) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[0].RegClass,
- decodeNEONRd(insn))));
- NumOpsAdded += 1;
- }
-
- return true;
-}
-
-namespace {
-enum N2VFlag {
- N2V_None,
- N2V_VectorDupLane,
- N2V_VectorConvert_Between_Float_Fixed
-};
-} // End of unnamed namespace
-
-// Vector Convert [between floating-point and fixed-point]
-// Qd/Dd Qm/Dm [fbits]
-//
-// Vector Duplicate Lane (from scalar to all elements) Instructions.
-// VDUPLN16d, VDUPLN16q, VDUPLN32d, VDUPLN32q, VDUPLN8d, VDUPLN8q:
-// Qd/Dd Dm index
-//
-// Vector Move Long:
-// Qd Dm
-//
-// Vector Move Narrow:
-// Dd Qm
-//
-// Others
-static bool DisassembleNVdVmOptImm(MCInst &MI, unsigned Opc, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, N2VFlag Flag, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opc];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
-
- assert(NumOps >= 2 &&
- (OpInfo[0].RegClass == ARM::DPRRegClassID ||
- OpInfo[0].RegClass == ARM::QPRRegClassID) &&
- (OpInfo[1].RegClass == ARM::DPRRegClassID ||
- OpInfo[1].RegClass == ARM::QPRRegClassID) &&
- "Expect >= 2 operands and first 2 as reg operands");
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- ElemSize esize = ESizeNA;
- if (Flag == N2V_VectorDupLane) {
- // VDUPLN has its index embedded. Its size can be inferred from the Opcode.
- assert(Opc >= ARM::VDUPLN16d && Opc <= ARM::VDUPLN8q &&
- "Unexpected Opcode");
- esize = (Opc == ARM::VDUPLN8d || Opc == ARM::VDUPLN8q) ? ESize8
- : ((Opc == ARM::VDUPLN16d || Opc == ARM::VDUPLN16q) ? ESize16
- : ESize32);
- }
-
- // Qd/Dd = Inst{22:15-12} => NEON Rd
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeNEONRd(insn))));
- ++OpIdx;
-
- // VPADAL...
- if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
- // TIED_TO operand.
- MI.addOperand(MCOperand::CreateReg(0));
- ++OpIdx;
- }
-
- // Dm = Inst{5:3-0} => NEON Rm
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeNEONRm(insn))));
- ++OpIdx;
-
- // VZIP and others have two TIED_TO reg operands.
- int Idx;
- while (OpIdx < NumOps &&
- (Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
- // Add TIED_TO operand.
- MI.addOperand(MI.getOperand(Idx));
- ++OpIdx;
- }
-
- // Add the imm operand, if required.
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
-
- unsigned imm = 0xFFFFFFFF;
-
- if (Flag == N2V_VectorDupLane)
- imm = decodeNVLaneDupIndex(insn, esize);
- if (Flag == N2V_VectorConvert_Between_Float_Fixed)
- imm = decodeVCVTFractionBits(insn);
-
- assert(imm != 0xFFFFFFFF && "Internal error");
- MI.addOperand(MCOperand::CreateImm(imm));
- ++OpIdx;
- }
-
- return true;
-}
-
-static bool DisassembleN2RegFrm(MCInst &MI, unsigned Opc, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded,
- N2V_None, B);
-}
-static bool DisassembleNVCVTFrm(MCInst &MI, unsigned Opc, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded,
- N2V_VectorConvert_Between_Float_Fixed, B);
-}
-static bool DisassembleNVecDupLnFrm(MCInst &MI, unsigned Opc, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVdVmOptImm(MI, Opc, insn, NumOps, NumOpsAdded,
- N2V_VectorDupLane, B);
-}
-
-// Vector Shift [Accumulate] Instructions.
-// Qd/Dd [Qd/Dd (TIED_TO)] Qm/Dm ShiftAmt
-//
-// Vector Shift Left Long (with maximum shift count) Instructions.
-// VSHLLi16, VSHLLi32, VSHLLi8: Qd Dm imm (== size)
-//
-static bool DisassembleNVectorShift(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, bool LeftShift, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
-
- assert(NumOps >= 3 &&
- (OpInfo[0].RegClass == ARM::DPRRegClassID ||
- OpInfo[0].RegClass == ARM::QPRRegClassID) &&
- (OpInfo[1].RegClass == ARM::DPRRegClassID ||
- OpInfo[1].RegClass == ARM::QPRRegClassID) &&
- "Expect >= 3 operands and first 2 as reg operands");
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- // Qd/Dd = Inst{22:15-12} => NEON Rd
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeNEONRd(insn))));
- ++OpIdx;
-
- if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
- // TIED_TO operand.
- MI.addOperand(MCOperand::CreateReg(0));
- ++OpIdx;
- }
-
- assert((OpInfo[OpIdx].RegClass == ARM::DPRRegClassID ||
- OpInfo[OpIdx].RegClass == ARM::QPRRegClassID) &&
- "Reg operand expected");
-
- // Qm/Dm = Inst{5:3-0} => NEON Rm
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeNEONRm(insn))));
- ++OpIdx;
-
- assert(OpInfo[OpIdx].RegClass < 0 && "Imm operand expected");
-
- // Add the imm operand.
-
- // VSHLL has maximum shift count as the imm, inferred from its size.
- unsigned Imm;
- switch (Opcode) {
- default:
- Imm = decodeNVSAmt(insn, LeftShift);
- break;
- case ARM::VSHLLi8:
- Imm = 8;
- break;
- case ARM::VSHLLi16:
- Imm = 16;
- break;
- case ARM::VSHLLi32:
- Imm = 32;
- break;
- }
- MI.addOperand(MCOperand::CreateImm(Imm));
- ++OpIdx;
-
- return true;
-}
-
-// Left shift instructions.
-static bool DisassembleN2RegVecShLFrm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVectorShift(MI, Opcode, insn, NumOps, NumOpsAdded, true,
- B);
-}
-// Right shift instructions have different shift amount interpretation.
-static bool DisassembleN2RegVecShRFrm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVectorShift(MI, Opcode, insn, NumOps, NumOpsAdded, false,
- B);
-}
-
-namespace {
-enum N3VFlag {
- N3V_None,
- N3V_VectorExtract,
- N3V_VectorShift,
- N3V_Multiply_By_Scalar
-};
-} // End of unnamed namespace
-
-// NEON Three Register Instructions with Optional Immediate Operand
-//
-// Vector Extract Instructions.
-// Qd/Dd Qn/Dn Qm/Dm imm4
-//
-// Vector Shift (Register) Instructions.
-// Qd/Dd Qm/Dm Qn/Dn (notice the order of m, n)
-//
-// Vector Multiply [Accumulate/Subtract] [Long] By Scalar Instructions.
-// Qd/Dd Qn/Dn RestrictedDm index
-//
-// Others
-static bool DisassembleNVdVnVmOptImm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, N3VFlag Flag, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
-
- // No checking for OpInfo[2] because of MOVDneon/MOVQ with only two regs.
- assert(NumOps >= 3 &&
- (OpInfo[0].RegClass == ARM::DPRRegClassID ||
- OpInfo[0].RegClass == ARM::QPRRegClassID) &&
- (OpInfo[1].RegClass == ARM::DPRRegClassID ||
- OpInfo[1].RegClass == ARM::QPRRegClassID) &&
- "Expect >= 3 operands and first 2 as reg operands");
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- bool VdVnVm = Flag != N3V_VectorShift;
- bool IsImm4 = Flag == N3V_VectorExtract;
- bool IsDmRestricted = Flag == N3V_Multiply_By_Scalar;
- ElemSize esize = ESizeNA;
- if (Flag == N3V_Multiply_By_Scalar) {
- unsigned size = (insn >> 20) & 3;
- if (size == 1) esize = ESize16;
- if (size == 2) esize = ESize32;
- assert (esize == ESize16 || esize == ESize32);
- }
-
- // Qd/Dd = Inst{22:15-12} => NEON Rd
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeNEONRd(insn))));
- ++OpIdx;
-
- // VABA, VABAL, VBSLd, VBSLq, ...
- if (MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1) {
- // TIED_TO operand.
- MI.addOperand(MCOperand::CreateReg(0));
- ++OpIdx;
- }
-
- // Dn = Inst{7:19-16} => NEON Rn
- // or
- // Dm = Inst{5:3-0} => NEON Rm
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- VdVnVm ? decodeNEONRn(insn)
- : decodeNEONRm(insn))));
- ++OpIdx;
-
- // Dm = Inst{5:3-0} => NEON Rm
- // or
- // Dm is restricted to D0-D7 if size is 16, D0-D15 otherwise
- // or
- // Dn = Inst{7:19-16} => NEON Rn
- unsigned m = VdVnVm ? (IsDmRestricted ? decodeRestrictedDm(insn, esize)
- : decodeNEONRm(insn))
- : decodeNEONRn(insn);
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, OpInfo[OpIdx].RegClass, m)));
- ++OpIdx;
-
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- // Add the imm operand.
- unsigned Imm = 0;
- if (IsImm4)
- Imm = decodeN3VImm(insn);
- else if (IsDmRestricted)
- Imm = decodeRestrictedDmIndex(insn, esize);
- else {
- assert(0 && "Internal error: unreachable code!");
- return false;
- }
-
- MI.addOperand(MCOperand::CreateImm(Imm));
- ++OpIdx;
- }
-
- return true;
-}
-
-static bool DisassembleN3RegFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
- N3V_None, B);
-}
-static bool DisassembleN3RegVecShFrm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
- N3V_VectorShift, B);
-}
-static bool DisassembleNVecExtractFrm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
- N3V_VectorExtract, B);
-}
-static bool DisassembleNVecMulScalarFrm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- return DisassembleNVdVnVmOptImm(MI, Opcode, insn, NumOps, NumOpsAdded,
- N3V_Multiply_By_Scalar, B);
-}
-
-// Vector Table Lookup
-//
-// VTBL1, VTBX1: Dd [Dd(TIED_TO)] Dn Dm
-// VTBL2, VTBX2: Dd [Dd(TIED_TO)] Dn Dn+1 Dm
-// VTBL3, VTBX3: Dd [Dd(TIED_TO)] Dn Dn+1 Dn+2 Dm
-// VTBL4, VTBX4: Dd [Dd(TIED_TO)] Dn Dn+1 Dn+2 Dn+3 Dm
-static bool DisassembleNVTBLFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps >= 3 &&
- OpInfo[0].RegClass == ARM::DPRRegClassID &&
- OpInfo[1].RegClass == ARM::DPRRegClassID &&
- OpInfo[2].RegClass == ARM::DPRRegClassID &&
- "Expect >= 3 operands and first 3 as reg operands");
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- unsigned Rn = decodeNEONRn(insn);
-
- // {Dn} encoded as len = 0b00
- // {Dn Dn+1} encoded as len = 0b01
- // {Dn Dn+1 Dn+2 } encoded as len = 0b10
- // {Dn Dn+1 Dn+2 Dn+3} encoded as len = 0b11
- unsigned Len = slice(insn, 9, 8) + 1;
-
- // Dd (the destination vector)
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
- decodeNEONRd(insn))));
- ++OpIdx;
-
- // Process tied_to operand constraint.
- int Idx;
- if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
- MI.addOperand(MI.getOperand(Idx));
- ++OpIdx;
- }
-
- // Do the <list> now.
- for (unsigned i = 0; i < Len; ++i) {
- assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::DPRRegClassID &&
- "Reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
- Rn + i)));
- ++OpIdx;
- }
-
- // Dm (the index vector)
- assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::DPRRegClassID &&
- "Reg operand (index vector) expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
- decodeNEONRm(insn))));
- ++OpIdx;
-
- return true;
-}
-
-// Vector Get Lane (move scalar to ARM core register) Instructions.
-// VGETLNi32, VGETLNs16, VGETLNs8, VGETLNu16, VGETLNu8: Rt Dn index
-static bool DisassembleNGetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- if (!OpInfo) return false;
-
- assert(MCID.getNumDefs() == 1 && NumOps >= 3 &&
- OpInfo[0].RegClass == ARM::GPRRegClassID &&
- OpInfo[1].RegClass == ARM::DPRRegClassID &&
- OpInfo[2].RegClass < 0 &&
- "Expect >= 3 operands with one dst operand");
-
- ElemSize esize =
- Opcode == ARM::VGETLNi32 ? ESize32
- : ((Opcode == ARM::VGETLNs16 || Opcode == ARM::VGETLNu16) ? ESize16
- : ESize8);
-
- // Rt = Inst{15-12} => ARM Rd
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
-
- // Dn = Inst{7:19-16} => NEON Rn
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
- decodeNEONRn(insn))));
-
- MI.addOperand(MCOperand::CreateImm(decodeNVLaneOpIndex(insn, esize)));
-
- NumOpsAdded = 3;
- return true;
-}
-
-// Vector Set Lane (move ARM core register to scalar) Instructions.
-// VSETLNi16, VSETLNi32, VSETLNi8: Dd Dd (TIED_TO) Rt index
-static bool DisassembleNSetLnFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- if (!OpInfo) return false;
-
- assert(MCID.getNumDefs() == 1 && NumOps >= 4 &&
- OpInfo[0].RegClass == ARM::DPRRegClassID &&
- OpInfo[1].RegClass == ARM::DPRRegClassID &&
- MCID.getOperandConstraint(1, MCOI::TIED_TO) != -1 &&
- OpInfo[2].RegClass == ARM::GPRRegClassID &&
- OpInfo[3].RegClass < 0 &&
- "Expect >= 4 operands with one dst operand");
-
- ElemSize esize =
- Opcode == ARM::VSETLNi8 ? ESize8
- : (Opcode == ARM::VSETLNi16 ? ESize16
- : ESize32);
-
- // Dd = Inst{7:19-16} => NEON Rn
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::DPRRegClassID,
- decodeNEONRn(insn))));
-
- // TIED_TO operand.
- MI.addOperand(MCOperand::CreateReg(0));
-
- // Rt = Inst{15-12} => ARM Rd
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
-
- MI.addOperand(MCOperand::CreateImm(decodeNVLaneOpIndex(insn, esize)));
-
- NumOpsAdded = 4;
- return true;
-}
-
-// Vector Duplicate Instructions (from ARM core register to all elements).
-// VDUP8d, VDUP16d, VDUP32d, VDUP8q, VDUP16q, VDUP32q: Qd/Dd Rt
-static bool DisassembleNDupFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
- assert(NumOps >= 2 &&
- (OpInfo[0].RegClass == ARM::DPRRegClassID ||
- OpInfo[0].RegClass == ARM::QPRRegClassID) &&
- OpInfo[1].RegClass == ARM::GPRRegClassID &&
- "Expect >= 2 operands and first 2 as reg operands");
-
- unsigned RegClass = OpInfo[0].RegClass;
-
- // Qd/Dd = Inst{7:19-16} => NEON Rn
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClass,
- decodeNEONRn(insn))));
-
- // Rt = Inst{15-12} => ARM Rd
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
-
- NumOpsAdded = 2;
- return true;
-}
-
-static inline bool PreLoadOpcode(unsigned Opcode) {
- switch(Opcode) {
- case ARM::PLDi12: case ARM::PLDrs:
- case ARM::PLDWi12: case ARM::PLDWrs:
- case ARM::PLIi12: case ARM::PLIrs:
- return true;
- default:
- return false;
- }
-}
-
-static bool DisassemblePreLoadFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- // Preload Data/Instruction requires either 2 or 3 operands.
- // PLDi12, PLDWi12, PLIi12: addrmode_imm12
- // PLDrs, PLDWrs, PLIrs: ldst_so_reg
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
-
- if (Opcode == ARM::PLDi12 || Opcode == ARM::PLDWi12
- || Opcode == ARM::PLIi12) {
- unsigned Imm12 = slice(insn, 11, 0);
- bool Negative = getUBit(insn) == 0;
-
- // A8.6.118 PLD (literal): PLDWi12 with Rn=PC is transformed to PLDi12.
- if (Opcode == ARM::PLDWi12 && slice(insn, 19, 16) == 0xF) {
- DEBUG(errs() << "Rn == '1111': PLDWi12 morphed to PLDi12\n");
- MI.setOpcode(ARM::PLDi12);
- }
-
- // -0 is represented specially. All other values are as normal.
- int Offset = Negative ? -1 * Imm12 : Imm12;
- if (Imm12 == 0 && Negative)
- Offset = INT32_MIN;
-
- MI.addOperand(MCOperand::CreateImm(Offset));
- NumOpsAdded = 2;
- } else {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
-
- ARM_AM::AddrOpc AddrOpcode = getUBit(insn) ? ARM_AM::add : ARM_AM::sub;
-
- // Inst{6-5} encodes the shift opcode.
- ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 6, 5));
- // Inst{11-7} encodes the imm5 shift amount.
- unsigned ShImm = slice(insn, 11, 7);
-
- // A8.4.1. Possible rrx or shift amount of 32...
- getImmShiftSE(ShOp, ShImm);
- MI.addOperand(MCOperand::CreateImm(
- ARM_AM::getAM2Opc(AddrOpcode, ShImm, ShOp)));
- NumOpsAdded = 3;
- }
-
- return true;
-}
-
-static bool DisassembleMiscFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- if (Opcode == ARM::DMB || Opcode == ARM::DSB || Opcode == ARM::ISB) {
- // Inst{3-0} encodes the memory barrier option for the variants.
- unsigned opt = slice(insn, 3, 0);
- switch (opt) {
- case ARM_MB::SY: case ARM_MB::ST:
- case ARM_MB::ISH: case ARM_MB::ISHST:
- case ARM_MB::NSH: case ARM_MB::NSHST:
- case ARM_MB::OSH: case ARM_MB::OSHST:
- MI.addOperand(MCOperand::CreateImm(opt));
- NumOpsAdded = 1;
- return true;
- default:
- return false;
- }
- }
-
- switch (Opcode) {
- case ARM::CLREX:
- case ARM::NOP:
- case ARM::TRAP:
- case ARM::YIELD:
- case ARM::WFE:
- case ARM::WFI:
- case ARM::SEV:
- return true;
- case ARM::SWP:
- case ARM::SWPB:
- // SWP, SWPB: Rd Rm Rn
- // Delegate to DisassembleLdStExFrm().
- return DisassembleLdStExFrm(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- default:
- break;
- }
-
- if (Opcode == ARM::SETEND) {
- NumOpsAdded = 1;
- MI.addOperand(MCOperand::CreateImm(slice(insn, 9, 9)));
- return true;
- }
-
- // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different
- // opcodes which match the same real instruction. This is needed since there's
- // no current handling of optional arguments. Fix here when a better handling
- // of optional arguments is implemented.
- if (Opcode == ARM::CPS3p) { // M = 1
- // Reject the impossible imod values by returning false:
- // 1. (imod=0b01)
- // 2. (imod=0b00): AsmPrinter cannot handle it, and (imod=0b00,M=1,iflags!=0)
- //    is an invalid combination anyway, so we reject imod=0b00 here as well.
- if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
- return false;
- MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
- MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags
- MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
- NumOpsAdded = 3;
- return true;
- }
- if (Opcode == ARM::CPS2p) { // mode = 0, M = 0
- // Let's reject these impossible imod values by returning false:
- // 1. (imod=0b00,M=0)
- // 2. (imod=0b01)
- if (slice(insn, 19, 18) == 0 || slice(insn, 19, 18) == 1)
- return false;
- MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 18))); // imod
- MI.addOperand(MCOperand::CreateImm(slice(insn, 8, 6))); // iflags
- NumOpsAdded = 2;
- return true;
- }
- if (Opcode == ARM::CPS1p) { // imod = 0, iflags = 0, M = 1
- MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
- NumOpsAdded = 1;
- return true;
- }
-
- // DBG has its option specified in Inst{3-0}.
- if (Opcode == ARM::DBG) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0)));
- NumOpsAdded = 1;
- return true;
- }
-
- // BKPT takes an imm32 val equal to ZeroExtend(Inst{19-8:3-0}).
- if (Opcode == ARM::BKPT) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 8) << 4 |
- slice(insn, 3, 0)));
- NumOpsAdded = 1;
- return true;
- }
-
- if (PreLoadOpcode(Opcode))
- return DisassemblePreLoadFrm(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
- assert(0 && "Unexpected misc instruction!");
- return false;
-}
-
-/// FuncPtrs - FuncPtrs maps ARMFormat to its corresponding DisassembleFP.
-/// We divide the disassembly task into different categories, with each one
-/// corresponding to a specific instruction encoding format. There could be
-/// exceptions when handling a specific format, and that is why the Opcode is
-/// also present in the function prototype.
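-/// Note that the order of the entries below must match the numbering of the
-/// ARMFormat enum (ARM_FORMATS in ARMDisassemblerCore.h), since the builder
-/// constructor indexes into this array with the format value.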
-static const DisassembleFP FuncPtrs[] = {
- &DisassemblePseudo,
- &DisassembleMulFrm,
- &DisassembleBrFrm,
- &DisassembleBrMiscFrm,
- &DisassembleDPFrm,
- &DisassembleDPSoRegFrm,
- &DisassembleLdFrm,
- &DisassembleStFrm,
- &DisassembleLdMiscFrm,
- &DisassembleStMiscFrm,
- &DisassembleLdStMulFrm,
- &DisassembleLdStExFrm,
- &DisassembleArithMiscFrm,
- &DisassembleSatFrm,
- &DisassembleExtFrm,
- &DisassembleVFPUnaryFrm,
- &DisassembleVFPBinaryFrm,
- &DisassembleVFPConv1Frm,
- &DisassembleVFPConv2Frm,
- &DisassembleVFPConv3Frm,
- &DisassembleVFPConv4Frm,
- &DisassembleVFPConv5Frm,
- &DisassembleVFPLdStFrm,
- &DisassembleVFPLdStMulFrm,
- &DisassembleVFPMiscFrm,
- &DisassembleThumbFrm,
- &DisassembleMiscFrm,
- &DisassembleNGetLnFrm,
- &DisassembleNSetLnFrm,
- &DisassembleNDupFrm,
-
- // VLD and VST (including one lane) Instructions.
- &DisassembleNLdSt,
-
- // A7.4.6 One register and a modified immediate value
- // 1-Register Instructions with imm.
- // LLVM only defines VMOVv instructions.
- &DisassembleN1RegModImmFrm,
-
- // 2-Register Instructions with no imm.
- &DisassembleN2RegFrm,
-
- // 2-Register Instructions with imm (vector convert float/fixed point).
- &DisassembleNVCVTFrm,
-
- // 2-Register Instructions with imm (vector dup lane).
- &DisassembleNVecDupLnFrm,
-
- // Vector Shift Left Instructions.
- &DisassembleN2RegVecShLFrm,
-
- // Vector Shift Right Instructions, which have a different interpretation of
- // the shift amount from the imm6 field.
- &DisassembleN2RegVecShRFrm,
-
- // 3-Register Data-Processing Instructions.
- &DisassembleN3RegFrm,
-
- // Vector Shift (Register) Instructions.
- // D:Vd M:Vm N:Vn (notice that M:Vm is the first src operand)
- &DisassembleN3RegVecShFrm,
-
- // Vector Extract Instructions.
- &DisassembleNVecExtractFrm,
-
- // Vector [Saturating Rounding Doubling] Multiply [Accumulate/Subtract] [Long]
- // By Scalar Instructions.
- &DisassembleNVecMulScalarFrm,
-
- // Vector Table Lookup uses byte indexes in a control vector to look up byte
- // values in a table and generate a new vector.
- &DisassembleNVTBLFrm,
-
- NULL
-};
-
-/// BuildIt - BuildIt performs the build step for this ARM Basic MC Builder.
-/// The general idea is to set the Opcode for the MCInst, followed by adding
-/// the appropriate MCOperands to the MCInst. ARM Basic MC Builder delegates
-/// to the Format-specific disassemble function for disassembly, followed by
-/// TryPredicateAndSBitModifier() to process the PredicateOperand and
-/// OptionalDefOperand which follow the Dst/Src Operands.
-bool ARMBasicMCBuilder::BuildIt(MCInst &MI, uint32_t insn) {
- // Stage 1 sets the Opcode.
- MI.setOpcode(Opcode);
- // If the number of operands is zero, we're done!
- if (NumOps == 0)
- return true;
-
- // Stage 2 calls the format-specific disassemble function to build the operand
- // list.
- if (Disasm == NULL)
- return false;
- unsigned NumOpsAdded = 0;
- bool OK = (*Disasm)(MI, Opcode, insn, NumOps, NumOpsAdded, this);
-
- if (!OK || this->Err != 0) return false;
- if (NumOpsAdded >= NumOps)
- return true;
-
- // Stage 3 deals with operands unaccounted for after stage 2 is finished.
- // FIXME: Should this be done selectively?
- return TryPredicateAndSBitModifier(MI, Opcode, insn, NumOps - NumOpsAdded);
-}
-
-// A8.3 Conditional execution
-// A8.3.1 Pseudocode details of conditional execution
-// Condition bits '111x' indicate the instruction is always executed.
-static uint32_t CondCode(uint32_t CondField) {
- if (CondField == 0xF)
- return ARMCC::AL;
- return CondField;
-}
-
-/// DoPredicateOperands - DoPredicateOperands processes the predicate operands
-/// of some Thumb instructions which come before the reglist operands. It
-/// returns true if the two predicate operands have been processed.
-bool ARMBasicMCBuilder::DoPredicateOperands(MCInst& MI, unsigned Opcode,
- uint32_t /* insn */, unsigned short NumOpsRemaining) {
-
- assert(NumOpsRemaining > 0 && "Invalid argument");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned Idx = MI.getNumOperands();
-
- // First, we check whether this instr specifies the PredicateOperand through
- // a pair of MCOperandInfos with isPredicate() property.
- if (NumOpsRemaining >= 2 &&
- OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() &&
- OpInfo[Idx].RegClass < 0 &&
- OpInfo[Idx+1].RegClass == ARM::CCRRegClassID)
- {
- // If we are inside an IT block, get the IT condition bits maintained via
- // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond().
- // See also A2.5.2.
- if (InITBlock())
- MI.addOperand(MCOperand::CreateImm(GetITCond()));
- else
- MI.addOperand(MCOperand::CreateImm(ARMCC::AL));
- MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
- return true;
- }
-
- return false;
-}
-
-/// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process
-/// the possible Predicate and SBitModifier, to build the remaining MCOperand
-/// constituents.
-bool ARMBasicMCBuilder::TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOpsRemaining) {
-
- assert(NumOpsRemaining > 0 && "Invalid argument");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- const std::string &Name = ARMInsts[Opcode].Name;
- unsigned Idx = MI.getNumOperands();
- uint64_t TSFlags = ARMInsts[Opcode].TSFlags;
-
- // First, we check whether this instr specifies the PredicateOperand through
- // a pair of MCOperandInfos with isPredicate() property.
- if (NumOpsRemaining >= 2 &&
- OpInfo[Idx].isPredicate() && OpInfo[Idx+1].isPredicate() &&
- OpInfo[Idx].RegClass < 0 &&
- OpInfo[Idx+1].RegClass == ARM::CCRRegClassID)
- {
- // If we are inside an IT block, get the IT condition bits maintained via
- // ARMBasicMCBuilder::ITState[7:0], through ARMBasicMCBuilder::GetITCond().
- // See also A2.5.2.
- if (InITBlock())
- MI.addOperand(MCOperand::CreateImm(GetITCond()));
- else {
- if (Name.length() > 1 && Name[0] == 't') {
- // Thumb conditional branch instructions have their cond field embedded,
- // like ARM.
- //
- // A8.6.16 B
- // Check for undefined encodings.
- unsigned cond;
- if (Name == "t2Bcc") {
- if ((cond = slice(insn, 25, 22)) >= 14)
- return false;
- MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
- } else if (Name == "tBcc") {
- if ((cond = slice(insn, 11, 8)) == 14)
- return false;
- MI.addOperand(MCOperand::CreateImm(CondCode(cond)));
- } else
- MI.addOperand(MCOperand::CreateImm(ARMCC::AL));
- } else {
- // ARM instructions get their condition field from Inst{31-28}.
- // We should reject Inst{31-28} = 0b1111 as invalid encoding.
- if (!isNEONDomain(TSFlags) && getCondField(insn) == 0xF)
- return false;
- MI.addOperand(MCOperand::CreateImm(CondCode(getCondField(insn))));
- }
- }
- MI.addOperand(MCOperand::CreateReg(ARM::CPSR));
- Idx += 2;
- NumOpsRemaining -= 2;
- }
-
- if (NumOpsRemaining == 0)
- return true;
-
- // Next, if OptionalDefOperand exists, we check whether the 'S' bit is set.
- if (OpInfo[Idx].isOptionalDef() && OpInfo[Idx].RegClass==ARM::CCRRegClassID) {
- MI.addOperand(MCOperand::CreateReg(getSBit(insn) == 1 ? ARM::CPSR : 0));
- --NumOpsRemaining;
- }
-
- if (NumOpsRemaining == 0)
- return true;
- else
- return false;
-}
-
-/// RunBuildAfterHook - RunBuildAfterHook performs operations deemed necessary
-/// after BuildIt is finished.
-bool ARMBasicMCBuilder::RunBuildAfterHook(bool Status, MCInst &MI,
- uint32_t insn) {
-
- if (!SP) return Status;
-
- if (Opcode == ARM::t2IT)
- Status = SP->InitIT(slice(insn, 7, 0)) ? Status : false;
- else if (InITBlock())
- SP->UpdateIT();
-
- return Status;
-}
-
-/// Opcode, Format, and NumOperands make up an ARM Basic MCBuilder.
-ARMBasicMCBuilder::ARMBasicMCBuilder(unsigned opc, ARMFormat format,
- unsigned short num)
- : Opcode(opc), Format(format), NumOps(num), SP(0), Err(0) {
- unsigned Idx = (unsigned)format;
- assert(Idx < (array_lengthof(FuncPtrs) - 1) && "Unknown format");
- Disasm = FuncPtrs[Idx];
-}
-
-/// CreateMCBuilder - Return an ARMBasicMCBuilder that can build up the MC
-/// infrastructure of an MCInst given the Opcode and Format of the instr.
-/// Return NULL if it fails to create/return a proper builder. API clients
-/// are responsible for freeing the allocated memory. Caching can be
-/// performed by the API clients to improve performance.
-ARMBasicMCBuilder *llvm::CreateMCBuilder(unsigned Opcode, ARMFormat Format) {
- // For "Unknown format", fail by returning a NULL pointer.
- if ((unsigned)Format >= (array_lengthof(FuncPtrs) - 1)) {
- DEBUG(errs() << "Unknown format\n");
- return 0;
- }
-
- return new ARMBasicMCBuilder(Opcode, Format,
- ARMInsts[Opcode].getNumOperands());
-}
-
-/// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic
-/// operand in place of the immediate Value in the MCInst. The immediate
-/// Value has had any PC adjustment made by the caller. If the getOpInfo()
-/// function was set as part of the setupBuilderForSymbolicDisassembly() call
-/// then that function is called to get any symbolic information at the
-/// builder's Address for this instruction. If that returns non-zero then the
-/// symbolic information it returns is used to create an MCExpr and that is
-/// added as an operand to the MCInst. This function returns true if it adds
-/// an operand to the MCInst and false otherwise.
-bool ARMBasicMCBuilder::tryAddingSymbolicOperand(uint64_t Value,
- uint64_t InstSize,
- MCInst &MI) {
- if (!GetOpInfo)
- return false;
-
- struct LLVMOpInfo1 SymbolicOp;
- SymbolicOp.Value = Value;
- if (!GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp))
- return false;
-
- const MCExpr *Add = NULL;
- if (SymbolicOp.AddSymbol.Present) {
- if (SymbolicOp.AddSymbol.Name) {
- StringRef Name(SymbolicOp.AddSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Add = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Sub = NULL;
- if (SymbolicOp.SubtractSymbol.Present) {
- if (SymbolicOp.SubtractSymbol.Name) {
- StringRef Name(SymbolicOp.SubtractSymbol.Name);
- MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
- Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
- } else {
- Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
- }
- }
-
- const MCExpr *Off = NULL;
- if (SymbolicOp.Value != 0)
- Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
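- // Compose Expr = Add - Sub + Off, omitting whichever pieces are absent;
- // if all three are absent, fall back to the constant 0.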
- const MCExpr *Expr;
- if (Sub) {
- const MCExpr *LHS;
- if (Add)
- LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
- else
- LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
- else
- Expr = LHS;
- } else if (Add) {
- if (Off != 0)
- Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
- else
- Expr = Add;
- } else {
- if (Off != 0)
- Expr = Off;
- else
- Expr = MCConstantExpr::Create(0, *Ctx);
- }
-
- if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
- MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
- else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
- MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
- else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
- MI.addOperand(MCOperand::CreateExpr(Expr));
- else
- assert(0 && "bad SymbolicOp.VariantKind");
-
- return true;
-}
diff --git a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h b/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
deleted file mode 100644
index a7ba141..0000000
--- a/lib/Target/ARM/Disassembler/ARMDisassemblerCore.h
+++ /dev/null
@@ -1,336 +0,0 @@
-//===- ARMDisassemblerCore.h - ARM disassembler helpers ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-//
-// The first part defines the enumeration type of ARM instruction format, which
-// specifies the encoding used by the instruction, as well as a helper function
-// to convert the enums to printable char strings.
-//
-// It also contains code to represent the concepts of Builder and DisassembleFP
-// to solve the problem of disassembling an ARM instr.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARMDISASSEMBLERCORE_H
-#define ARMDISASSEMBLERCORE_H
-
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm-c/Disassembler.h"
-#include "ARMBaseInstrInfo.h"
-#include "ARMRegisterInfo.h"
-#include "ARMDisassembler.h"
-
-namespace llvm {
-class MCContext;
-
-class ARMUtils {
-public:
- static const char *OpcodeName(unsigned Opcode);
-};
-
-/////////////////////////////////////////////////////
-// //
-// Enums and Utilities for ARM Instruction Format //
-// //
-/////////////////////////////////////////////////////
-
-#define ARM_FORMATS \
- ENTRY(ARM_FORMAT_PSEUDO, 0) \
- ENTRY(ARM_FORMAT_MULFRM, 1) \
- ENTRY(ARM_FORMAT_BRFRM, 2) \
- ENTRY(ARM_FORMAT_BRMISCFRM, 3) \
- ENTRY(ARM_FORMAT_DPFRM, 4) \
- ENTRY(ARM_FORMAT_DPSOREGFRM, 5) \
- ENTRY(ARM_FORMAT_LDFRM, 6) \
- ENTRY(ARM_FORMAT_STFRM, 7) \
- ENTRY(ARM_FORMAT_LDMISCFRM, 8) \
- ENTRY(ARM_FORMAT_STMISCFRM, 9) \
- ENTRY(ARM_FORMAT_LDSTMULFRM, 10) \
- ENTRY(ARM_FORMAT_LDSTEXFRM, 11) \
- ENTRY(ARM_FORMAT_ARITHMISCFRM, 12) \
- ENTRY(ARM_FORMAT_SATFRM, 13) \
- ENTRY(ARM_FORMAT_EXTFRM, 14) \
- ENTRY(ARM_FORMAT_VFPUNARYFRM, 15) \
- ENTRY(ARM_FORMAT_VFPBINARYFRM, 16) \
- ENTRY(ARM_FORMAT_VFPCONV1FRM, 17) \
- ENTRY(ARM_FORMAT_VFPCONV2FRM, 18) \
- ENTRY(ARM_FORMAT_VFPCONV3FRM, 19) \
- ENTRY(ARM_FORMAT_VFPCONV4FRM, 20) \
- ENTRY(ARM_FORMAT_VFPCONV5FRM, 21) \
- ENTRY(ARM_FORMAT_VFPLDSTFRM, 22) \
- ENTRY(ARM_FORMAT_VFPLDSTMULFRM, 23) \
- ENTRY(ARM_FORMAT_VFPMISCFRM, 24) \
- ENTRY(ARM_FORMAT_THUMBFRM, 25) \
- ENTRY(ARM_FORMAT_MISCFRM, 26) \
- ENTRY(ARM_FORMAT_NEONGETLNFRM, 27) \
- ENTRY(ARM_FORMAT_NEONSETLNFRM, 28) \
- ENTRY(ARM_FORMAT_NEONDUPFRM, 29) \
- ENTRY(ARM_FORMAT_NLdSt, 30) \
- ENTRY(ARM_FORMAT_N1RegModImm, 31) \
- ENTRY(ARM_FORMAT_N2Reg, 32) \
- ENTRY(ARM_FORMAT_NVCVT, 33) \
- ENTRY(ARM_FORMAT_NVecDupLn, 34) \
- ENTRY(ARM_FORMAT_N2RegVecShL, 35) \
- ENTRY(ARM_FORMAT_N2RegVecShR, 36) \
- ENTRY(ARM_FORMAT_N3Reg, 37) \
- ENTRY(ARM_FORMAT_N3RegVecSh, 38) \
- ENTRY(ARM_FORMAT_NVecExtract, 39) \
- ENTRY(ARM_FORMAT_NVecMulScalar, 40) \
- ENTRY(ARM_FORMAT_NVTBL, 41)
-
-// ARM instruction format specifies the encoding used by the instruction.
-#define ENTRY(n, v) n = v,
-typedef enum {
- ARM_FORMATS
- ARM_FORMAT_NA
-} ARMFormat;
-#undef ENTRY
-
-// Converts enum to const char*.
-static inline const char *stringForARMFormat(ARMFormat form) {
-#define ENTRY(n, v) case n: return #n;
- switch(form) {
- ARM_FORMATS
- case ARM_FORMAT_NA:
- default:
- return "";
- }
-#undef ENTRY
-}
-
-/// Expands on the enum definitions from ARMBaseInstrInfo.h.
-/// They are being used by the disassembler implementation.
-namespace ARMII {
- enum {
- NEONRegMask = 15,
- GPRRegMask = 15,
- NEON_RegRdShift = 12,
- NEON_D_BitShift = 22,
- NEON_RegRnShift = 16,
- NEON_N_BitShift = 7,
- NEON_RegRmShift = 0,
- NEON_M_BitShift = 5
- };
-}
-
-/// Utility function for extracting [From, To] bits from a uint32_t.
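-/// For example, slice(0xABCD, 15, 12) == 0xA and slice(0xABCD, 7, 0) == 0xCD.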
-static inline unsigned slice(uint32_t Bits, unsigned From, unsigned To) {
- assert(From < 32 && To < 32 && From >= To);
- return (Bits >> To) & ((1 << (From - To + 1)) - 1);
-}
-
-/// Utility function for setting [From, To] bits to Val for a uint32_t.
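-/// For example, with Bits == 0, setSlice(Bits, 7, 4, 0xA) leaves Bits == 0xA0.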
-static inline void setSlice(unsigned &Bits, unsigned From, unsigned To,
- unsigned Val) {
- assert(From < 32 && To < 32 && From >= To);
- uint32_t Mask = ((1 << (From - To + 1)) - 1);
- Bits &= ~(Mask << To);
- Bits |= (Val & Mask) << To;
-}
-
-// Return an integer result equal to the number of bits of x that are ones.
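-// Uses Kernighan's method: each iteration clears the lowest set bit, so the
-// loop body executes once per set bit; e.g. BitCount(0xF0F0) == 8.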
-static inline uint32_t BitCount(uint64_t x) {
- // c accumulates the total bits set in x.
- uint32_t c;
- for (c = 0; x; ++c)
- x &= x - 1; // Clear the least significant bit set.
- return c;
-}
-
-static inline bool BitIsSet(const uint64_t value, const uint64_t bit) {
- return (value & (1ull << bit)) != 0;
-}
-
-static inline bool BitIsClear(const uint64_t value, const uint64_t bit) {
- return (value & (1ull << bit)) == 0;
-}
-
-/// Various utilities for checking the target specific flags.
-
-/// A unary data processing instruction doesn't have an Rn operand.
-static inline bool isUnaryDP(uint64_t TSFlags) {
- return (TSFlags & ARMII::UnaryDP);
-}
-
-/// A NEON Domain instruction has cond field (Inst{31-28}) as 0b1111.
-static inline bool isNEONDomain(uint64_t TSFlags) {
- return (TSFlags & ARMII::DomainNEON) ||
- (TSFlags & ARMII::DomainNEONA8);
-}
-
-/// This four-bit field describes the addressing mode used.
-/// See also ARMBaseInstrInfo.h.
-static inline unsigned getAddrMode(uint64_t TSFlags) {
- return (TSFlags & ARMII::AddrModeMask);
-}
-
-/// {IndexModePre, IndexModePost}
-/// Only valid for load and store ops.
-/// See also ARMBaseInstrInfo.h.
-static inline unsigned getIndexMode(uint64_t TSFlags) {
- return (TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift;
-}
-
-/// Pre-/post-indexed operations define an extra $base_wb in the OutOperandList.
-static inline bool isPrePostLdSt(uint64_t TSFlags) {
- return (TSFlags & ARMII::IndexModeMask) != 0;
-}
-
-// Forward declaration.
-class ARMBasicMCBuilder;
-
-// Builder Object is mostly ignored except in some Thumb disassemble functions.
-typedef ARMBasicMCBuilder *BO;
-
-/// DisassembleFP - DisassembleFP points to a function that disassembles an insn
-/// and builds the MCOperand list upon disassembly. It returns false on failure
-/// or true on success. The number of operands added is updated upon success.
-typedef bool (*DisassembleFP)(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO Builder);
-
-/// CreateMCBuilder - Return an ARMBasicMCBuilder that can build up the MC
-/// infrastructure of an MCInst given the Opcode and Format of the instr.
-/// Return NULL if it fails to create/return a proper builder. API clients
-/// are responsible for freeing the allocated memory. Caching can be
-/// performed by the API clients to improve performance.
-extern ARMBasicMCBuilder *CreateMCBuilder(unsigned Opcode, ARMFormat Format);
-
-/// ARMBasicMCBuilder - ARMBasicMCBuilder represents an ARM MCInst builder that
-/// knows how to build up the MCOperand list.
-class ARMBasicMCBuilder {
- friend ARMBasicMCBuilder *CreateMCBuilder(unsigned Opcode, ARMFormat Format);
- unsigned Opcode;
- ARMFormat Format;
- unsigned short NumOps;
- DisassembleFP Disasm;
- Session *SP;
- int Err; // !=0 if the builder encounters some error condition during build.
-
-private:
- /// Opcode, Format, and NumOperands make up an ARM Basic MCBuilder.
- ARMBasicMCBuilder(unsigned opc, ARMFormat format, unsigned short num);
-
-public:
- ARMBasicMCBuilder(ARMBasicMCBuilder &B)
- : Opcode(B.Opcode), Format(B.Format), NumOps(B.NumOps), Disasm(B.Disasm),
- SP(B.SP), GetOpInfo(0), DisInfo(0), Ctx(0) {
- Err = 0;
- }
-
- virtual ~ARMBasicMCBuilder() {}
-
- void SetSession(Session *sp) {
- SP = sp;
- }
-
- void SetErr(int ErrCode) {
- Err = ErrCode;
- }
-
- /// DoPredicateOperands - DoPredicateOperands processes the predicate operands
- /// of some Thumb instructions which come before the reglist operands. It
- /// returns true if the two predicate operands have been processed.
- bool DoPredicateOperands(MCInst& MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOpsRemaining);
-
- /// TryPredicateAndSBitModifier - TryPredicateAndSBitModifier tries to process
- /// the possible Predicate and SBitModifier, to build the remaining MCOperand
- /// constituents.
- bool TryPredicateAndSBitModifier(MCInst& MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOpsRemaining);
-
- /// InITBlock - InITBlock returns true if we are inside an IT block.
- bool InITBlock() {
- if (SP)
- return SP->ITCounter > 0;
-
- return false;
- }
-
- /// Build - Build delegates to BuildIt to perform the heavy lifting. After
- /// that, it invokes RunBuildAfterHook where some housekeeping can be done.
- virtual bool Build(MCInst &MI, uint32_t insn) {
- bool Status = BuildIt(MI, insn);
- return RunBuildAfterHook(Status, MI, insn);
- }
-
- /// BuildIt - BuildIt performs the build step for this ARM Basic MC Builder.
- /// The general idea is to set the Opcode for the MCInst, followed by adding
- /// the appropriate MCOperands to the MCInst. ARM Basic MC Builder delegates
- /// to the Format-specific disassemble function for disassembly, followed by
- /// TryPredicateAndSBitModifier() for PredicateOperand and OptionalDefOperand
- /// which follow the Dst/Src Operands.
- virtual bool BuildIt(MCInst &MI, uint32_t insn);
-
- /// RunBuildAfterHook - RunBuildAfterHook performs operations deemed necessary
- /// after BuildIt is finished.
- virtual bool RunBuildAfterHook(bool Status, MCInst &MI, uint32_t insn);
-
-private:
- /// Get condition of the current IT instruction.
- unsigned GetITCond() {
- assert(SP);
- return slice(SP->ITState, 7, 4);
- }
-
-private:
- //
- // Hooks for symbolic disassembly via the public 'C' interface.
- //
- // The function to get the symbolic information for operands.
- LLVMOpInfoCallback GetOpInfo;
- // The pointer to the block of symbolic information for above call back.
- void *DisInfo;
- // The assembly context for creating symbols and MCExprs in place of
- // immediate operands when there is symbolic information.
- MCContext *Ctx;
- // The address of the instruction being disassembled.
- uint64_t Address;
-
-public:
- void setupBuilderForSymbolicDisassembly(LLVMOpInfoCallback getOpInfo,
- void *disInfo, MCContext *ctx,
- uint64_t address) {
- GetOpInfo = getOpInfo;
- DisInfo = disInfo;
- Ctx = ctx;
- Address = address;
- }
-
- uint64_t getBuilderAddress() const { return Address; }
-
- /// tryAddingSymbolicOperand - tryAddingSymbolicOperand tries to add a symbolic
- /// operand in place of the immediate Value in the MCInst. The immediate
- /// Value has had any PC adjustment made by the caller. If the getOpInfo()
- /// function was set as part of the setupBuilderForSymbolicDisassembly() call
- /// then that function is called to get any symbolic information at the
- /// builder's Address for this instruction. If that returns non-zero then the
- /// symbolic information it returns is used to create an MCExpr and that is
- /// added as an operand to the MCInst. This function returns true if it adds
- /// an operand to the MCInst and false otherwise.
- bool tryAddingSymbolicOperand(uint64_t Value, uint64_t InstSize, MCInst &MI);
-
-};
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/ARM/Disassembler/CMakeLists.txt b/lib/Target/ARM/Disassembler/CMakeLists.txt
index b23dd6b..da87751 100644
--- a/lib/Target/ARM/Disassembler/CMakeLists.txt
+++ b/lib/Target/ARM/Disassembler/CMakeLists.txt
@@ -2,7 +2,6 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
add_llvm_library(LLVMARMDisassembler
ARMDisassembler.cpp
- ARMDisassemblerCore.cpp
)
# workaround for hanging compilation on MSVC8, 9 and 10
if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
@@ -11,4 +10,12 @@ set_property(
PROPERTY COMPILE_FLAGS "/Od"
)
endif()
-add_dependencies(LLVMARMDisassembler ARMCodeGenTable_gen)
+add_dependencies(LLVMARMDisassembler ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMDisassembler
+ LLVMARMCodeGen
+ LLVMARMDesc
+ LLVMARMInfo
+ LLVMMC
+ LLVMSupport
+ )
diff --git a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h b/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
deleted file mode 100644
index 834c6f6..0000000
--- a/lib/Target/ARM/Disassembler/ThumbDisassemblerCore.h
+++ /dev/null
@@ -1,2459 +0,0 @@
-//===- ThumbDisassemblerCore.h - Thumb disassembler helpers -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the ARM Disassembler.
-// It contains code for disassembling a Thumb instr. It is to be included by
-// ARMDisassemblerCore.cpp because it contains the static DisassembleThumbFrm()
-// function which acts as the dispatcher to disassemble a Thumb instruction.
-//
-//===----------------------------------------------------------------------===//
-
-///////////////////////////////
-// //
-// Utility Functions //
-// //
-///////////////////////////////
-
-// Utilities for 16-bit Thumb instructions.
-/*
-15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
- [ tRt ]
- [ tRm ] [ tRn ] [ tRd ]
- D [ Rm ] [ Rd ]
-
- [ imm3]
- [ imm5 ]
- i [ imm5 ]
- [ imm7 ]
- [ imm8 ]
- [ imm11 ]
-
- [ cond ]
-*/
-
-// Extract tRt: Inst{10-8}.
-static inline unsigned getT1tRt(uint32_t insn) {
- return slice(insn, 10, 8);
-}
-
-// Extract tRm: Inst{8-6}.
-static inline unsigned getT1tRm(uint32_t insn) {
- return slice(insn, 8, 6);
-}
-
-// Extract tRn: Inst{5-3}.
-static inline unsigned getT1tRn(uint32_t insn) {
- return slice(insn, 5, 3);
-}
-
-// Extract tRd: Inst{2-0}.
-static inline unsigned getT1tRd(uint32_t insn) {
- return slice(insn, 2, 0);
-}
-
-// Extract [D:Rd]: Inst{7:2-0}.
-static inline unsigned getT1Rd(uint32_t insn) {
- return slice(insn, 7, 7) << 3 | slice(insn, 2, 0);
-}
-
-// Extract Rm: Inst{6-3}.
-static inline unsigned getT1Rm(uint32_t insn) {
- return slice(insn, 6, 3);
-}
-
-// Extract imm3: Inst{8-6}.
-static inline unsigned getT1Imm3(uint32_t insn) {
- return slice(insn, 8, 6);
-}
-
-// Extract imm5: Inst{10-6}.
-static inline unsigned getT1Imm5(uint32_t insn) {
- return slice(insn, 10, 6);
-}
-
-// Extract i:imm5: Inst{9:7-3}.
-static inline unsigned getT1Imm6(uint32_t insn) {
- return slice(insn, 9, 9) << 5 | slice(insn, 7, 3);
-}
-
-// Extract imm7: Inst{6-0}.
-static inline unsigned getT1Imm7(uint32_t insn) {
- return slice(insn, 6, 0);
-}
-
-// Extract imm8: Inst{7-0}.
-static inline unsigned getT1Imm8(uint32_t insn) {
- return slice(insn, 7, 0);
-}
-
-// Extract imm11: Inst{10-0}.
-static inline unsigned getT1Imm11(uint32_t insn) {
- return slice(insn, 10, 0);
-}
-
-// Extract cond: Inst{11-8}.
-static inline unsigned getT1Cond(uint32_t insn) {
- return slice(insn, 11, 8);
-}
-
-static inline bool IsGPR(unsigned RegClass) {
- return RegClass == ARM::GPRRegClassID || RegClass == ARM::rGPRRegClassID;
-}
-
-// Utilities for 32-bit Thumb instructions.
-
-static inline bool BadReg(uint32_t n) { return n == 13 || n == 15; }
-
-// Extract imm4: Inst{19-16}.
-static inline unsigned getImm4(uint32_t insn) {
- return slice(insn, 19, 16);
-}
-
-// Extract imm3: Inst{14-12}.
-static inline unsigned getImm3(uint32_t insn) {
- return slice(insn, 14, 12);
-}
-
-// Extract imm8: Inst{7-0}.
-static inline unsigned getImm8(uint32_t insn) {
- return slice(insn, 7, 0);
-}
-
-// A8.6.61 LDRB (immediate, Thumb) and friends
-// +/-: Inst{9}
-// imm8: Inst{7-0}
-static inline int decodeImm8(uint32_t insn) {
- int Offset = getImm8(insn);
- return slice(insn, 9, 9) ? Offset : -Offset;
-}
-
-// Extract imm12: Inst{11-0}.
-static inline unsigned getImm12(uint32_t insn) {
- return slice(insn, 11, 0);
-}
-
-// A8.6.63 LDRB (literal) and friends
-// +/-: Inst{23}
-// imm12: Inst{11-0}
-static inline int decodeImm12(uint32_t insn) {
- int Offset = getImm12(insn);
- return slice(insn, 23, 23) ? Offset : -Offset;
-}
-
-// Extract imm2: Inst{7-6}.
-static inline unsigned getImm2(uint32_t insn) {
- return slice(insn, 7, 6);
-}
-
-// For BFI, BFC, t2SBFX, and t2UBFX.
-// Extract lsb: Inst{14-12:7-6}.
-static inline unsigned getLsb(uint32_t insn) {
- return getImm3(insn) << 2 | getImm2(insn);
-}
-
-// For BFI and BFC.
-// Extract msb: Inst{4-0}.
-static inline unsigned getMsb(uint32_t insn) {
- return slice(insn, 4, 0);
-}
-
-// For t2SBFX and t2UBFX.
-// Extract widthminus1: Inst{4-0}.
-static inline unsigned getWidthMinus1(uint32_t insn) {
- return slice(insn, 4, 0);
-}
-
-// For t2ADDri12 and t2SUBri12.
-// imm12 = i:imm3:imm8;
-static inline unsigned getIImm3Imm8(uint32_t insn) {
- return slice(insn, 26, 26) << 11 | getImm3(insn) << 8 | getImm8(insn);
-}
-
-// For t2MOVi16 and t2MOVTi16.
-// imm16 = imm4:i:imm3:imm8;
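-// For example, imm4 = 0xA, i = 1, imm3 = 0b010, imm8 = 0x34 yields 0xAA34.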
-static inline unsigned getImm16(uint32_t insn) {
- return getImm4(insn) << 12 | slice(insn, 26, 26) << 11 |
- getImm3(insn) << 8 | getImm8(insn);
-}
-
-// Inst{5-4} encodes the shift type.
-static inline unsigned getShiftTypeBits(uint32_t insn) {
- return slice(insn, 5, 4);
-}
-
-// Inst{14-12}:Inst{7-6} encodes the imm5 shift amount.
-static inline unsigned getShiftAmtBits(uint32_t insn) {
- return getImm3(insn) << 2 | getImm2(insn);
-}
-
-// A8.6.17 BFC
-// Encoding T1 ARMv6T2, ARMv7
-// LLVM-specific encoding for #<lsb> and #<width>
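-// For example, lsb = 8 and msb = 15 give mask = ~0x0000FF00 == 0xFFFF00FF.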
-static inline bool getBitfieldInvMask(uint32_t insn, uint32_t &mask) {
- uint32_t lsb = getImm3(insn) << 2 | getImm2(insn);
- uint32_t msb = getMsb(insn);
- uint32_t Val = 0;
- if (msb < lsb) {
- DEBUG(errs() << "Encoding error: msb < lsb\n");
- return false;
- }
- for (uint32_t i = lsb; i <= msb; ++i)
- Val |= (1 << i);
- mask = ~Val;
- return true;
-}
-
-// A8.4 Shifts applied to a register
-// A8.4.1 Constant shifts
-// A8.4.3 Pseudocode details of instruction-specified shifts and rotates
-//
-// decodeImmShift() returns the shift amount and the shift opcode.
-// Note that, as of Jan-06-2010, LLVM does not support rrx shifted operands yet.
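-//
-// For example, decodeImmShift(0b01, 0, ShOp) sets ShOp = ARM_AM::lsr and
-// returns 32 (imm5 == 0 encodes LSR #32), while decodeImmShift(0b00, 0, ShOp)
-// sets ShOp = ARM_AM::no_shift and returns 0.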
-static inline unsigned decodeImmShift(unsigned bits2, unsigned imm5,
- ARM_AM::ShiftOpc &ShOp) {
-
- assert(imm5 < 32 && "Invalid imm5 argument");
- switch (bits2) {
- default: assert(0 && "No such value");
- case 0:
- ShOp = (imm5 == 0 ? ARM_AM::no_shift : ARM_AM::lsl);
- return imm5;
- case 1:
- ShOp = ARM_AM::lsr;
- return (imm5 == 0 ? 32 : imm5);
- case 2:
- ShOp = ARM_AM::asr;
- return (imm5 == 0 ? 32 : imm5);
- case 3:
- ShOp = (imm5 == 0 ? ARM_AM::rrx : ARM_AM::ror);
- return (imm5 == 0 ? 1 : imm5);
- }
-}
-
-// A6.3.2 Modified immediate constants in Thumb instructions
-//
-// ThumbExpandImm() returns the modified immediate constant given an imm12 for
-// Thumb data-processing instructions with modified immediate.
-// See also A6.3.1 Data-processing (modified immediate).
-static inline unsigned ThumbExpandImm(unsigned imm12) {
- assert(imm12 <= 0xFFF && "Invalid imm12 argument");
-
- // If the leading two bits are 0b00, the modified immediate constant is
- // obtained by splatting the low 8 bits into the first byte, every other byte,
- // or every byte of a 32-bit value.
- //
- // Otherwise, a rotate right of '1':imm12<6:0> by the amount imm12<11:7> is
- // performed.
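- //
- // For example, imm12 = 0x1FF (control bits 0b01, imm8 = 0xFF) expands to
- // 0x00FF00FF, and imm12 = 0x3FF (control bits 0b11) expands to 0xFFFFFFFF.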
-
- if (slice(imm12, 11, 10) == 0) {
- unsigned short control = slice(imm12, 9, 8);
- unsigned imm8 = slice(imm12, 7, 0);
- switch (control) {
- default:
- assert(0 && "No such value");
- return 0;
- case 0:
- return imm8;
- case 1:
- return imm8 << 16 | imm8;
- case 2:
- return imm8 << 24 | imm8 << 8;
- case 3:
- return imm8 << 24 | imm8 << 16 | imm8 << 8 | imm8;
- }
- } else {
- // A rotate is required.
- unsigned Val = 1 << 7 | slice(imm12, 6, 0);
- unsigned Amt = slice(imm12, 11, 7);
- return ARM_AM::rotr32(Val, Amt);
- }
-}
-
-static inline int decodeImm32_B_EncodingT3(uint32_t insn) {
- bool S = slice(insn, 26, 26);
- bool J1 = slice(insn, 13, 13);
- bool J2 = slice(insn, 11, 11);
- unsigned Imm21 = slice(insn, 21, 16) << 12 | slice(insn, 10, 0) << 1;
- if (S) Imm21 |= 1 << 20;
- if (J2) Imm21 |= 1 << 19;
- if (J1) Imm21 |= 1 << 18;
-
- return SignExtend32<21>(Imm21);
-}
-
-static inline int decodeImm32_B_EncodingT4(uint32_t insn) {
- unsigned S = slice(insn, 26, 26);
- bool I1 = slice(insn, 13, 13) == S;
- bool I2 = slice(insn, 11, 11) == S;
- unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 0) << 1;
- if (S) Imm25 |= 1 << 24;
- if (I1) Imm25 |= 1 << 23;
- if (I2) Imm25 |= 1 << 22;
-
- return SignExtend32<25>(Imm25);
-}
-
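-// decodeImm32_BL is identical to decodeImm32_B_EncodingT4: BL uses the same
-// S/I1/I2-based 25-bit immediate scheme as B encoding T4.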
-static inline int decodeImm32_BL(uint32_t insn) {
- unsigned S = slice(insn, 26, 26);
- bool I1 = slice(insn, 13, 13) == S;
- bool I2 = slice(insn, 11, 11) == S;
- unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 0) << 1;
- if (S) Imm25 |= 1 << 24;
- if (I1) Imm25 |= 1 << 23;
- if (I2) Imm25 |= 1 << 22;
-
- return SignExtend32<25>(Imm25);
-}
-
-static inline int decodeImm32_BLX(uint32_t insn) {
- unsigned S = slice(insn, 26, 26);
- bool I1 = slice(insn, 13, 13) == S;
- bool I2 = slice(insn, 11, 11) == S;
- unsigned Imm25 = slice(insn, 25, 16) << 12 | slice(insn, 10, 1) << 2;
- if (S) Imm25 |= 1 << 24;
- if (I1) Imm25 |= 1 << 23;
- if (I2) Imm25 |= 1 << 22;
-
- return SignExtend32<25>(Imm25);
-}
-
-// See, for example, A8.6.221 SXTAB16.
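-// The two-bit rotate field selects a rotation amount of 0, 8, 16, or 24 bits.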
-static inline unsigned decodeRotate(uint32_t insn) {
- unsigned rotate = slice(insn, 5, 4);
- return rotate << 3;
-}
-
-///////////////////////////////////////////////
-// //
-// Thumb1 instruction disassembly functions. //
-// //
-///////////////////////////////////////////////
-
-// See "Utilities for 16-bit Thumb instructions" for register naming convention.
-
-// A6.2.1 Shift (immediate), add, subtract, move, and compare
-//
-// shift immediate: tRd CPSR tRn imm5
-// add/sub register: tRd CPSR tRn tRm
-// add/sub 3-bit immediate: tRd CPSR tRn imm3
-// add/sub 8-bit immediate: tRt CPSR tRt(TIED_TO) imm8
-// mov/cmp immediate: tRt [CPSR] imm8 (CPSR present for mov)
-//
-// Special case:
-// tMOVSr: tRd tRn
-static bool DisassembleThumb1General(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID
- && "Invalid arguments");
-
- bool Imm3 = (Opcode == ARM::tADDi3 || Opcode == ARM::tSUBi3);
-
- // UseRt implies the use of imm8.
- bool UseRt = (Opcode == ARM::tADDi8 || Opcode == ARM::tSUBi8 ||
- Opcode == ARM::tMOVi8 || Opcode == ARM::tCMPi8);
-
- // Add the destination operand.
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::tGPRRegClassID,
- UseRt ? getT1tRt(insn) : getT1tRd(insn))));
- ++OpIdx;
-
- // Check whether the next operand to be added is a CCR Register.
- if (OpInfo[OpIdx].RegClass == ARM::CCRRegClassID) {
- assert(OpInfo[OpIdx].isOptionalDef() && "Optional def operand expected");
- MI.addOperand(MCOperand::CreateReg(B->InITBlock() ? 0 : ARM::CPSR));
- ++OpIdx;
- }
-
- // Check whether the next operand to be added is a Thumb1 Register.
- assert(OpIdx < NumOps && "More operands expected");
- if (OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) {
- // For UseRt, the reg operand is tied to the first reg operand.
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, ARM::tGPRRegClassID,
- UseRt ? getT1tRt(insn) : getT1tRn(insn))));
- ++OpIdx;
- }
-
- // Special case for tMOVSr.
- if (OpIdx == NumOps)
- return true;
-
- // The next available operand is either a reg operand or an imm operand.
- if (OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) {
- // Three register operand instructions.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRm(insn))));
- } else {
- assert(OpInfo[OpIdx].RegClass < 0 &&
- !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()
- && "Pure imm operand expected");
- unsigned Imm = 0;
- if (UseRt)
- Imm = getT1Imm8(insn);
- else if (Imm3)
- Imm = getT1Imm3(insn);
- else {
- Imm = getT1Imm5(insn);
- ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 12, 11));
- getImmShiftSE(ShOp, Imm);
- }
- MI.addOperand(MCOperand::CreateImm(Imm));
- }
- ++OpIdx;
-
- return true;
-}
-
-// A6.2.2 Data-processing
-//
-// tCMPr, tTST, tCMN: tRd tRn
-// tMVN, tRSB: tRd CPSR tRn
-// Others: tRd CPSR tRd(TIED_TO) tRn
-static bool DisassembleThumb1DP(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
- (OpInfo[1].RegClass == ARM::CCRRegClassID
- || OpInfo[1].RegClass == ARM::tGPRRegClassID)
- && "Invalid arguments");
-
- // Add the destination operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRd(insn))));
- ++OpIdx;
-
- // Check whether the next operand to be added is a CCR Register.
- if (OpInfo[OpIdx].RegClass == ARM::CCRRegClassID) {
- assert(OpInfo[OpIdx].isOptionalDef() && "Optional def operand expected");
- MI.addOperand(MCOperand::CreateReg(B->InITBlock() ? 0 : ARM::CPSR));
- ++OpIdx;
- }
-
- // We have either { tRd(TIED_TO), tRn } or { tRn } remaining.
- // Process the TIED_TO operand first.
-
- assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID
- && "Thumb reg operand expected");
- int Idx;
- if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
- // The reg operand is tied to the first reg operand.
- MI.addOperand(MI.getOperand(Idx));
- ++OpIdx;
- }
-
- // Process possible next reg operand.
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID) {
- // Add tRn operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRn(insn))));
- ++OpIdx;
- }
-
- return true;
-}
-
-// A6.2.3 Special data instructions and branch and exchange
-//
-// tADDhirr: Rd Rd(TIED_TO) Rm
-// tCMPhir: Rd Rm
-// tMOVr, tMOVgpr2gpr, tMOVgpr2tgpr, tMOVtgpr2gpr: Rd|tRd Rm|tRn
-// tBX: Rm
-// tBX_RET: 0 operand
-// tBX_RET_vararg: Rm
-// tBLXr_r9: Rm
-// tBRIND: Rm
-static bool DisassembleThumb1Special(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- // tBX_RET has no operands.
- if (NumOps == 0)
- return true;
-
- // BX/BLX/tBRIND (indirect branch, i.e., mov pc, Rm) has 1 reg operand: Rm.
- if (Opcode==ARM::tBLXr_r9 || Opcode==ARM::tBX || Opcode==ARM::tBRIND) {
- if (Opcode == ARM::tBLXr_r9) {
- // Handling the two predicate operands before the reg operand.
- if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
- return false;
- NumOpsAdded += 2;
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- getT1Rm(insn))));
- NumOpsAdded += 1;
-
- if (Opcode == ARM::tBX) {
- // Handling the two predicate operands after the reg operand.
- if (!B->DoPredicateOperands(MI, Opcode, insn, NumOps))
- return false;
- NumOpsAdded += 2;
- }
-
- return true;
- }
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- // Add the destination operand.
- unsigned RegClass = OpInfo[OpIdx].RegClass;
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass,
- IsGPR(RegClass) ? getT1Rd(insn)
- : getT1tRd(insn))));
- ++OpIdx;
-
- // We have either { Rd(TIED_TO), Rm } or { Rm|tRn } remaining.
- // Process the TIED_TO operand first.
-
- assert(OpIdx < NumOps && "More operands expected");
- int Idx;
- if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
- // The reg operand is tied to the first reg operand.
- MI.addOperand(MI.getOperand(Idx));
- ++OpIdx;
- }
-
- // The next reg operand is either Rm or tRn.
- assert(OpIdx < NumOps && "More operands expected");
- RegClass = OpInfo[OpIdx].RegClass;
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RegClass,
- IsGPR(RegClass) ? getT1Rm(insn)
- : getT1tRn(insn))));
- ++OpIdx;
-
- return true;
-}
-
-// A8.6.59 LDR (literal)
-//
-// tLDRpci: tRt imm8*4
-static bool DisassembleThumb1LdPC(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
- (OpInfo[1].RegClass < 0 &&
- !OpInfo[1].isPredicate() &&
- !OpInfo[1].isOptionalDef())
- && "Invalid arguments");
-
- // Add the destination operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRt(insn))));
-
- // And the (imm8 << 2) operand.
- MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn) << 2));
-
- NumOpsAdded = 2;
-
- return true;
-}
-
-// Thumb specific addressing modes (see ARMInstrThumb.td):
-//
-// t_addrmode_rr := reg + reg
-//
-// t_addrmode_s4 := reg + reg
-// reg + imm5 * 4
-//
-// t_addrmode_s2 := reg + reg
-// reg + imm5 * 2
-//
-// t_addrmode_s1 := reg + reg
-// reg + imm5
-//
-// t_addrmode_sp := sp + imm8 * 4
-//
-
-// A8.6.63 LDRB (literal)
-// A8.6.79 LDRSB (literal)
-// A8.6.75 LDRH (literal)
-// A8.6.83 LDRSH (literal)
-// A8.6.59 LDR (literal)
-//
-// These instrs calculate an address from the PC value and an immediate offset.
-// Rd Rn=PC (+/-)imm12 (+ if Inst{23} == 0b1)
-static bool DisassembleThumb2Ldpci(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps >= 2 &&
- OpInfo[0].RegClass == ARM::GPRRegClassID &&
- OpInfo[1].RegClass < 0 &&
- "Expect >= 2 operands, first as reg, and second as imm operand");
-
- // Build the register operand, followed by the (+/-)imm12 immediate.
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRd(insn))));
-
- MI.addOperand(MCOperand::CreateImm(decodeImm12(insn)));
-
- NumOpsAdded = 2;
-
- return true;
-}
-
-
-// A6.2.4 Load/store single data item
-//
-// Load/Store Register (reg|imm): tRd tRn imm5|tRm
-// Load Register Signed Byte|Halfword: tRd tRn tRm
-static bool DisassembleThumb1LdSt(unsigned opA, MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- assert(NumOps >= 2
- && OpInfo[0].RegClass == ARM::tGPRRegClassID
- && OpInfo[1].RegClass == ARM::tGPRRegClassID
- && "Expect >= 2 operands and first two as thumb reg operands");
-
- // Add the destination reg and the base reg.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRd(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRn(insn))));
- OpIdx = 2;
-
- // We have either { imm5 } or { tRm } remaining.
- // Note that STR/LDR (register) should skip the imm5 offset operand for
- // t_addrmode_s[1|2|4].
-
- assert(OpIdx < NumOps && "More operands expected");
-
- if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate() &&
- !OpInfo[OpIdx].isOptionalDef()) {
- // Table A6-5 16-bit Thumb Load/store instructions
- // opA = 0b0101 for STR/LDR (register) and friends.
- // Otherwise, we have STR/LDR (immediate) and friends.
- assert(opA != 5 && "Immediate operand expected for this opcode");
- MI.addOperand(MCOperand::CreateImm(getT1Imm5(insn)));
- ++OpIdx;
- } else {
- // The next reg operand is tRm, the offset.
- assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass == ARM::tGPRRegClassID
- && "Thumb reg operand expected");
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRm(insn))));
- ++OpIdx;
- }
- return true;
-}
-
-// A6.2.4 Load/store single data item
-//
-// Load/Store Register SP relative: tRt ARM::SP imm8
-static bool DisassembleThumb1LdStSP(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert((Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi)
- && "Unexpected opcode");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps >= 3 &&
- OpInfo[0].RegClass == ARM::tGPRRegClassID &&
- OpInfo[1].RegClass == ARM::GPRRegClassID &&
- (OpInfo[2].RegClass < 0 &&
- !OpInfo[2].isPredicate() &&
- !OpInfo[2].isOptionalDef())
- && "Invalid arguments");
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRt(insn))));
- MI.addOperand(MCOperand::CreateReg(ARM::SP));
- MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn)));
- NumOpsAdded = 3;
- return true;
-}
-
-// Table A6-1 16-bit Thumb instruction encoding
-// A8.6.10 ADR
-//
-// tADDrPCi: tRt imm8
-static bool DisassembleThumb1AddPCi(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(Opcode == ARM::tADDrPCi && "Unexpected opcode");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
- (OpInfo[1].RegClass < 0 &&
- !OpInfo[1].isPredicate() &&
- !OpInfo[1].isOptionalDef())
- && "Invalid arguments");
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRt(insn))));
- MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn)));
- NumOpsAdded = 2;
- return true;
-}
-
-// Table A6-1 16-bit Thumb instruction encoding
-// A8.6.8 ADD (SP plus immediate)
-//
-// tADDrSPi: tRt ARM::SP imm8
-static bool DisassembleThumb1AddSPi(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(Opcode == ARM::tADDrSPi && "Unexpected opcode");
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps >= 3 &&
- OpInfo[0].RegClass == ARM::tGPRRegClassID &&
- OpInfo[1].RegClass == ARM::GPRRegClassID &&
- (OpInfo[2].RegClass < 0 &&
- !OpInfo[2].isPredicate() &&
- !OpInfo[2].isOptionalDef())
- && "Invalid arguments");
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRt(insn))));
- MI.addOperand(MCOperand::CreateReg(ARM::SP));
- MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn)));
- NumOpsAdded = 3;
- return true;
-}
-
-// tPUSH, tPOP: Pred-Imm Pred-CCR register_list
-//
-// where register_list = low registers + [lr] for PUSH or
-// low registers + [pc] for POP
-//
-// "low registers" is specified by Inst{7-0}
-// lr|pc is specified by Inst{8}
-static bool DisassembleThumb1PushPop(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert((Opcode == ARM::tPUSH || Opcode == ARM::tPOP) && "Unexpected opcode");
-
- unsigned &OpIdx = NumOpsAdded;
-
- // Handling the two predicate operands before the reglist.
- if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
- OpIdx += 2;
- else {
- DEBUG(errs() << "Expected predicate operands not found.\n");
- return false;
- }
-
- unsigned RegListBits = slice(insn, 8, 8) << (Opcode == ARM::tPUSH ? 14 : 15)
- | slice(insn, 7, 0);
-
- // Fill the variadic part of reglist.
- for (unsigned i = 0; i < 16; ++i) {
- if ((RegListBits >> i) & 1) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- i)));
- ++OpIdx;
- }
- }
-
- return true;
-}
-
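A worked example of the reglist expansion above, with slice() re-implemented locally and PUSH {r0, r2, lr} as the (assumed) sample encoding:

    #include <cassert>
    #include <cstdint>

    // Local re-implementation of slice(insn, msb, lsb) for illustration.
    static unsigned slice(uint32_t v, unsigned msb, unsigned lsb) {
      return (v >> lsb) & ((1u << (msb - lsb + 1)) - 1);
    }

    int main() {
      uint32_t insn = 0xB505; // PUSH {r0, r2, lr}: Inst{8} = 1 selects LR
      bool isPush = true;
      unsigned RegListBits = slice(insn, 8, 8) << (isPush ? 14 : 15)
                           | slice(insn, 7, 0);
      assert(RegListBits == 0x4005); // bit 0 (r0), bit 2 (r2), bit 14 (lr)
      return 0;
    }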
-// A6.2.5 Miscellaneous 16-bit instructions
-// Delegate to DisassembleThumb1PushPop() for tPUSH & tPOP.
-//
-// tADDspi, tSUBspi: ARM::SP ARM::SP(TIED_TO) imm7
-// t2IT: firstcond=Inst{7-4} mask=Inst{3-0}
-// tCBNZ, tCBZ: tRd imm6*2
-// tBKPT: imm8
-// tNOP, tSEV, tYIELD, tWFE, tWFI:
-// no operand (except predicate pair)
- // tSETENDBE, tSETENDLE:
-// no operand
-// Others: tRd tRn
-static bool DisassembleThumb1Misc(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- if (NumOps == 0)
- return true;
-
- if (Opcode == ARM::tPUSH || Opcode == ARM::tPOP)
- return DisassembleThumb1PushPop(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
- // Predicate operands are handled elsewhere.
- if (NumOps == 2 &&
- OpInfo[0].isPredicate() && OpInfo[1].isPredicate() &&
- OpInfo[0].RegClass < 0 && OpInfo[1].RegClass == ARM::CCRRegClassID) {
- return true;
- }
-
- if (Opcode == ARM::tADDspi || Opcode == ARM::tSUBspi) {
- // Special case handling for tADDspi and tSUBspi.
- // A8.6.8 ADD (SP plus immediate) & A8.6.215 SUB (SP minus immediate)
- MI.addOperand(MCOperand::CreateReg(ARM::SP));
- MI.addOperand(MCOperand::CreateReg(ARM::SP));
- MI.addOperand(MCOperand::CreateImm(getT1Imm7(insn)));
- NumOpsAdded = 3;
- return true;
- }
-
- if (Opcode == ARM::t2IT) {
- // Special case handling for If-Then.
- // A8.6.50 IT
- // Tag the (firstcond[0] bit << 4) along with mask.
-
- // firstcond
- MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 4)));
-
- // firstcond[0] and mask
- MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));
- NumOpsAdded = 2;
- return true;
- }
-
- if (Opcode == ARM::tBKPT) {
- MI.addOperand(MCOperand::CreateImm(getT1Imm8(insn))); // breakpoint value
- NumOpsAdded = 1;
- return true;
- }
-
- // CPS has a singleton $opt operand that contains the following information:
- // The first op would be 0b10 for enable and 0b11 for disable in regular ARM,
- // but in Thumb it is 0 for enable and 1 for disable. So map it to ARM's
- // default encoding. The second op gets the AIF flags from Inst{2-0}.
- if (Opcode == ARM::tCPS) {
- MI.addOperand(MCOperand::CreateImm(2 + slice(insn, 4, 4)));
- MI.addOperand(MCOperand::CreateImm(slice(insn, 2, 0)));
- NumOpsAdded = 2;
- return true;
- }
-
- assert(NumOps >= 2 && OpInfo[0].RegClass == ARM::tGPRRegClassID &&
- (OpInfo[1].RegClass < 0 || OpInfo[1].RegClass==ARM::tGPRRegClassID)
- && "Expect >=2 operands");
-
- // Add the destination operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRd(insn))));
-
- if (OpInfo[1].RegClass == ARM::tGPRRegClassID) {
- // Two register instructions.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- getT1tRn(insn))));
- } else {
- // CBNZ, CBZ
- assert((Opcode == ARM::tCBNZ || Opcode == ARM::tCBZ) &&"Unexpected opcode");
- MI.addOperand(MCOperand::CreateImm(getT1Imm6(insn) * 2));
- }
-
- NumOpsAdded = 2;
-
- return true;
-}
-
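The t2IT special case above deliberately overlaps its two immediates at bit 4, so that firstcond[0] rides along with the 4-bit mask. A small sketch, reusing a local slice() and IT NE (0xBF18) as the assumed sample encoding:

    #include <cassert>
    #include <cstdint>

    static unsigned slice(uint32_t v, unsigned msb, unsigned lsb) {
      return (v >> lsb) & ((1u << (msb - lsb + 1)) - 1);
    }

    int main() {
      uint32_t insn = 0xBF18;                 // IT NE: firstcond=0b0001 mask=0b1000
      unsigned firstcond = slice(insn, 7, 4); // 0x1
      unsigned mask      = slice(insn, 4, 0); // firstcond[0] tagged in bit 4
      assert(firstcond == 0x1 && mask == 0x18);
      return 0;
    }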
-// A8.6.53 LDM / LDMIA
-// A8.6.189 STM / STMIA
-//
-// tLDMIA_UPD/tSTMIA_UPD: tRt tRt AM4ModeImm Pred-Imm Pred-CCR register_list
-// tLDMIA: tRt AM4ModeImm Pred-Imm Pred-CCR register_list
-static bool DisassembleThumb1LdStMul(bool Ld, MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps,
- unsigned &NumOpsAdded, BO B) {
- assert((Opcode == ARM::tLDMIA || Opcode == ARM::tLDMIA_UPD ||
- Opcode == ARM::tSTMIA_UPD) && "Unexpected opcode");
-
- unsigned tRt = getT1tRt(insn);
- NumOpsAdded = 0;
-
- // WB register, if necessary.
- if (Opcode == ARM::tLDMIA_UPD || Opcode == ARM::tSTMIA_UPD) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- tRt)));
- ++NumOpsAdded;
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- tRt)));
- ++NumOpsAdded;
-
- // Handling the two predicate operands before the reglist.
- if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) {
- NumOpsAdded += 2;
- } else {
- DEBUG(errs() << "Expected predicate operands not found.\n");
- return false;
- }
-
- unsigned RegListBits = slice(insn, 7, 0);
- if (BitCount(RegListBits) < 1) {
- DEBUG(errs() << "if BitCount(registers) < 1 then UNPREDICTABLE\n");
- return false;
- }
-
- // Fill the variadic part of reglist.
- for (unsigned i = 0; i < 8; ++i)
- if ((RegListBits >> i) & 1) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::tGPRRegClassID,
- i)));
- ++NumOpsAdded;
- }
-
- return true;
-}
-
-static bool DisassembleThumb1LdMul(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
- return DisassembleThumb1LdStMul(true, MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
-}
-
-static bool DisassembleThumb1StMul(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
- return DisassembleThumb1LdStMul(false, MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
-}
-
-// A8.6.16 B Encoding T1
-// cond = Inst{11-8} & imm8 = Inst{7-0}
-// imm32 = SignExtend(imm8:'0', 32)
-//
-// tBcc: offset Pred-Imm Pred-CCR
-// tSVC: imm8 Pred-Imm Pred-CCR
-// tTRAP: 0 operand (early return)
-static bool DisassembleThumb1CondBr(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO) {
-
- if (Opcode == ARM::tTRAP)
- return true;
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps == 3 && OpInfo[0].RegClass < 0 &&
- OpInfo[1].isPredicate() && OpInfo[2].RegClass == ARM::CCRRegClassID
- && "Exactly 3 operands expected");
-
- unsigned Imm8 = getT1Imm8(insn);
- MI.addOperand(MCOperand::CreateImm(
- Opcode == ARM::tBcc ? SignExtend32<9>(Imm8 << 1)
- : (int)Imm8));
-
- // The predicate operands are handled by
- // ARMBasicMCBuilder::TryPredicateAndSBitModifier().
- // But note that for tBcc, if cond = '1110' then UNDEFINED.
- if (Opcode == ARM::tBcc && slice(insn, 11, 8) == 14) {
- DEBUG(errs() << "if cond = '1110' then UNDEFINED\n");
- return false;
- }
- NumOpsAdded = 1;
-
- return true;
-}
-
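SignExtend32<9>(Imm8 << 1) above widens the 9-bit, halfword-aligned branch offset to a signed 32-bit value; a sketch of that template, matching the shape of llvm::SignExtend32 in Support/MathExtras.h:

    #include <cassert>
    #include <cstdint>

    // Sketch of llvm::SignExtend32<B>: shift left up to the sign bit, then
    // arithmetic-shift back down.
    template <unsigned B> int32_t SignExtend32(uint32_t X) {
      return int32_t(X << (32 - B)) >> (32 - B);
    }

    int main() {
      unsigned Imm8 = 0xFE;                     // tBcc offset field
      assert(SignExtend32<9>(Imm8 << 1) == -4); // imm32 = SignExtend(imm8:'0', 32)
      return 0;
    }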
-// A8.6.16 B Encoding T2
-// imm11 = Inst{10-0}
-// imm32 = SignExtend(imm11:'0', 32)
-//
-// tB: offset
-static bool DisassembleThumb1Br(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps == 1 && OpInfo[0].RegClass < 0 && "1 imm operand expected");
-
- unsigned Imm11 = getT1Imm11(insn);
-
- MI.addOperand(MCOperand::CreateImm(SignExtend32<12>(Imm11 << 1)));
-
- NumOpsAdded = 1;
-
- return true;
-
-}
-
-// See A6.2 16-bit Thumb instruction encoding for instruction classes
-// corresponding to op.
-//
-// Table A6-1 16-bit Thumb instruction encoding (abridged)
-// op Instruction or instruction class
-// ------ --------------------------------------------------------------------
-// 00xxxx Shift (immediate), add, subtract, move, and compare on page A6-7
-// 010000 Data-processing on page A6-8
-// 010001 Special data instructions and branch and exchange on page A6-9
-// 01001x Load from Literal Pool, see LDR (literal) on page A8-122
-// 0101xx Load/store single data item on page A6-10
- // 011xxx Load/store single data item on page A6-10
- // 100xxx Load/store single data item on page A6-10
-// 10100x Generate PC-relative address, see ADR on page A8-32
-// 10101x Generate SP-relative address, see ADD (SP plus immediate) on
-// page A8-28
-// 1011xx Miscellaneous 16-bit instructions on page A6-11
-// 11000x Store multiple registers, see STM / STMIA / STMEA on page A8-374
- // 11001x Load multiple registers, see LDM / LDMIA / LDMFD on page A8-110
-// 1101xx Conditional branch, and Supervisor Call on page A6-13
-// 11100x Unconditional Branch, see B on page A8-44
-//
-static bool DisassembleThumb1(uint16_t op, MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- unsigned op1 = slice(op, 5, 4);
- unsigned op2 = slice(op, 3, 2);
- unsigned op3 = slice(op, 1, 0);
- unsigned opA = slice(op, 5, 2);
- switch (op1) {
- case 0:
- // A6.2.1 Shift (immediate), add, subtract, move, and compare
- return DisassembleThumb1General(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- case 1:
- switch (op2) {
- case 0:
- switch (op3) {
- case 0:
- // A6.2.2 Data-processing
- return DisassembleThumb1DP(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- case 1:
- // A6.2.3 Special data instructions and branch and exchange
- return DisassembleThumb1Special(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- default:
- // A8.6.59 LDR (literal)
- return DisassembleThumb1LdPC(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- }
- break;
- default:
- // A6.2.4 Load/store single data item
- return DisassembleThumb1LdSt(opA, MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- break;
- }
- break;
- case 2:
- switch (op2) {
- case 0:
- // A6.2.4 Load/store single data item
- return DisassembleThumb1LdSt(opA, MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- case 1:
- // A6.2.4 Load/store single data item
- return DisassembleThumb1LdStSP(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- case 2:
- if (op3 <= 1) {
- // A8.6.10 ADR
- return DisassembleThumb1AddPCi(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- } else {
- // A8.6.8 ADD (SP plus immediate)
- return DisassembleThumb1AddSPi(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- }
- default:
- // A6.2.5 Miscellaneous 16-bit instructions
- return DisassembleThumb1Misc(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- }
- break;
- case 3:
- switch (op2) {
- case 0:
- if (op3 <= 1) {
- // A8.6.189 STM / STMIA / STMEA
- return DisassembleThumb1StMul(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- } else {
- // A8.6.53 LDM / LDMIA / LDMFD
- return DisassembleThumb1LdMul(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- }
- case 1:
- // A6.2.6 Conditional branch, and Supervisor Call
- return DisassembleThumb1CondBr(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- case 2:
- // Unconditional Branch, see B on page A8-44
- return DisassembleThumb1Br(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- default:
- assert(0 && "Unreachable code");
- break;
- }
- break;
- default:
- assert(0 && "Unreachable code");
- break;
- }
-
- return false;
-}
-
-///////////////////////////////////////////////
-// //
-// Thumb2 instruction disassembly functions. //
-// //
-///////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////
-// //
-// Note: the register naming follows the ARM convention! //
-// //
-///////////////////////////////////////////////////////////
-
-static inline bool Thumb2SRSOpcode(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
- case ARM::t2SRSDBW: case ARM::t2SRSDB:
- case ARM::t2SRSIAW: case ARM::t2SRSIA:
- return true;
- }
-}
-
-static inline bool Thumb2RFEOpcode(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
- case ARM::t2RFEDBW: case ARM::t2RFEDB:
- case ARM::t2RFEIAW: case ARM::t2RFEIA:
- return true;
- }
-}
-
-// t2SRS[IA|DB]W/t2SRS[IA|DB]: mode_imm = Inst{4-0}
-static bool DisassembleThumb2SRS(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0)));
- NumOpsAdded = 1;
- return true;
-}
-
-// t2RFE[IA|DB]W/t2RFE[IA|DB]: Rn
-static bool DisassembleThumb2RFE(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
- unsigned Rn = decodeRn(insn);
- if (Rn == 15) {
- DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
- return false;
- }
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,ARM::GPRRegClassID,Rn)));
- NumOpsAdded = 1;
- return true;
-}
-
-static bool DisassembleThumb2LdStMul(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- if (Thumb2SRSOpcode(Opcode))
- return DisassembleThumb2SRS(MI, Opcode, insn, NumOps, NumOpsAdded);
-
- if (Thumb2RFEOpcode(Opcode))
- return DisassembleThumb2RFE(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
- assert((Opcode == ARM::t2LDMIA || Opcode == ARM::t2LDMIA_UPD ||
- Opcode == ARM::t2LDMDB || Opcode == ARM::t2LDMDB_UPD ||
- Opcode == ARM::t2STMIA || Opcode == ARM::t2STMIA_UPD ||
- Opcode == ARM::t2STMDB || Opcode == ARM::t2STMDB_UPD)
- && "Unexpected opcode");
- assert(NumOps >= 4 && "Thumb2 LdStMul expects NumOps >= 4");
-
- NumOpsAdded = 0;
-
- unsigned Base = getRegisterEnum(B, ARM::GPRRegClassID, decodeRn(insn));
-
- // Writeback to base.
- if (Opcode == ARM::t2LDMIA_UPD || Opcode == ARM::t2LDMDB_UPD ||
- Opcode == ARM::t2STMIA_UPD || Opcode == ARM::t2STMDB_UPD) {
- MI.addOperand(MCOperand::CreateReg(Base));
- ++NumOpsAdded;
- }
-
- MI.addOperand(MCOperand::CreateReg(Base));
- ++NumOpsAdded;
-
- // Handling the two predicate operands before the reglist.
- if (B->DoPredicateOperands(MI, Opcode, insn, NumOps)) {
- NumOpsAdded += 2;
- } else {
- DEBUG(errs() << "Expected predicate operands not found.\n");
- return false;
- }
-
- unsigned RegListBits = insn & ((1 << 16) - 1);
-
- // Fill the variadic part of reglist.
- for (unsigned i = 0; i < 16; ++i)
- if ((RegListBits >> i) & 1) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- i)));
- ++NumOpsAdded;
- }
-
- return true;
-}
-
-// t2LDREX: Rd Rn
-// t2LDREXD: Rd Rs Rn
-// t2LDREXB, t2LDREXH: Rd Rn
-// t2STREX: Rs Rd Rn
-// t2STREXD: Rm Rd Rs Rn
-// t2STREXB, t2STREXH: Rm Rd Rn
-static bool DisassembleThumb2LdStEx(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2
- && OpInfo[0].RegClass > 0
- && OpInfo[1].RegClass > 0
- && "Expect >=2 operands and first two as reg operands");
-
- bool isStore = (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH);
- bool isSW = (Opcode == ARM::t2LDREX || Opcode == ARM::t2STREX);
- bool isDW = (Opcode == ARM::t2LDREXD || Opcode == ARM::t2STREXD);
-
- unsigned Rt = decodeRd(insn);
- unsigned Rt2 = decodeRs(insn); // But note that this is Rd for t2STREX.
- unsigned Rd = decodeRm(insn);
- unsigned Rn = decodeRn(insn);
-
- // Some sanity checking first.
- if (isStore) {
- // if d == n || d == t then UNPREDICTABLE
- // if d == n || d == t || d == t2 then UNPREDICTABLE
- if (isDW) {
- if (Rd == Rn || Rd == Rt || Rd == Rt2) {
- DEBUG(errs() << "if d == n || d == t || d == t2 then UNPREDICTABLE\n");
- return false;
- }
- } else {
- if (isSW) {
- if (Rt2 == Rn || Rt2 == Rt) {
- DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n");
- return false;
- }
- } else {
- if (Rd == Rn || Rd == Rt) {
- DEBUG(errs() << "if d == n || d == t then UNPREDICTABLE\n");
- return false;
- }
- }
- }
- } else {
- // Load
- // A8.6.71 LDREXD
- // if t == t2 then UNPREDICTABLE
- if (isDW && Rt == Rt2) {
- DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
- return false;
- }
- }
-
- // Add the destination operand for store.
- if (isStore) {
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- isSW ? Rt2 : Rd)));
- ++OpIdx;
- }
-
- // Source operand for store and destination operand for load.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- Rt)));
- ++OpIdx;
-
- // Thumb2 doubleword complication: there is an extra source/destination operand.
- if (isDW) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass,
- Rt2)));
- ++OpIdx;
- }
-
- // Finally add the pointer operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- Rn)));
- ++OpIdx;
-
- return true;
-}
-
-// t2LDRDi8: Rd Rs Rn imm8s4 (offset mode)
-// t2LDRDpci: Rd Rs imm8s4 (Not decoded, prefer the generic t2LDRDi8 version)
-// t2STRDi8: Rd Rs Rn imm8s4 (offset mode)
-//
-// Ditto for t2LDRD_PRE, t2LDRD_POST, t2STRD_PRE, t2STRD_POST, which are for
-// disassembly only and do not have a tied_to writeback base register operand.
-static bool DisassembleThumb2LdStDual(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
- if (!OpInfo) return false;
-
- assert(NumOps >= 4
- && OpInfo[0].RegClass > 0
- && OpInfo[0].RegClass == OpInfo[1].RegClass
- && OpInfo[2].RegClass > 0
- && OpInfo[3].RegClass < 0
- && "Expect >= 4 operands and first 3 as reg operands");
-
- // Thumb allows specifying Rt and Rt2, unlike ARM (which has Rt2 == Rt+1).
- unsigned Rt = decodeRd(insn);
- unsigned Rt2 = decodeRs(insn);
- unsigned Rn = decodeRn(insn);
-
- // Some sanity checking first.
-
- // A8.6.67 LDRD (literal) has its W bit as (0).
- if (Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST) {
- if (Rn == 15 && slice(insn, 21, 21) != 0)
- return false;
- } else {
- // For Dual Store, PC cannot be used as the base register.
- if (Rn == 15) {
- DEBUG(errs() << "if n == 15 then UNPREDICTABLE\n");
- return false;
- }
- }
- if (Rt == Rt2) {
- DEBUG(errs() << "if t == t2 then UNPREDICTABLE\n");
- return false;
- }
- if (Opcode != ARM::t2LDRDi8 && Opcode != ARM::t2STRDi8) {
- if (Rn == Rt || Rn == Rt2) {
- DEBUG(errs() << "if wback && (n == t || n == t2) then UNPREDICTABLE\n");
- return false;
- }
- }
-
- // Add the <Rt> <Rt2> operands.
- unsigned RegClassPair = OpInfo[0].RegClass;
- unsigned RegClassBase = OpInfo[2].RegClass;
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
- decodeRd(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassPair,
- decodeRs(insn))));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RegClassBase,
- decodeRn(insn))));
-
- // Finally add (+/-)imm8*4, depending on the U bit.
- int Offset = getImm8(insn) * 4;
- if (getUBit(insn) == 0)
- Offset = -Offset;
- MI.addOperand(MCOperand::CreateImm(Offset));
- NumOpsAdded = 4;
-
- return true;
-}
-
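The final (+/-)imm8*4 step above keys off the U bit; a minimal sketch, assuming getImm8() returns Inst{7-0} and getUBit() returns Inst{23}, with LDRD r2, r3, [r0, #-12] as the sample encoding:

    #include <cassert>
    #include <cstdint>

    static unsigned getImm8(uint32_t insn) { return insn & 0xFF; }      // Inst{7-0}
    static unsigned getUBit(uint32_t insn) { return (insn >> 23) & 1; } // Inst{23}

    int main() {
      uint32_t insn = 0xE9502303; // LDRD r2, r3, [r0, #-12]: U = 0, imm8 = 3
      int Offset = getImm8(insn) * 4;
      if (getUBit(insn) == 0)
        Offset = -Offset;
      assert(Offset == -12);
      return 0;
    }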
-// t2TBB, t2TBH: Rn Rm Pred-Imm Pred-CCR
-static bool DisassembleThumb2TB(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- assert(NumOps >= 2 && "Expect >= 2 operands");
-
- // The generic version of TBB/TBH needs a base register.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- // Add the index register.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- NumOpsAdded = 2;
-
- return true;
-}
-
-static inline bool Thumb2ShiftOpcode(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
- case ARM::t2MOVCClsl: case ARM::t2MOVCClsr:
- case ARM::t2MOVCCasr: case ARM::t2MOVCCror:
- case ARM::t2LSLri: case ARM::t2LSRri:
- case ARM::t2ASRri: case ARM::t2RORri:
- return true;
- }
-}
-
-// A6.3.11 Data-processing (shifted register)
-//
-// Two register operands (Rn=0b1111 no 1st operand reg): Rs Rm
-// Two register operands (Rs=0b1111 no dst operand reg): Rn Rm
-// Three register operands: Rs Rn Rm
-// Three register operands: (Rn=0b1111 Conditional Move) Rs Ro(TIED_TO) Rm
-//
- // For constant shifts, t2_so_reg is a 2-operand unit corresponding to the
- // Thumb2 register-with-shift forms: (Rm, ConstantShiftSpecifier).
- // Constant shift specifier: Imm = (ShOp | ShAmt<<3).
-//
-// There are special instructions, like t2MOVsra_flag and t2MOVsrl_flag, which
-// only require two register operands: Rd, Rm in ARM Reference Manual terms, and
-// nothing else, because the shift amount is already specified.
- // A similar case holds for t2MOVrx, t2ADDrr, etc.
-static bool DisassembleThumb2DPSoReg(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- // Special case handling.
- if (Opcode == ARM::t2BR_JT) {
- assert(NumOps == 4
- && OpInfo[0].RegClass == ARM::GPRRegClassID
- && OpInfo[1].RegClass == ARM::GPRRegClassID
- && OpInfo[2].RegClass < 0
- && OpInfo[3].RegClass < 0
- && "Exactly 4 operands expected and first two as reg operands");
- // Only need to populate the src reg operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- MI.addOperand(MCOperand::CreateReg(0));
- MI.addOperand(MCOperand::CreateImm(0));
- MI.addOperand(MCOperand::CreateImm(0));
- NumOpsAdded = 4;
- return true;
- }
-
- OpIdx = 0;
-
- assert(NumOps >= 2
- && (OpInfo[0].RegClass == ARM::GPRRegClassID ||
- OpInfo[0].RegClass == ARM::rGPRRegClassID)
- && (OpInfo[1].RegClass == ARM::GPRRegClassID ||
- OpInfo[1].RegClass == ARM::rGPRRegClassID)
- && "Expect >= 2 operands and first two as reg operands");
-
- bool ThreeReg = (NumOps > 2 && (OpInfo[2].RegClass == ARM::GPRRegClassID ||
- OpInfo[2].RegClass == ARM::rGPRRegClassID));
- bool NoDstReg = (decodeRs(insn) == 0xF);
-
- // Build the register operands, followed by the constant shift specifier.
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, OpInfo[0].RegClass,
- NoDstReg ? decodeRn(insn) : decodeRs(insn))));
- ++OpIdx;
-
- if (ThreeReg) {
- int Idx;
- if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
- // Process tied_to operand constraint.
- MI.addOperand(MI.getOperand(Idx));
- ++OpIdx;
- } else if (!NoDstReg) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[1].RegClass,
- decodeRn(insn))));
- ++OpIdx;
- } else {
- DEBUG(errs() << "Thumb2 encoding error: d==15 for three-reg operands.\n");
- return false;
- }
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeRm(insn))));
- ++OpIdx;
-
- if (NumOps == OpIdx)
- return true;
-
- if (OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
- && !OpInfo[OpIdx].isOptionalDef()) {
-
- if (Thumb2ShiftOpcode(Opcode)) {
- unsigned Imm = getShiftAmtBits(insn);
- ARM_AM::ShiftOpc ShOp = getShiftOpcForBits(slice(insn, 5, 4));
- getImmShiftSE(ShOp, Imm);
- MI.addOperand(MCOperand::CreateImm(Imm));
- } else {
- // Build the constant shift specifier operand.
- unsigned bits2 = getShiftTypeBits(insn);
- unsigned imm5 = getShiftAmtBits(insn);
- ARM_AM::ShiftOpc ShOp = ARM_AM::no_shift;
- unsigned ShAmt = decodeImmShift(bits2, imm5, ShOp);
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(ShOp, ShAmt)));
- }
- ++OpIdx;
- }
-
- return true;
-}
-
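The constant shift specifier built above packs the shift kind into the low three bits and the amount above them, per the Imm = (ShOp | ShAmt<<3) convention in the header comment. A self-contained pack/unpack sketch (the enum ordering here is illustrative, not a claim about ARM_AM):

    #include <cassert>

    enum ShiftOpc { no_shift = 0, asr, lsl, lsr, ror, rrx };

    static unsigned getSORegOpc(ShiftOpc ShOp, unsigned ShAmt) {
      return ShOp | (ShAmt << 3); // amount in the high bits, kind in the low 3
    }
    static ShiftOpc getSORegShOp(unsigned Opc) { return ShiftOpc(Opc & 7); }
    static unsigned getSORegOffset(unsigned Opc) { return Opc >> 3; }

    int main() {
      unsigned Imm = getSORegOpc(lsl, 12);
      assert(getSORegShOp(Imm) == lsl && getSORegOffset(Imm) == 12);
      return 0;
    }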
-// A6.3.1 Data-processing (modified immediate)
-//
-// Two register operands: Rs Rn ModImm
- // One register operand (Rs=0b1111 no explicit dest reg): Rn ModImm
- // One register operand (Rn=0b1111 no explicit src reg): Rs ModImm -
-// {t2MOVi, t2MVNi}
-//
-// ModImm = ThumbExpandImm(i:imm3:imm8)
-static bool DisassembleThumb2DPModImm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- unsigned RdRegClassID = OpInfo[0].RegClass;
- assert(NumOps >= 2 && (RdRegClassID == ARM::GPRRegClassID ||
- RdRegClassID == ARM::rGPRRegClassID)
- && "Expect >= 2 operands and first one as reg operand");
-
- unsigned RnRegClassID = OpInfo[1].RegClass;
- bool TwoReg = (RnRegClassID == ARM::GPRRegClassID
- || RnRegClassID == ARM::rGPRRegClassID);
- bool NoDstReg = (decodeRs(insn) == 0xF);
-
- // Build the register operands, followed by the modified immediate.
-
- MI.addOperand(MCOperand::CreateReg(
- getRegisterEnum(B, RdRegClassID,
- NoDstReg ? decodeRn(insn) : decodeRs(insn))));
- ++OpIdx;
-
- if (TwoReg) {
- if (NoDstReg) {
- DEBUG(errs()<<"Thumb2 encoding error: d==15 for DPModImm 2-reg instr.\n");
- return false;
- }
- int Idx;
- if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
- // The reg operand is tied to the first reg operand.
- MI.addOperand(MI.getOperand(Idx));
- } else {
- // Add second reg operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
- decodeRn(insn))));
- }
- ++OpIdx;
- }
-
- // The modified immediate operand should come next.
- assert(OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 &&
- !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()
- && "Pure imm operand expected");
-
- // i:imm3:imm8
- // A6.3.2 Modified immediate constants in Thumb instructions
- unsigned imm12 = getIImm3Imm8(insn);
- MI.addOperand(MCOperand::CreateImm(ThumbExpandImm(imm12)));
- ++OpIdx;
-
- return true;
-}
-
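ThumbExpandImm above is the A6.3.2 expansion; the following standalone sketch is reconstructed from the architecture manual (an assumption about the helper's behavior, not a copy of it):

    #include <cassert>
    #include <cstdint>

    // A6.3.2: expand a 12-bit i:imm3:imm8 field to a 32-bit constant.
    static uint32_t ThumbExpandImm(unsigned imm12) {
      unsigned imm8 = imm12 & 0xFF;
      if (((imm12 >> 10) & 3) == 0) {
        switch ((imm12 >> 8) & 3) {
        case 0:  return imm8;                       // 00000000 00000000 00000000 abcdefgh
        case 1:  return (imm8 << 16) | imm8;        // 00000000 abcdefgh 00000000 abcdefgh
        case 2:  return (imm8 << 24) | (imm8 << 8); // abcdefgh 00000000 abcdefgh 00000000
        default: return imm8 * 0x01010101u;         // abcdefgh replicated in all 4 bytes
        }
      }
      // Rotated case: an 8-bit value 1bcdefgh rotated right by imm12<11:7>.
      uint32_t unrotated = 0x80 | (imm12 & 0x7F);
      unsigned rot = (imm12 >> 7) & 0x1F;           // >= 8 here, so no 0-shift UB
      return (unrotated >> rot) | (unrotated << (32 - rot));
    }

    int main() {
      assert(ThumbExpandImm(0x0AB) == 0xAB);
      assert(ThumbExpandImm(0x1AB) == 0x00AB00ABu);
      return 0;
    }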
-static inline bool Thumb2SaturateOpcode(unsigned Opcode) {
- switch (Opcode) {
- case ARM::t2SSAT: case ARM::t2SSAT16:
- case ARM::t2USAT: case ARM::t2USAT16:
- return true;
- default:
- return false;
- }
-}
-
-/// DisassembleThumb2Sat - Disassemble Thumb2 saturate instructions:
-/// o t2SSAT, t2USAT: Rs sat_pos Rn shamt
-/// o t2SSAT16, t2USAT16: Rs sat_pos Rn
-static bool DisassembleThumb2Sat(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned &NumOpsAdded, BO B) {
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- NumOpsAdded = MCID.getNumOperands() - 2; // ignore predicate operands
-
- // Disassemble the register def.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRs(insn))));
-
- unsigned Pos = slice(insn, 4, 0);
- if (Opcode == ARM::t2SSAT || Opcode == ARM::t2SSAT16)
- Pos += 1;
- MI.addOperand(MCOperand::CreateImm(Pos));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRn(insn))));
-
- if (NumOpsAdded == 4) {
- ARM_AM::ShiftOpc Opc = (slice(insn, 21, 21) != 0 ?
- ARM_AM::asr : ARM_AM::lsl);
- // Inst{14-12:7-6} encodes the imm5 shift amount.
- unsigned ShAmt = slice(insn, 14, 12) << 2 | slice(insn, 7, 6);
- if (ShAmt == 0) {
- if (Opc == ARM_AM::asr)
- ShAmt = 32;
- else
- Opc = ARM_AM::no_shift;
- }
- MI.addOperand(MCOperand::CreateImm(ARM_AM::getSORegOpc(Opc, ShAmt)));
- }
- return true;
-}
-
-// A6.3.3 Data-processing (plain binary immediate)
-//
-// o t2ADDri12, t2SUBri12: Rs Rn imm12
-// o t2LEApcrel (ADR): Rs imm12
-// o t2BFC (BFC): Rs Ro(TIED_TO) bf_inv_mask_imm
-// o t2BFI (BFI): Rs Ro(TIED_TO) Rn bf_inv_mask_imm
-// o t2MOVi16: Rs imm16
-// o t2MOVTi16: Rs imm16
-// o t2SBFX (SBFX): Rs Rn lsb width
-// o t2UBFX (UBFX): Rs Rn lsb width
-// o t2BFI (BFI): Rs Rn lsb width
-static bool DisassembleThumb2DPBinImm(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- unsigned RdRegClassID = OpInfo[0].RegClass;
- assert(NumOps >= 2 && (RdRegClassID == ARM::GPRRegClassID ||
- RdRegClassID == ARM::rGPRRegClassID)
- && "Expect >= 2 operands and first one as reg operand");
-
- unsigned RnRegClassID = OpInfo[1].RegClass;
- bool TwoReg = (RnRegClassID == ARM::GPRRegClassID
- || RnRegClassID == ARM::rGPRRegClassID);
-
- // Build the register operand(s), followed by the immediate(s).
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RdRegClassID,
- decodeRs(insn))));
- ++OpIdx;
-
- if (TwoReg) {
- assert(NumOps >= 3 && "Expect >= 3 operands");
- int Idx;
- if ((Idx = MCID.getOperandConstraint(OpIdx, MCOI::TIED_TO)) != -1) {
- // Process tied_to operand constraint.
- MI.addOperand(MI.getOperand(Idx));
- } else {
- // Add src reg operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
- decodeRn(insn))));
- }
- ++OpIdx;
- }
-
- if (Opcode == ARM::t2BFI) {
- // Add val reg operand.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, RnRegClassID,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
- && !OpInfo[OpIdx].isOptionalDef()
- && "Pure imm operand expected");
-
- // Pre-increment OpIdx.
- ++OpIdx;
-
- if (Opcode == ARM::t2ADDri12 || Opcode == ARM::t2SUBri12
- || Opcode == ARM::t2LEApcrel)
- MI.addOperand(MCOperand::CreateImm(getIImm3Imm8(insn)));
- else if (Opcode == ARM::t2MOVi16 || Opcode == ARM::t2MOVTi16) {
- if (!B->tryAddingSymbolicOperand(getImm16(insn), 4, MI))
- MI.addOperand(MCOperand::CreateImm(getImm16(insn)));
- } else if (Opcode == ARM::t2BFC || Opcode == ARM::t2BFI) {
- uint32_t mask = 0;
- if (getBitfieldInvMask(insn, mask))
- MI.addOperand(MCOperand::CreateImm(mask));
- else
- return false;
- } else {
- // Handle the case of: lsb width
- assert((Opcode == ARM::t2SBFX || Opcode == ARM::t2UBFX)
- && "Unexpected opcode");
- MI.addOperand(MCOperand::CreateImm(getLsb(insn)));
- MI.addOperand(MCOperand::CreateImm(getWidthMinus1(insn) + 1));
-
- ++OpIdx;
- }
-
- return true;
-}
-
-// A6.3.4 Table A6-15 Miscellaneous control instructions
-// A8.6.41 DMB
-// A8.6.42 DSB
-// A8.6.49 ISB
-static inline bool t2MiscCtrlInstr(uint32_t insn) {
- if (slice(insn, 31, 20) == 0xf3b && slice(insn, 15, 14) == 2 &&
- slice(insn, 12, 12) == 0)
- return true;
-
- return false;
-}
-
-// A6.3.4 Branches and miscellaneous control
-//
-// A8.6.16 B
-// Branches: t2B, t2Bcc -> imm operand
-//
-// Branches: t2TPsoft -> no operand
-//
-// A8.6.23 BL, BLX (immediate)
-// Branches (defined in ARMInstrThumb.td): tBLr9, tBLXi_r9 -> imm operand
-//
-// A8.6.26
-// t2BXJ -> Rn
-//
-// Miscellaneous control:
-// -> no operand (except pred-imm pred-ccr for CLREX, memory barrier variants)
-//
-// Hint: t2NOP, t2YIELD, t2WFE, t2WFI, t2SEV
-// -> no operand (except pred-imm pred-ccr)
-//
-// t2DBG -> imm4 = Inst{3-0}
-//
-// t2MRS/t2MRSsys -> Rs
-// t2MSR/t2MSRsys -> Rn mask=Inst{11-8}
-// t2SMC -> imm4 = Inst{19-16}
-static bool DisassembleThumb2BrMiscCtrl(MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- if (NumOps == 0)
- return true;
-
- if (Opcode == ARM::t2DMB || Opcode == ARM::t2DSB) {
- // Inst{3-0} encodes the memory barrier option for the variants.
- unsigned opt = slice(insn, 3, 0);
- switch (opt) {
- case ARM_MB::SY: case ARM_MB::ST:
- case ARM_MB::ISH: case ARM_MB::ISHST:
- case ARM_MB::NSH: case ARM_MB::NSHST:
- case ARM_MB::OSH: case ARM_MB::OSHST:
- MI.addOperand(MCOperand::CreateImm(opt));
- NumOpsAdded = 1;
- return true;
- default:
- return false;
- }
- }
-
- if (t2MiscCtrlInstr(insn))
- return true;
-
- switch (Opcode) {
- case ARM::t2CLREX:
- case ARM::t2NOP:
- case ARM::t2YIELD:
- case ARM::t2WFE:
- case ARM::t2WFI:
- case ARM::t2SEV:
- return true;
- default:
- break;
- }
-
- // FIXME: To enable correct asm parsing and disasm of CPS we need 3 different
- // opcodes which match the same real instruction. This is needed since there's
- // no current handling of optional arguments. Fix here when a better handling
- // of optional arguments is implemented.
- if (Opcode == ARM::t2CPS3p) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod
- MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5))); // iflags
- MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
- NumOpsAdded = 3;
- return true;
- }
- if (Opcode == ARM::t2CPS2p) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 10, 9))); // imod
- MI.addOperand(MCOperand::CreateImm(slice(insn, 7, 5))); // iflags
- NumOpsAdded = 2;
- return true;
- }
- if (Opcode == ARM::t2CPS1p) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 4, 0))); // mode
- NumOpsAdded = 1;
- return true;
- }
-
- // DBG has its option specified in Inst{3-0}.
- if (Opcode == ARM::t2DBG) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 3, 0)));
- NumOpsAdded = 1;
- return true;
- }
-
- // MRS and MRSsys take one GPR reg Rs.
- if (Opcode == ARM::t2MRS || Opcode == ARM::t2MRSsys) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRs(insn))));
- NumOpsAdded = 1;
- return true;
- }
- // BXJ takes one GPR reg Rn.
- if (Opcode == ARM::t2BXJ) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- NumOpsAdded = 1;
- return true;
- }
- // MSR takes a mask, followed by one GPR reg Rn. The mask contains the R Bit in
- // bit 4, and the special register fields in bits 3-0.
- if (Opcode == ARM::t2MSR) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 20, 20) << 4 /* R Bit */ |
- slice(insn, 11, 8) /* Special Reg */));
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- NumOpsAdded = 2;
- return true;
- }
- // SMC takes imm4.
- if (Opcode == ARM::t2SMC) {
- MI.addOperand(MCOperand::CreateImm(slice(insn, 19, 16)));
- NumOpsAdded = 1;
- return true;
- }
-
- // Some instructions have predicate operands before the immediate.
- if (Opcode == ARM::tBLXi_r9 || Opcode == ARM::tBLr9) {
- // Handling the two predicate operands before the imm operand.
- if (B->DoPredicateOperands(MI, Opcode, insn, NumOps))
- NumOpsAdded += 2;
- else {
- DEBUG(errs() << "Expected predicate operands not found.\n");
- return false;
- }
- }
-
- // Add the imm operand.
- int Offset = 0;
-
- switch (Opcode) {
- default:
- assert(0 && "Unexpected opcode");
- return false;
- case ARM::t2B:
- Offset = decodeImm32_B_EncodingT4(insn);
- break;
- case ARM::t2Bcc:
- Offset = decodeImm32_B_EncodingT3(insn);
- break;
- case ARM::tBLr9:
- Offset = decodeImm32_BL(insn);
- break;
- case ARM::tBLXi_r9:
- Offset = decodeImm32_BLX(insn);
- break;
- }
-
- if (!B->tryAddingSymbolicOperand(Offset + B->getBuilderAddress() + 4, 4, MI))
- MI.addOperand(MCOperand::CreateImm(Offset));
-
- // This is an increment as some predicate operands may have been added first.
- NumOpsAdded += 1;
-
- return true;
-}
-
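The symbolic-operand call above biases the decoded offset by the instruction address plus 4, since a Thumb branch is relative to PC, which reads as the current instruction's address + 4. A tiny sketch with a hypothetical builder address:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t BuilderAddress = 0x8000; // hypothetical address of the branch
      int Offset = -16;                 // e.g., from decodeImm32_B_EncodingT4
      uint64_t Target = Offset + BuilderAddress + 4;
      assert(Target == 0x7FF4);
      return 0;
    }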
-static inline bool Thumb2PreloadOpcode(unsigned Opcode) {
- switch (Opcode) {
- default:
- return false;
- case ARM::t2PLDi12: case ARM::t2PLDi8:
- case ARM::t2PLDs:
- case ARM::t2PLDWi12: case ARM::t2PLDWi8:
- case ARM::t2PLDWs:
- case ARM::t2PLIi12: case ARM::t2PLIi8:
- case ARM::t2PLIs:
- return true;
- }
-}
-
-static bool DisassembleThumb2PreLoad(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- // Preload Data/Instruction requires either 2 or 3 operands.
- // t2PLDi12, t2PLDi8, t2PLDpci: Rn [+/-]imm12/imm8
- // t2PLDr: Rn Rm
- // t2PLDs: Rn Rm imm2=Inst{5-4}
- // Same pattern applies for t2PLDW* and t2PLI*.
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2 &&
- OpInfo[0].RegClass == ARM::GPRRegClassID &&
- "Expect >= 2 operands and first one as reg operand");
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRn(insn))));
- ++OpIdx;
-
- if (OpInfo[OpIdx].RegClass == ARM::rGPRRegClassID) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::GPRRegClassID,
- decodeRm(insn))));
- } else {
- assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
- && !OpInfo[OpIdx].isOptionalDef()
- && "Pure imm operand expected");
- int Offset = 0;
- if (Opcode == ARM::t2PLDi8 || Opcode == ARM::t2PLDWi8 ||
- Opcode == ARM::t2PLIi8) {
- // A8.6.117 Encoding T2: add = FALSE
- unsigned Imm8 = getImm8(insn);
- Offset = -1 * Imm8;
- } else {
- // The i12 forms. See, for example, A8.6.117 Encoding T1.
- // Note that currently t2PLDi12 also handles the previously named t2PLDpci
- // opcode; that's why we use decodeImm12(insn), which returns +/- imm12.
- Offset = decodeImm12(insn);
- }
- MI.addOperand(MCOperand::CreateImm(Offset));
- }
- ++OpIdx;
-
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0 &&
- !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- // Fills in the shift amount for t2PLDs, t2PLDWs, t2PLIs.
- MI.addOperand(MCOperand::CreateImm(slice(insn, 5, 4)));
- ++OpIdx;
- }
-
- return true;
-}
-
-static bool BadRegsThumb2LdSt(unsigned Opcode, uint32_t insn, bool Load,
- unsigned R0, unsigned R1, unsigned R2, bool UseRm, bool WB) {
-
- // Inst{22-21} encodes the data item transferred for load/store.
- // For single word, it is encoded as ob10.
- bool Word = (slice(insn, 22, 21) == 2);
- bool Half = (slice(insn, 22, 21) == 1);
- bool Byte = (slice(insn, 22, 21) == 0);
-
- if (UseRm && BadReg(R2)) {
- DEBUG(errs() << "if BadReg(m) then UNPREDICTABLE\n");
- return true;
- }
-
- if (Load) {
- if (!Word && R0 == 13) {
- DEBUG(errs() << "if t == 13 then UNPREDICTABLE\n");
- return true;
- }
- if (Byte) {
- if (WB && R0 == 15 && slice(insn, 10, 8) == 3) {
- // A8.6.78 LDRSB (immediate) Encoding T2 (errata markup 8.0)
- DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n");
- return true;
- }
- }
- // A6.3.8 Load halfword, memory hints
- if (Half) {
- if (WB) {
- if (R0 == R1) {
- // A8.6.82 LDRSH (immediate) Encoding T2
- DEBUG(errs() << "if WB && n == t then UNPREDICTABLE\n");
- return true;
- }
- if (R0 == 15 && slice(insn, 10, 8) == 3) {
- // A8.6.82 LDRSH (immediate) Encoding T2 (errata markup 8.0)
- DEBUG(errs() << "if t == 15 && PUW == '011' then UNPREDICTABLE\n");
- return true;
- }
- } else {
- if (Opcode == ARM::t2LDRHi8 || Opcode == ARM::t2LDRSHi8) {
- if (R0 == 15 && slice(insn, 10, 8) == 4) {
- // A8.6.82 LDRSH (immediate) Encoding T2
- DEBUG(errs() << "if Rt == '1111' and PUW == '100' then SEE"
- << " \"Unallocated memory hints\"\n");
- return true;
- }
- } else {
- if (R0 == 15) {
- // A8.6.82 LDRSH (immediate) Encoding T1
- DEBUG(errs() << "if Rt == '1111' then SEE"
- << " \"Unallocated memory hints\"\n");
- return true;
- }
- }
- }
- }
- } else {
- if (WB && R0 == R1) {
- DEBUG(errs() << "if wback && n == t then UNPREDICTABLE\n");
- return true;
- }
- if ((WB && R0 == 15) || (!WB && R1 == 15)) {
- DEBUG(errs() << "if Rn == '1111' then UNDEFINED\n");
- return true;
- }
- if (Word) {
- if ((WB && R1 == 15) || (!WB && R0 == 15)) {
- DEBUG(errs() << "if t == 15 then UNPREDICTABLE\n");
- return true;
- }
- } else {
- if ((WB && BadReg(R1)) || (!WB && BadReg(R0))) {
- DEBUG(errs() << "if BadReg(t) then UNPREDICTABLE\n");
- return true;
- }
- }
- }
- return false;
-}
-
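BadReg() used above is the ARM ARM pseudocode predicate that rejects SP and PC; a one-line sketch under that assumption (the real helper is defined elsewhere in this disassembler):

    #include <cassert>

    // Sketch of BadReg(n): r13 (SP) and r15 (PC) are disallowed wherever
    // the ARM ARM pseudocode says BadReg(n).
    static bool BadReg(unsigned R) { return R == 13 || R == 15; }

    int main() {
      assert(BadReg(13) && BadReg(15) && !BadReg(2));
      return 0;
    }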
-// A6.3.10 Store single data item
-// A6.3.9 Load byte, memory hints
-// A6.3.8 Load halfword, memory hints
-// A6.3.7 Load word
-//
-// For example,
-//
-// t2LDRi12: Rd Rn (+)imm12
-// t2LDRi8: Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2LDRs: Rd Rn Rm ConstantShiftSpecifier (see also
-// DisassembleThumb2DPSoReg)
-// t2LDR_POST: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2LDR_PRE: Rd Rn Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-//
-// t2STRi12: Rd Rn (+)imm12
-// t2STRi8: Rd Rn (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2STRs: Rd Rn Rm ConstantShiftSpecifier (see also
-// DisassembleThumb2DPSoReg)
-// t2STR_POST: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-// t2STR_PRE: Rn Rd Rn(TIED_TO) (+/-)imm8 (+ if Inst{9} == 0b1)
-//
-// Note that for indexed modes, the Rn(TIED_TO) operand needs to be populated
-// correctly, as LLVM AsmPrinter depends on it. For indexed stores, the first
-// operand is Rn; for all the other instructions, Rd is the first operand.
-//
-// Delegates to DisassembleThumb2PreLoad() for preload data/instruction.
-// Delegates to DisassembleThumb2Ldpci() for load * literal operations.
-static bool DisassembleThumb2LdSt(bool Load, MCInst &MI, unsigned Opcode,
- uint32_t insn, unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- unsigned Rn = decodeRn(insn);
-
- if (Thumb2PreloadOpcode(Opcode))
- return DisassembleThumb2PreLoad(MI, Opcode, insn, NumOps, NumOpsAdded, B);
-
- // See, for example, A6.3.7 Load word: Table A6-18 Load word.
- if (Load && Rn == 15)
- return DisassembleThumb2Ldpci(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 3 &&
- OpInfo[0].RegClass > 0 &&
- OpInfo[1].RegClass > 0 &&
- "Expect >= 3 operands and first two as reg operands");
-
- bool ThreeReg = (OpInfo[2].RegClass > 0);
- bool TIED_TO = ThreeReg && MCID.getOperandConstraint(2, MCOI::TIED_TO) != -1;
- bool Imm12 = !ThreeReg && slice(insn, 23, 23) == 1; // ARMInstrThumb2.td
-
- // Build the register operands, followed by the immediate.
- unsigned R0 = 0, R1 = 0, R2 = 0;
- unsigned Rd = decodeRd(insn);
- int Imm = 0;
-
- if (!Load && TIED_TO) {
- R0 = Rn;
- R1 = Rd;
- } else {
- R0 = Rd;
- R1 = Rn;
- }
- if (ThreeReg) {
- if (TIED_TO) {
- R2 = Rn;
- Imm = decodeImm8(insn);
- } else {
- R2 = decodeRm(insn);
- // See, for example, A8.6.64 LDRB (register).
- // And ARMAsmPrinter::printT2AddrModeSoRegOperand().
- // LSL is the default shift opc, and LLVM does not expect it to be encoded
- // as part of the immediate operand.
- // Imm = ARM_AM::getSORegOpc(ARM_AM::lsl, slice(insn, 5, 4));
- Imm = slice(insn, 5, 4);
- }
- } else {
- if (Imm12)
- Imm = getImm12(insn);
- else
- Imm = decodeImm8(insn);
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- R0)));
- ++OpIdx;
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- R1)));
- ++OpIdx;
-
- if (ThreeReg) {
- // This could be an offset register or a TIED_TO register.
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass,
- R2)));
- ++OpIdx;
- }
-
- if (BadRegsThumb2LdSt(Opcode, insn, Load, R0, R1, R2, ThreeReg && !TIED_TO,
- TIED_TO))
- return false;
-
- assert(OpInfo[OpIdx].RegClass < 0 && !OpInfo[OpIdx].isPredicate()
- && !OpInfo[OpIdx].isOptionalDef()
- && "Pure imm operand expected");
-
- MI.addOperand(MCOperand::CreateImm(Imm));
- ++OpIdx;
-
- return true;
-}
-
-// A6.3.12 Data-processing (register)
-//
-// Two register operands [rotate]: Rs Rm [rotation(= (rotate:'000'))]
-// Three register operands only: Rs Rn Rm
-// Three register operands [rotate]: Rs Rn Rm [rotation(= (rotate:'000'))]
-//
-// Parallel addition and subtraction 32-bit Thumb instructions: Rs Rn Rm
-//
-// Miscellaneous operations: Rs [Rn] Rm
-static bool DisassembleThumb2DPReg(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCInstrDesc &MCID = ARMInsts[Opcode];
- const MCOperandInfo *OpInfo = MCID.OpInfo;
- unsigned &OpIdx = NumOpsAdded;
-
- OpIdx = 0;
-
- assert(NumOps >= 2 &&
- OpInfo[0].RegClass > 0 &&
- OpInfo[1].RegClass > 0 &&
- "Expect >= 2 operands and first two as reg operands");
-
- // Build the register operands, followed by the optional rotation amount.
-
- bool ThreeReg = NumOps > 2 && OpInfo[2].RegClass > 0;
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeRs(insn))));
- ++OpIdx;
-
- if (ThreeReg) {
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B,OpInfo[OpIdx].RegClass,
- decodeRn(insn))));
- ++OpIdx;
- }
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, OpInfo[OpIdx].RegClass,
- decodeRm(insn))));
- ++OpIdx;
-
- if (OpIdx < NumOps && OpInfo[OpIdx].RegClass < 0
- && !OpInfo[OpIdx].isPredicate() && !OpInfo[OpIdx].isOptionalDef()) {
- // Add the rotation amount immediate.
- MI.addOperand(MCOperand::CreateImm(decodeRotate(insn)));
- ++OpIdx;
- }
-
- return true;
-}
-
-// A6.3.16 Multiply, multiply accumulate, and absolute difference
-//
-// t2MLA, t2MLS, t2SMMLA, t2SMMLS: Rs Rn Rm Ra=Inst{15-12}
-// t2MUL, t2SMMUL: Rs Rn Rm
-// t2SMLA[BB|BT|TB|TT|WB|WT]: Rs Rn Rm Ra=Inst{15-12}
-// t2SMUL[BB|BT|TB|TT|WB|WT]: Rs Rn Rm
-//
-// Dual halfword multiply: t2SMUAD[X], t2SMUSD[X], t2SMLAD[X], t2SMLSD[X]:
-// Rs Rn Rm Ra=Inst{15-12}
-//
-// Unsigned Sum of Absolute Differences [and Accumulate]
-// Rs Rn Rm [Ra=Inst{15-12}]
-static bool DisassembleThumb2Mul(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
- assert(NumOps >= 3 &&
- OpInfo[0].RegClass == ARM::rGPRRegClassID &&
- OpInfo[1].RegClass == ARM::rGPRRegClassID &&
- OpInfo[2].RegClass == ARM::rGPRRegClassID &&
- "Expect >= 3 operands and first three as reg operands");
-
- // Build the register operands.
-
- bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::rGPRRegClassID;
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRs(insn))));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRn(insn))));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRm(insn))));
-
- if (FourReg)
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRd(insn))));
-
- NumOpsAdded = FourReg ? 4 : 3;
-
- return true;
-}
-
-// A6.3.17 Long multiply, long multiply accumulate, and divide
-//
-// t2SMULL, t2UMULL, t2SMLAL, t2UMLAL, t2UMAAL: RdLo RdHi Rn Rm
-// where RdLo = Inst{15-12} and RdHi = Inst{11-8}
-//
-// Halfword multiple accumulate long: t2SMLAL<x><y>: RdLo RdHi Rn Rm
-// where RdLo = Inst{15-12} and RdHi = Inst{11-8}
-//
-// Dual halfword multiple: t2SMLALD[X], t2SMLSLD[X]: RdLo RdHi Rn Rm
-// where RdLo = Inst{15-12} and RdHi = Inst{11-8}
-//
-// Signed/Unsigned divide: t2SDIV, t2UDIV: Rs Rn Rm
-static bool DisassembleThumb2LongMul(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO B) {
-
- const MCOperandInfo *OpInfo = ARMInsts[Opcode].OpInfo;
-
- assert(NumOps >= 3 &&
- OpInfo[0].RegClass == ARM::rGPRRegClassID &&
- OpInfo[1].RegClass == ARM::rGPRRegClassID &&
- OpInfo[2].RegClass == ARM::rGPRRegClassID &&
- "Expect >= 3 operands and first three as reg operands");
-
- bool FourReg = NumOps > 3 && OpInfo[3].RegClass == ARM::rGPRRegClassID;
-
- // Build the register operands.
-
- if (FourReg)
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRd(insn))));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRs(insn))));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRn(insn))));
-
- MI.addOperand(MCOperand::CreateReg(getRegisterEnum(B, ARM::rGPRRegClassID,
- decodeRm(insn))));
-
- if (FourReg)
- NumOpsAdded = 4;
- else
- NumOpsAdded = 3;
-
- return true;
-}
-
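The decodeR* helpers used throughout follow the ARM field positions spelled out in the comments above (RdLo = Inst{15-12}, RdHi = Inst{11-8}, plus Rn = Inst{19-16} and Rm = Inst{3-0}); a local sketch under those assumptions, using a UMULL-style bit pattern:

    #include <cassert>
    #include <cstdint>

    // Assumed field positions, per the RdLo/RdHi comments above.
    static unsigned decodeRn(uint32_t insn) { return (insn >> 16) & 0xF; }
    static unsigned decodeRd(uint32_t insn) { return (insn >> 12) & 0xF; }
    static unsigned decodeRs(uint32_t insn) { return (insn >>  8) & 0xF; }
    static unsigned decodeRm(uint32_t insn) { return insn & 0xF; }

    int main() {
      uint32_t insn = 0xFBA12302; // UMULL r2, r3, r1, r2 (t2UMULL pattern)
      assert(decodeRd(insn) == 2 && decodeRs(insn) == 3 &&  // RdLo, RdHi
             decodeRn(insn) == 1 && decodeRm(insn) == 2);
      return 0;
    }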
-// See A6.3 32-bit Thumb instruction encoding for instruction classes
-// corresponding to (op1, op2, op).
-//
-// Table A6-9 32-bit Thumb instruction encoding
-// op1 op2 op Instruction class, see
-// --- ------- -- -----------------------------------------------------------
-// 01 00xx0xx - Load/store multiple on page A6-23
-// 00xx1xx - Load/store dual, load/store exclusive, table branch on
-// page A6-24
-// 01xxxxx - Data-processing (shifted register) on page A6-31
-// 1xxxxxx - Coprocessor instructions on page A6-40
-// 10 x0xxxxx 0 Data-processing (modified immediate) on page A6-15
-// x1xxxxx 0 Data-processing (plain binary immediate) on page A6-19
-// - 1 Branches and miscellaneous control on page A6-20
-// 11 000xxx0 - Store single data item on page A6-30
-// 001xxx0 - Advanced SIMD element or structure load/store instructions
-// on page A7-27
-// 00xx001 - Load byte, memory hints on page A6-28
-// 00xx011 - Load halfword, memory hints on page A6-26
-// 00xx101 - Load word on page A6-25
-// 00xx111 - UNDEFINED
-// 010xxxx - Data-processing (register) on page A6-33
-// 0110xxx - Multiply, multiply accumulate, and absolute difference on
-// page A6-38
-// 0111xxx - Long multiply, long multiply accumulate, and divide on
-// page A6-39
-// 1xxxxxx - Coprocessor instructions on page A6-40
-//
-static bool DisassembleThumb2(uint16_t op1, uint16_t op2, uint16_t op,
- MCInst &MI, unsigned Opcode, uint32_t insn, unsigned short NumOps,
- unsigned &NumOpsAdded, BO B) {
-
- switch (op1) {
- case 1:
- if (slice(op2, 6, 5) == 0) {
- if (slice(op2, 2, 2) == 0) {
- // Load/store multiple.
- return DisassembleThumb2LdStMul(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- }
-
- // Load/store dual, load/store exclusive, table branch, otherwise.
- assert(slice(op2, 2, 2) == 1 && "Thumb2 encoding error!");
- if ((ARM::t2LDREX <= Opcode && Opcode <= ARM::t2LDREXH) ||
- (ARM::t2STREX <= Opcode && Opcode <= ARM::t2STREXH)) {
- // Load/store exclusive.
- return DisassembleThumb2LdStEx(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- }
- if (Opcode == ARM::t2LDRDi8 ||
- Opcode == ARM::t2LDRD_PRE || Opcode == ARM::t2LDRD_POST ||
- Opcode == ARM::t2STRDi8 ||
- Opcode == ARM::t2STRD_PRE || Opcode == ARM::t2STRD_POST) {
- // Load/store dual.
- return DisassembleThumb2LdStDual(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- }
- if (Opcode == ARM::t2TBB || Opcode == ARM::t2TBH) {
- // Table branch.
- return DisassembleThumb2TB(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- }
- } else if (slice(op2, 6, 5) == 1) {
- // Data-processing (shifted register).
- return DisassembleThumb2DPSoReg(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- }
-
- // FIXME: A6.3.18 Coprocessor instructions
- // But see ThumbDisassembler::getInstruction().
-
- break;
- case 2:
- if (op == 0) {
- if (slice(op2, 5, 5) == 0)
- // Data-processing (modified immediate)
- return DisassembleThumb2DPModImm(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- if (Thumb2SaturateOpcode(Opcode))
- return DisassembleThumb2Sat(MI, Opcode, insn, NumOpsAdded, B);
-
- // Data-processing (plain binary immediate)
- return DisassembleThumb2DPBinImm(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- }
- // Branches and miscellaneous control on page A6-20.
- return DisassembleThumb2BrMiscCtrl(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- case 3:
- switch (slice(op2, 6, 5)) {
- case 0:
- // Load/store instructions...
- if (slice(op2, 0, 0) == 0) {
- if (slice(op2, 4, 4) == 0) {
- // Store single data item on page A6-30
- return DisassembleThumb2LdSt(false, MI,Opcode,insn,NumOps,NumOpsAdded,
- B);
- } else {
- // FIXME: Advanced SIMD element or structure load/store instructions.
- // But see ThumbDisassembler::getInstruction().
- ;
- }
- } else {
- // Table A6-9 32-bit Thumb instruction encoding: Load byte|halfword|word
- return DisassembleThumb2LdSt(true, MI, Opcode, insn, NumOps,
- NumOpsAdded, B);
- }
- break;
- case 1:
- if (slice(op2, 4, 4) == 0) {
- // A6.3.12 Data-processing (register)
- return DisassembleThumb2DPReg(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- } else if (slice(op2, 3, 3) == 0) {
- // A6.3.16 Multiply, multiply accumulate, and absolute difference
- return DisassembleThumb2Mul(MI, Opcode, insn, NumOps, NumOpsAdded, B);
- } else {
- // A6.3.17 Long multiply, long multiply accumulate, and divide
- return DisassembleThumb2LongMul(MI, Opcode, insn, NumOps, NumOpsAdded,
- B);
- }
- break;
- default:
- // FIXME: A6.3.18 Coprocessor instructions
- // But see ThumbDisassembler::getInstruction().
- ;
- break;
- }
-
- break;
- default:
- assert(0 && "Thumb2 encoding error!");
- break;
- }
-
- return false;
-}
-
-static bool DisassembleThumbFrm(MCInst &MI, unsigned Opcode, uint32_t insn,
- unsigned short NumOps, unsigned &NumOpsAdded, BO Builder) {
-
- uint16_t HalfWord = slice(insn, 31, 16);
-
- if (HalfWord == 0) {
- // A6.2 16-bit Thumb instruction encoding
- // op = bits[15:10]
- uint16_t op = slice(insn, 15, 10);
- return DisassembleThumb1(op, MI, Opcode, insn, NumOps, NumOpsAdded,
- Builder);
- }
-
- unsigned bits15_11 = slice(HalfWord, 15, 11);
-
- // A6.1 Thumb instruction set encoding
- if (!(bits15_11 == 0x1D || bits15_11 == 0x1E || bits15_11 == 0x1F)) {
- assert(0 && "Bits[15:11] of first halfword of Thumb2 instruction out of range");
- return false;
- }
-
- // A6.3 32-bit Thumb instruction encoding
-
- uint16_t op1 = slice(HalfWord, 12, 11);
- uint16_t op2 = slice(HalfWord, 10, 4);
- uint16_t op = slice(insn, 15, 15);
-
- return DisassembleThumb2(op1, op2, op, MI, Opcode, insn, NumOps, NumOpsAdded,
- Builder);
-}
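For reference, the bits15_11 test above is the whole of the A6.1 length rule: a first halfword of 0b11101, 0b11110, or 0b11111 starts a 32-bit encoding, anything else is a 16-bit instruction. A standalone sketch:

    #include <cassert>
    #include <cstdint>

    // A6.1: first-halfword values 0x1D/0x1E/0x1F in bits[15:11] mark a
    // 32-bit Thumb2 encoding; everything else is a 16-bit Thumb1 encoding.
    static bool isThumb2(uint16_t HalfWord) {
      unsigned bits15_11 = HalfWord >> 11;
      return bits15_11 == 0x1D || bits15_11 == 0x1E || bits15_11 == 0x1F;
    }

    int main() {
      assert(!isThumb2(0x4C10)); // LDR (literal), 16-bit
      assert(isThumb2(0xF000));  // first halfword of a BL/B.W-style encoding
      return 0;
    }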
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 78d3e47..ccdac3e 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -12,9 +12,9 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "asm-printer"
-#include "ARMBaseInfo.h"
#include "ARMInstPrinter.h"
-#include "ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
@@ -25,6 +25,23 @@ using namespace llvm;
#define GET_INSTRUCTION_NAME
#include "ARMGenAsmWriter.inc"
+/// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
+///
+/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
+static unsigned translateShiftImm(unsigned imm) {
+ if (imm == 0)
+ return 32;
+ return imm;
+}
+
+
+ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) :
+ MCInstPrinter(MAI) {
+ // Initialize the set of available features.
+ setAvailableFeatures(STI.getFeatureBits());
+}
+
StringRef ARMInstPrinter::getOpcodeName(unsigned Opcode) const {
return getInstructionName(Opcode);
}
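
The 0-means-32 convention above can be exercised in isolation. A freestanding sketch (illustrative only, not part of the patch) that mirrors translateShiftImm:

#include <cassert>

// The hardware encodes a shift amount of 32 as 0, so the printer maps an
// encoded 0 back to 32 before emitting "lsr #<n>" / "asr #<n>".
static unsigned translateShiftImm(unsigned imm) {
  return imm == 0 ? 32 : imm;
}

int main() {
  assert(translateShiftImm(0) == 32); // encoded 0 prints as #32
  assert(translateShiftImm(7) == 7);  // all other amounts are unchanged
  return 0;
}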
@@ -33,11 +50,12 @@ void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
OS << getRegisterName(RegNo);
}
-void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
unsigned Opcode = MI->getOpcode();
// Check for MOVs and print canonical forms, instead.
- if (Opcode == ARM::MOVs) {
+ if (Opcode == ARM::MOVsr) {
// FIXME: Thumb variants?
const MCOperand &Dst = MI->getOperand(0);
const MCOperand &MO1 = MI->getOperand(1);
@@ -51,20 +69,36 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
O << '\t' << getRegisterName(Dst.getReg())
<< ", " << getRegisterName(MO1.getReg());
- if (ARM_AM::getSORegShOp(MO3.getImm()) == ARM_AM::rrx)
- return;
+ O << ", " << getRegisterName(MO2.getReg());
+ assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+ printAnnotation(O, Annot);
+ return;
+ }
- O << ", ";
+ if (Opcode == ARM::MOVsi) {
+ // FIXME: Thumb variants?
+ const MCOperand &Dst = MI->getOperand(0);
+ const MCOperand &MO1 = MI->getOperand(1);
+ const MCOperand &MO2 = MI->getOperand(2);
+
+ O << '\t' << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()));
+ printSBitModifierOperand(MI, 5, O);
+ printPredicateOperand(MI, 3, O);
- if (MO2.getReg()) {
- O << getRegisterName(MO2.getReg());
- assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
- } else {
- O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+ O << '\t' << getRegisterName(Dst.getReg())
+ << ", " << getRegisterName(MO1.getReg());
+
+ if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
+ printAnnotation(O, Annot);
+ return;
}
+
+ O << ", #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
+ printAnnotation(O, Annot);
return;
}
+
// A8.6.123 PUSH
if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) &&
MI->getOperand(0).getReg() == ARM::SP) {
@@ -74,6 +108,15 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
O << ".w";
O << '\t';
printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
+ return;
+ }
+ if (Opcode == ARM::STR_PRE_IMM && MI->getOperand(2).getReg() == ARM::SP &&
+ MI->getOperand(3).getImm() == -4) {
+ O << '\t' << "push";
+ printPredicateOperand(MI, 4, O);
+ O << "\t{" << getRegisterName(MI->getOperand(1).getReg()) << "}";
+ printAnnotation(O, Annot);
return;
}
@@ -86,8 +129,18 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
O << ".w";
O << '\t';
printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
return;
}
+ if (Opcode == ARM::LDR_POST_IMM && MI->getOperand(2).getReg() == ARM::SP &&
+ MI->getOperand(4).getImm() == 4) {
+ O << '\t' << "pop";
+ printPredicateOperand(MI, 5, O);
+ O << "\t{" << getRegisterName(MI->getOperand(0).getReg()) << "}";
+ printAnnotation(O, Annot);
+ return;
+ }
+
// A8.6.355 VPUSH
if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) &&
@@ -96,6 +149,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
printPredicateOperand(MI, 2, O);
O << '\t';
printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
return;
}
@@ -106,10 +160,40 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
printPredicateOperand(MI, 2, O);
O << '\t';
printRegisterList(MI, 4, O);
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ if (Opcode == ARM::tLDMIA) {
+ bool Writeback = true;
+ unsigned BaseReg = MI->getOperand(0).getReg();
+ for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
+ if (MI->getOperand(i).getReg() == BaseReg)
+ Writeback = false;
+ }
+
+ O << "\tldm";
+
+ printPredicateOperand(MI, 1, O);
+ O << '\t' << getRegisterName(BaseReg);
+ if (Writeback) O << "!";
+ O << ", ";
+ printRegisterList(MI, 3, O);
+ printAnnotation(O, Annot);
+ return;
+ }
+
+ // Thumb1 NOP
+ if (Opcode == ARM::tMOVr && MI->getOperand(0).getReg() == ARM::R8 &&
+ MI->getOperand(1).getReg() == ARM::R8) {
+ O << "\tnop";
+ printPredicateOperand(MI, 2, O);
+ printAnnotation(O, Annot);
return;
}
printInstruction(MI, O);
+ printAnnotation(O, Annot);
}
void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -122,16 +206,38 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
O << '#' << Op.getImm();
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
- O << *Op.getExpr();
+      // If a symbolic branch target was added as a constant expression,
+      // print that address in hex.
+ const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
+ int64_t Address;
+ if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+ O << "0x";
+ O.write_hex(Address);
+ }
+ else {
+ // Otherwise, just print the expression.
+ O << *Op.getExpr();
+ }
}
}
+void ARMInstPrinter::printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ if (MO1.isExpr())
+ O << *MO1.getExpr();
+ else if (MO1.isImm())
+ O << "[pc, #" << MO1.getImm() << "]";
+ else
+ llvm_unreachable("Unknown LDR label operand?");
+}
+
// so_reg is a 4-operand unit corresponding to register forms of the A5.1
// "Addressing Mode 1 - Data-processing operands" forms. This includes:
// REG 0 0 - e.g. R5
// REG REG 0,SH_OPC - e.g. R5, ROR R3
// REG 0 IMM,SH_OPC - e.g. R5, LSL #3
-void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum,
+void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(OpNum);
const MCOperand &MO2 = MI->getOperand(OpNum+1);
@@ -144,14 +250,27 @@ void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum,
O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
if (ShOpc == ARM_AM::rrx)
return;
- if (MO2.getReg()) {
- O << ' ' << getRegisterName(MO2.getReg());
- assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
- } else if (ShOpc != ARM_AM::rrx) {
- O << " #" << ARM_AM::getSORegOffset(MO3.getImm());
- }
+
+ O << ' ' << getRegisterName(MO2.getReg());
+ assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+}
+
+void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+ O << getRegisterName(MO1.getReg());
+
+ // Print the shift opc.
+ ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
+ O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
+ if (ShOpc == ARM_AM::rrx)
+ return;
+ O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
}
+
//===--------------------------------------------------------------------===//
// Addressing Mode #2
//===--------------------------------------------------------------------===//
@@ -209,6 +328,22 @@ void ARMInstPrinter::printAM2PostIndexOp(const MCInst *MI, unsigned Op,
<< " #" << ShImm;
}
+void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(Op);
+ const MCOperand &MO2 = MI->getOperand(Op+1);
+ O << "[" << getRegisterName(MO1.getReg()) << ", "
+ << getRegisterName(MO2.getReg()) << "]";
+}
+
+void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(Op);
+ const MCOperand &MO2 = MI->getOperand(Op+1);
+ O << "[" << getRegisterName(MO1.getReg()) << ", "
+ << getRegisterName(MO2.getReg()) << ", lsl #1]";
+}
+
void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(Op);
@@ -284,7 +419,7 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
O << '[' << getRegisterName(MO1.getReg());
if (MO2.getReg()) {
- O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm())
+ O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
<< getRegisterName(MO2.getReg()) << ']';
return;
}
@@ -315,8 +450,8 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
const MCOperand &MO2 = MI->getOperand(OpNum+1);
if (MO1.getReg()) {
- O << (char)ARM_AM::getAM3Op(MO2.getImm())
- << getRegisterName(MO1.getReg());
+ O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()))
+ << getRegisterName(MO1.getReg());
return;
}
@@ -326,6 +461,31 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
<< ImmOffs;
}
+void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ unsigned Imm = MO.getImm();
+ O << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff);
+}
+
+void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+ O << (MO2.getImm() ? "" : "-") << getRegisterName(MO1.getReg());
+}
+
+void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ unsigned Imm = MO.getImm();
+ O << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2);
+}
+
+
void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(OpNum)
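
The post-index immediates added above pack the U (add) flag into bit 8, with the magnitude in bits 7:0 (scaled by 4 in the s4 variant). A freestanding sketch of the decode; the helper name is illustrative:

#include <cassert>

// Bit 8 set means add, clear means subtract; bits 7:0 hold the magnitude.
static int decodePostIdxImm8(unsigned Imm) {
  int Mag = Imm & 0xff;
  return (Imm & 256) ? Mag : -Mag;
}

int main() {
  assert(decodePostIdxImm8(0x104) == 4);  // U set   -> prints "#4"
  assert(decodePostIdxImm8(0x004) == -4); // U clear -> prints "#-4"
  return 0;
}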
@@ -345,7 +505,9 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
O << "[" << getRegisterName(MO1.getReg());
- if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+ unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
+ unsigned Op = ARM_AM::getAM5Op(MO2.getImm());
+ if (ImmOffs || Op == ARM_AM::sub) {
O << ", #"
<< ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm()))
<< ImmOffs * 4;
@@ -402,20 +564,31 @@ void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned ShiftOp = MI->getOperand(OpNum).getImm();
- ARM_AM::ShiftOpc Opc = ARM_AM::getSORegShOp(ShiftOp);
- switch (Opc) {
- case ARM_AM::no_shift:
+ bool isASR = (ShiftOp & (1 << 5)) != 0;
+ unsigned Amt = ShiftOp & 0x1f;
+ if (isASR)
+ O << ", asr #" << (Amt == 0 ? 32 : Amt);
+ else if (Amt)
+ O << ", lsl #" << Amt;
+}
+
+void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNum).getImm();
+ if (Imm == 0)
return;
- case ARM_AM::lsl:
- O << ", lsl #";
- break;
- case ARM_AM::asr:
- O << ", asr #";
- break;
- default:
- assert(0 && "unexpected shift opcode for shift immediate operand");
- }
- O << ARM_AM::getSORegOffset(ShiftOp);
+ assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!");
+ O << ", lsl #" << Imm;
+}
+
+void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNum).getImm();
+ // A shift amount of 32 is encoded as 0.
+ if (Imm == 0)
+ Imm = 32;
+ assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!");
+ O << ", asr #" << Imm;
}
void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
@@ -450,6 +623,9 @@ void ARMInstPrinter::printCPSIFlag(const MCInst *MI, unsigned OpNum,
for (int i=2; i >= 0; --i)
if (IFlags & (1 << i))
O << ARM_PROC::IFlagsToString(1 << i);
+
+ if (IFlags == 0)
+ O << "none";
}
void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
@@ -458,10 +634,43 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum,
unsigned SpecRegRBit = Op.getImm() >> 4;
unsigned Mask = Op.getImm() & 0xf;
+ if (getAvailableFeatures() & ARM::FeatureMClass) {
+ switch (Op.getImm()) {
+ default: assert(0 && "Unexpected mask value!");
+ case 0: O << "apsr"; return;
+ case 1: O << "iapsr"; return;
+ case 2: O << "eapsr"; return;
+ case 3: O << "xpsr"; return;
+ case 5: O << "ipsr"; return;
+ case 6: O << "epsr"; return;
+ case 7: O << "iepsr"; return;
+ case 8: O << "msp"; return;
+ case 9: O << "psp"; return;
+ case 16: O << "primask"; return;
+ case 17: O << "basepri"; return;
+ case 18: O << "basepri_max"; return;
+ case 19: O << "faultmask"; return;
+ case 20: O << "control"; return;
+ }
+ }
+
+ // As special cases, CPSR_f, CPSR_s and CPSR_fs prefer printing as
+  // APSR_nzcvq, APSR_g and APSR_nzcvqg, respectively.
+ if (!SpecRegRBit && (Mask == 8 || Mask == 4 || Mask == 12)) {
+ O << "APSR_";
+ switch (Mask) {
+ default: assert(0);
+ case 4: O << "g"; return;
+ case 8: O << "nzcvq"; return;
+ case 12: O << "nzcvqg"; return;
+ }
+ llvm_unreachable("Unexpected mask value!");
+ }
+
if (SpecRegRBit)
- O << "spsr";
+ O << "SPSR";
else
- O << "cpsr";
+ O << "CPSR";
if (Mask) {
O << '_';
@@ -501,15 +710,20 @@ void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum,
}
void ARMInstPrinter::printPImmediate(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
+ raw_ostream &O) {
O << "p" << MI->getOperand(OpNum).getImm();
}
void ARMInstPrinter::printCImmediate(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
+ raw_ostream &O) {
O << "c" << MI->getOperand(OpNum).getImm();
}
+void ARMInstPrinter::printCoprocOptionImm(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << "{" << MI->getOperand(OpNum).getImm() << "}";
+}
+
void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
@@ -517,7 +731,13 @@ void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
- O << "#" << MI->getOperand(OpNum).getImm() * 4;
+ O << "#" << MI->getOperand(OpNum).getImm() * 4;
+}
+
+void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNum).getImm();
+ O << "#" << (Imm == 0 ? 32 : Imm);
}
void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
@@ -610,7 +830,7 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
if (ShOpc != ARM_AM::rrx)
- O << " #" << ARM_AM::getSORegOffset(MO2.getImm());
+ O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
}
void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
@@ -647,7 +867,9 @@ void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
int32_t OffImm = (int32_t)MO2.getImm();
// Don't print +0.
- if (OffImm < 0)
+ if (OffImm == INT32_MIN)
+ O << ", #-0";
+ else if (OffImm < 0)
O << ", #-" << -OffImm;
else if (OffImm > 0)
O << ", #" << OffImm;
@@ -671,6 +893,18 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
O << "]";
}
+void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO1 = MI->getOperand(OpNum);
+ const MCOperand &MO2 = MI->getOperand(OpNum+1);
+
+ O << "[" << getRegisterName(MO1.getReg());
+ if (MO2.getImm())
+ O << ", #" << MO2.getImm() * 4;
+ O << "]";
+}
+
void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
unsigned OpNum,
raw_ostream &O) {
@@ -678,9 +912,9 @@ void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
int32_t OffImm = (int32_t)MO1.getImm();
// Don't print +0.
if (OffImm < 0)
- O << "#-" << -OffImm;
- else if (OffImm > 0)
- O << "#" << OffImm;
+ O << ", #-" << -OffImm;
+ else
+ O << ", #" << OffImm;
}
void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
@@ -689,10 +923,13 @@ void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
const MCOperand &MO1 = MI->getOperand(OpNum);
int32_t OffImm = (int32_t)MO1.getImm() / 4;
// Don't print +0.
- if (OffImm < 0)
- O << "#-" << -OffImm * 4;
- else if (OffImm > 0)
- O << "#" << OffImm * 4;
+ if (OffImm != 0) {
+ O << ", ";
+ if (OffImm < 0)
+ O << "#-" << -OffImm * 4;
+ else if (OffImm > 0)
+ O << "#" << OffImm * 4;
+ }
}
void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
@@ -715,39 +952,10 @@ void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
O << "]";
}
-void ARMInstPrinter::printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(OpNum);
- O << '#';
- if (MO.isFPImm()) {
- O << (float)MO.getFPImm();
- } else {
- union {
- uint32_t I;
- float F;
- } FPUnion;
-
- FPUnion.I = MO.getImm();
- O << FPUnion.F;
- }
-}
-
-void ARMInstPrinter::printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
+void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
const MCOperand &MO = MI->getOperand(OpNum);
- O << '#';
- if (MO.isFPImm()) {
- O << MO.getFPImm();
- } else {
- // We expect the binary encoding of a floating point number here.
- union {
- uint64_t I;
- double D;
- } FPUnion;
-
- FPUnion.I = MO.getImm();
- O << FPUnion.D;
- }
+ O << '#' << ARM_AM::getFPImmFloat(MO.getImm());
}
void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
@@ -757,3 +965,28 @@ void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
O << "#0x" << utohexstr(Val);
}
+
+void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNum).getImm();
+ O << "#" << Imm + 1;
+}
+
+void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNum).getImm();
+ if (Imm == 0)
+ return;
+ O << ", ror #";
+ switch (Imm) {
+ default: assert (0 && "illegal ror immediate!");
+ case 1: O << "8"; break;
+ case 2: O << "16"; break;
+ case 3: O << "24"; break;
+ }
+}
+
+void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ O << "[" << MI->getOperand(OpNum).getImm() << "]";
+}
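
Several of the hunks above rely on INT32_MIN as an in-band sentinel for the distinct "#-0" offset, which a plain sign test cannot express. A freestanding sketch of that printing convention; the helper name is illustrative:

#include <cstdint>
#include <cstdio>

// Mirrors the printT2AddrModeImm8Operand change: INT32_MIN means "#-0",
// negative values print with an explicit minus, and +0 is suppressed.
static void printOffImm(int32_t OffImm) {
  if (OffImm == INT32_MIN)
    std::printf(", #-0");
  else if (OffImm < 0)
    std::printf(", #-%d", -OffImm);
  else if (OffImm > 0)
    std::printf(", #%d", OffImm);
}

int main() {
  printOffImm(INT32_MIN); // ", #-0"
  printOffImm(-4);        // ", #-4"
  printOffImm(0);         // prints nothing
  std::printf("\n");
  return 0;
}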
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index d5f238b..5c2173f 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -15,6 +15,7 @@
#define ARMINSTPRINTER_H
#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
@@ -22,10 +23,9 @@ class MCOperand;
class ARMInstPrinter : public MCInstPrinter {
public:
- ARMInstPrinter(const MCAsmInfo &MAI)
- : MCInstPrinter(MAI) {}
+ ARMInstPrinter(const MCAsmInfo &MAI, const MCSubtargetInfo &STI);
- virtual void printInst(const MCInst *MI, raw_ostream &O);
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
virtual StringRef getOpcodeName(unsigned Opcode) const;
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
@@ -38,8 +38,11 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printSORegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printSORegRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printSORegImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printAddrModeTBB(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printAddrModeTBH(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printAddrMode2Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printAM2PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
@@ -48,11 +51,15 @@ public:
raw_ostream &O);
void printAddrMode3Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printAM3PostIndexOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned OpNum,
- raw_ostream &O);
void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
+ void printAM3PostIndexOp(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,raw_ostream &O);
+ void printPostIdxImm8Operand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+ void printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
void printLdStmModeOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printAddrMode5Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
@@ -65,8 +72,11 @@ public:
raw_ostream &O);
void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printThumbITMask(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
@@ -88,6 +98,8 @@ public:
raw_ostream &O);
void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
+ void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
void printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, unsigned OpNum,
@@ -108,11 +120,15 @@ public:
void printNoHashImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printPImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printCImmediate(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printVFPf32ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printVFPf64ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printCoprocOptionImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printNEONModImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printRotImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
};
} // end namespace llvm
diff --git a/lib/Target/ARM/InstPrinter/CMakeLists.txt b/lib/Target/ARM/InstPrinter/CMakeLists.txt
index 18645c0..fa0b495 100644
--- a/lib/Target/ARM/InstPrinter/CMakeLists.txt
+++ b/lib/Target/ARM/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
add_llvm_library(LLVMARMAsmPrinter
ARMInstPrinter.cpp
)
-add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
+
+add_dependencies(LLVMARMAsmPrinter ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMAsmPrinter
+ LLVMMC
+ LLVMSupport
+ )
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 595708f..9982fa6 100644
--- a/lib/Target/ARM/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -14,7 +14,8 @@
#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
-#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
@@ -32,7 +33,8 @@ namespace ARM_AM {
};
enum AddrOpc {
- add = '+', sub = '-'
+ sub = 0,
+ add
};
static inline const char *getAddrOpcStr(AddrOpc Op) {
@@ -60,20 +62,6 @@ namespace ARM_AM {
}
}
- static inline ShiftOpc getShiftOpcForNode(SDValue N) {
- switch (N.getOpcode()) {
- default: return ARM_AM::no_shift;
- case ISD::SHL: return ARM_AM::lsl;
- case ISD::SRL: return ARM_AM::lsr;
- case ISD::SRA: return ARM_AM::asr;
- case ISD::ROTR: return ARM_AM::ror;
- //case ISD::ROTL: // Only if imm -> turn into ROTR.
- // Can't handle RRX here, because it would require folding a flag into
- // the addressing mode. :( This causes us to miss certain things.
- //case ARMISD::RRX: return ARM_AM::rrx;
- }
- }
-
enum AMSubMode {
bad_am_submode = 0,
ia,
@@ -588,6 +576,90 @@ namespace ARM_AM {
AMSubMode getLoadStoreMultipleSubMode(int Opcode);
+ //===--------------------------------------------------------------------===//
+ // Floating-point Immediates
+ //
+ static inline float getFPImmFloat(unsigned Imm) {
+ // We expect an 8-bit binary encoding of a floating-point number here.
+ union {
+ uint32_t I;
+ float F;
+ } FPUnion;
+
+ uint8_t Sign = (Imm >> 7) & 0x1;
+ uint8_t Exp = (Imm >> 4) & 0x7;
+ uint8_t Mantissa = Imm & 0xf;
+
+    // 8-bit FP IEEE Float Encoding
+ // abcd efgh aBbbbbbc defgh000 00000000 00000000
+ //
+ // where B = NOT(b);
+
+ FPUnion.I = 0;
+ FPUnion.I |= Sign << 31;
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0 : 1) << 30;
+ FPUnion.I |= ((Exp & 0x4) != 0 ? 0x1f : 0) << 25;
+ FPUnion.I |= (Exp & 0x3) << 23;
+ FPUnion.I |= Mantissa << 19;
+ return FPUnion.F;
+ }
+
+ /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit
+ /// floating-point value. If the value cannot be represented as an 8-bit
+ /// floating-point value, then return -1.
+ static inline int getFP32Imm(const APInt &Imm) {
+ uint32_t Sign = Imm.lshr(31).getZExtValue() & 1;
+ int32_t Exp = (Imm.lshr(23).getSExtValue() & 0xff) - 127; // -126 to 127
+ int64_t Mantissa = Imm.getZExtValue() & 0x7fffff; // 23 bits
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0x7ffff)
+ return -1;
+ Mantissa >>= 19;
+ if ((Mantissa & 0xf) != Mantissa)
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+ }
+
+ static inline int getFP32Imm(const APFloat &FPImm) {
+ return getFP32Imm(FPImm.bitcastToAPInt());
+ }
+
+ /// getFP64Imm - Return an 8-bit floating-point version of the 64-bit
+ /// floating-point value. If the value cannot be represented as an 8-bit
+ /// floating-point value, then return -1.
+ static inline int getFP64Imm(const APInt &Imm) {
+ uint64_t Sign = Imm.lshr(63).getZExtValue() & 1;
+ int64_t Exp = (Imm.lshr(52).getSExtValue() & 0x7ff) - 1023; // -1022 to 1023
+ uint64_t Mantissa = Imm.getZExtValue() & 0xfffffffffffffULL;
+
+ // We can handle 4 bits of mantissa.
+ // mantissa = (16+UInt(e:f:g:h))/16.
+ if (Mantissa & 0xffffffffffffULL)
+ return -1;
+ Mantissa >>= 48;
+ if ((Mantissa & 0xf) != Mantissa)
+ return -1;
+
+ // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3
+ if (Exp < -3 || Exp > 4)
+ return -1;
+ Exp = ((Exp+3) & 0x7) ^ 4;
+
+ return ((int)Sign << 7) | (Exp << 4) | Mantissa;
+ }
+
+ static inline int getFP64Imm(const APFloat &FPImm) {
+ return getFP64Imm(FPImm.bitcastToAPInt());
+ }
+
} // end namespace ARM_AM
} // end namespace llvm
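
The 8-bit VFP immediate round-trips: for example imm8 = 0x70 (sign 0, exponent field 0b111, mantissa 0) decodes to 1.0f, and getFP32Imm maps the bits of 1.0f back to 0x70. A self-contained sketch of the bit assembly in getFPImmFloat, using memcpy in place of the union purely for illustration:

#include <cassert>
#include <cstdint>
#include <cstring>

static float fpImmFloat(unsigned Imm) {
  uint8_t Sign = (Imm >> 7) & 0x1;
  uint8_t Exp = (Imm >> 4) & 0x7;
  uint8_t Mantissa = Imm & 0xf;
  uint32_t I = 0;
  I |= uint32_t(Sign) << 31;
  I |= ((Exp & 0x4) != 0 ? 0u : 1u) << 30;    // NOT(b)
  I |= ((Exp & 0x4) != 0 ? 0x1fu : 0u) << 25; // b replicated five times
  I |= uint32_t(Exp & 0x3) << 23;             // c:d
  I |= uint32_t(Mantissa) << 19;              // e:f:g:h
  float F;
  std::memcpy(&F, &I, sizeof(F));
  return F;
}

int main() {
  assert(fpImmFloat(0x70) == 1.0f);  // bits 0x3f800000
  assert(fpImmFloat(0xf0) == -1.0f); // sign bit set
  return 0;
}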
diff --git a/lib/Target/ARM/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 5e438a9..c31c5e6 100644
--- a/lib/Target/ARM/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -7,9 +7,10 @@
//
//===----------------------------------------------------------------------===//
-#include "ARM.h"
-#include "ARMAddressingModes.h"
-#include "ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCDirectives.h"
@@ -19,12 +20,12 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetAsmBackend.h"
-#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
namespace {
@@ -35,13 +36,24 @@ public:
/*HasRelocationAddend*/ false) {}
};
-class ARMAsmBackend : public TargetAsmBackend {
+class ARMAsmBackend : public MCAsmBackend {
+ const MCSubtargetInfo* STI;
bool isThumbMode; // Currently emitting Thumb code.
public:
- ARMAsmBackend(const Target &T) : TargetAsmBackend(), isThumbMode(false) {}
+ ARMAsmBackend(const Target &T, const StringRef TT)
+ : MCAsmBackend(), STI(ARM_MC::createARMMCSubtargetInfo(TT, "", "")),
+ isThumbMode(TT.startswith("thumb")) {}
+
+ ~ARMAsmBackend() {
+ delete STI;
+ }
unsigned getNumFixupKinds() const { return ARM::NumTargetFixupKinds; }
+ bool hasNOP() const {
+ return (STI->getFeatureBits() & ARM::HasV6T2Ops) != 0;
+ }
+
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
const static MCFixupKindInfo Infos[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
@@ -65,9 +77,9 @@ public:
{ "fixup_t2_uncondbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_arm_thumb_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_arm_thumb_bl", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_blx", 7, 21, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_blx", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_arm_thumb_cb", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_arm_thumb_cp", 1, 8, MCFixupKindInfo::FKF_IsPCRel },
+{ "fixup_arm_thumb_cp", 0, 8, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_arm_thumb_bcc", 0, 8, MCFixupKindInfo::FKF_IsPCRel },
// movw / movt: 16-bits immediate but scattered into two chunks 0 - 12, 16 - 19.
{ "fixup_arm_movt_hi16", 0, 20, 0 },
@@ -81,7 +93,7 @@ public:
};
if (Kind < FirstTargetFixupKind)
- return TargetAsmBackend::getFixupKindInfo(Kind);
+ return MCAsmBackend::getFixupKindInfo(Kind);
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
@@ -123,20 +135,28 @@ void ARMAsmBackend::RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
}
bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+ const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8
+ const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP
+  const uint32_t ARMv4_NopEncoding = 0xe1a00000; // using MOV r0,r0
+ const uint32_t ARMv6T2_NopEncoding = 0xe3207800; // NOP
if (isThumb()) {
- // FIXME: 0xbf00 is the ARMv7 value. For v6 and before, we'll need to
- // use 0x46c0 (which is a 'mov r8, r8' insn).
+ const uint16_t nopEncoding = hasNOP() ? Thumb2_16bitNopEncoding
+ : Thumb1_16bitNopEncoding;
uint64_t NumNops = Count / 2;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->Write16(0xbf00);
+ OW->Write16(nopEncoding);
if (Count & 1)
OW->Write8(0);
return true;
}
// ARM mode
+ const uint32_t nopEncoding = hasNOP() ? ARMv6T2_NopEncoding
+ : ARMv4_NopEncoding;
uint64_t NumNops = Count / 4;
for (uint64_t i = 0; i != NumNops; ++i)
- OW->Write32(0xe1a00000);
+ OW->Write32(nopEncoding);
+ // FIXME: should this function return false when unable to write exactly
+ // 'Count' bytes with NOP encodings?
switch (Count % 4) {
default: break; // No leftover bytes to write
case 1: OW->Write8(0); break;
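
The padding logic above reduces to choosing one of four encodings by mode and feature level. A freestanding sketch with the constants copied from the hunk; the selection helper is hypothetical:

#include <cassert>
#include <cstdint>

static uint32_t pickNop(bool isThumb, bool hasNOP) {
  if (isThumb)
    return hasNOP ? 0xbf00u    // Thumb2 NOP
                  : 0x46c0u;   // Thumb1 "mov r8, r8"
  return hasNOP ? 0xe3207800u  // v6T2 NOP encoding, as given above
                : 0xe1a00000u; // ARMv4 "mov r0, r0"
}

int main() {
  assert(pickNop(false, false) == 0xe1a00000u); // pre-v6T2 ARM padding
  assert(pickNop(true, true) == 0xbf00u);       // v6T2+ Thumb padding
  return 0;
}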
@@ -163,8 +183,6 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case ARM::fixup_arm_movw_lo16_pcrel: {
unsigned Hi4 = (Value & 0xF000) >> 12;
unsigned Lo12 = Value & 0x0FFF;
- assert ((((int64_t)Value) >= -0x8000) && (((int64_t)Value) <= 0x7fff) &&
- "Out of range pc-relative fixup value!");
// inst{19-16} = Hi4;
// inst{11-0} = Lo12;
Value = (Hi4 << 16) | (Lo12);
@@ -185,10 +203,6 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
// inst{26} = i;
// inst{14-12} = Mid3;
// inst{7-0} = Lo8;
- // The value comes in as the whole thing, not just the portion required
- // for this fixup, so we need to mask off the bits not handled by this
- // portion (lo vs. hi).
- Value &= 0xffff;
Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
uint64_t swapped = (Value & 0xFFFF0000) >> 16;
swapped |= (Value & 0x0000FFFF) << 16;
@@ -382,8 +396,9 @@ namespace {
class ELFARMAsmBackend : public ARMAsmBackend {
public:
Triple::OSType OSType;
- ELFARMAsmBackend(const Target &T, Triple::OSType _OSType)
- : ARMAsmBackend(T), OSType(_OSType) { }
+ ELFARMAsmBackend(const Target &T, const StringRef TT,
+ Triple::OSType _OSType)
+ : ARMAsmBackend(T, TT), OSType(_OSType) { }
void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
uint64_t Value) const;
@@ -414,8 +429,9 @@ void ELFARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
class DarwinARMAsmBackend : public ARMAsmBackend {
public:
const object::mach::CPUSubtypeARM Subtype;
- DarwinARMAsmBackend(const Target &T, object::mach::CPUSubtypeARM st)
- : ARMAsmBackend(T), Subtype(st) { }
+ DarwinARMAsmBackend(const Target &T, const StringRef TT,
+ object::mach::CPUSubtypeARM st)
+ : ARMAsmBackend(T, TT), Subtype(st) { }
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
@@ -492,25 +508,24 @@ void DarwinARMAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
} // end anonymous namespace
-TargetAsmBackend *llvm::createARMAsmBackend(const Target &T,
- const std::string &TT) {
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin()) {
if (TheTriple.getArchName() == "armv4t" ||
TheTriple.getArchName() == "thumbv4t")
- return new DarwinARMAsmBackend(T, object::mach::CSARM_V4T);
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V4T);
else if (TheTriple.getArchName() == "armv5e" ||
TheTriple.getArchName() == "thumbv5e")
- return new DarwinARMAsmBackend(T, object::mach::CSARM_V5TEJ);
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V5TEJ);
else if (TheTriple.getArchName() == "armv6" ||
TheTriple.getArchName() == "thumbv6")
- return new DarwinARMAsmBackend(T, object::mach::CSARM_V6);
- return new DarwinARMAsmBackend(T, object::mach::CSARM_V7);
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V6);
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7);
}
if (TheTriple.isOSWindows())
assert(0 && "Windows not supported on ARM");
- return new ELFARMAsmBackend(T, Triple(TT).getOS());
+ return new ELFARMAsmBackend(T, TT, Triple(TT).getOS());
}
diff --git a/lib/Target/ARM/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 458f7dd..ec4b6ff 100644
--- a/lib/Target/ARM/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -17,12 +17,9 @@
#ifndef ARMBASEINFO_H
#define ARMBASEINFO_H
-#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "ARMMCTargetDesc.h"
#include "llvm/Support/ErrorHandling.h"
-// Note that the following auto-generated files only defined enum types, and
-// so are safe to include here.
-
namespace llvm {
// Enums corresponding to ARM condition codes
@@ -191,6 +188,22 @@ inline static unsigned getARMRegisterNumbering(unsigned Reg) {
}
}
+/// isARMLowRegister - Returns true if the register is a low register (r0-r7).
+///
+static inline bool isARMLowRegister(unsigned Reg) {
+ using namespace ARM;
+ switch (Reg) {
+ case R0: case R1: case R2: case R3:
+ case R4: case R5: case R6: case R7:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
namespace ARMII {
/// ARM Index Modes
@@ -287,6 +300,148 @@ namespace ARMII {
/// call operand.
MO_PLT
};
+
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction Flags.
+
+ //===------------------------------------------------------------------===//
+    // This five-bit field describes the addressing mode used.
+ AddrModeMask = 0x1f, // The AddrMode enums are declared in ARMBaseInfo.h
+
+    // IndexMode - Unindexed, pre-indexed, and post-indexed modes are valid
+    // for load and store ops only. A generic "updating" flag is used for
+    // ld/st multiple.
+ // The index mode enums are declared in ARMBaseInfo.h
+ IndexModeShift = 5,
+ IndexModeMask = 3 << IndexModeShift,
+
+ //===------------------------------------------------------------------===//
+ // Instruction encoding formats.
+ //
+ FormShift = 7,
+ FormMask = 0x3f << FormShift,
+
+ // Pseudo instructions
+ Pseudo = 0 << FormShift,
+
+ // Multiply instructions
+ MulFrm = 1 << FormShift,
+
+ // Branch instructions
+ BrFrm = 2 << FormShift,
+ BrMiscFrm = 3 << FormShift,
+
+ // Data Processing instructions
+ DPFrm = 4 << FormShift,
+ DPSoRegFrm = 5 << FormShift,
+
+ // Load and Store
+ LdFrm = 6 << FormShift,
+ StFrm = 7 << FormShift,
+ LdMiscFrm = 8 << FormShift,
+ StMiscFrm = 9 << FormShift,
+ LdStMulFrm = 10 << FormShift,
+
+ LdStExFrm = 11 << FormShift,
+
+ // Miscellaneous arithmetic instructions
+ ArithMiscFrm = 12 << FormShift,
+ SatFrm = 13 << FormShift,
+
+ // Extend instructions
+ ExtFrm = 14 << FormShift,
+
+ // VFP formats
+ VFPUnaryFrm = 15 << FormShift,
+ VFPBinaryFrm = 16 << FormShift,
+ VFPConv1Frm = 17 << FormShift,
+ VFPConv2Frm = 18 << FormShift,
+ VFPConv3Frm = 19 << FormShift,
+ VFPConv4Frm = 20 << FormShift,
+ VFPConv5Frm = 21 << FormShift,
+ VFPLdStFrm = 22 << FormShift,
+ VFPLdStMulFrm = 23 << FormShift,
+ VFPMiscFrm = 24 << FormShift,
+
+ // Thumb format
+ ThumbFrm = 25 << FormShift,
+
+    // Miscellaneous format
+ MiscFrm = 26 << FormShift,
+
+ // NEON formats
+ NGetLnFrm = 27 << FormShift,
+ NSetLnFrm = 28 << FormShift,
+ NDupFrm = 29 << FormShift,
+ NLdStFrm = 30 << FormShift,
+ N1RegModImmFrm= 31 << FormShift,
+ N2RegFrm = 32 << FormShift,
+ NVCVTFrm = 33 << FormShift,
+ NVDupLnFrm = 34 << FormShift,
+ N2RegVShLFrm = 35 << FormShift,
+ N2RegVShRFrm = 36 << FormShift,
+ N3RegFrm = 37 << FormShift,
+ N3RegVShFrm = 38 << FormShift,
+ NVExtFrm = 39 << FormShift,
+ NVMulSLFrm = 40 << FormShift,
+ NVTBLFrm = 41 << FormShift,
+
+ //===------------------------------------------------------------------===//
+ // Misc flags.
+
+ // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+ // it doesn't have a Rn operand.
+ UnaryDP = 1 << 13,
+
+ // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+ // a 16-bit Thumb instruction if certain conditions are met.
+ Xform16Bit = 1 << 14,
+
+ // ThumbArithFlagSetting - The instruction is a 16-bit flag setting Thumb
+ // instruction. Used by the parser to determine whether to require the 'S'
+ // suffix on the mnemonic (when not in an IT block) or preclude it (when
+ // in an IT block).
+ ThumbArithFlagSetting = 1 << 18,
+
+ //===------------------------------------------------------------------===//
+ // Code domain.
+ DomainShift = 15,
+ DomainMask = 7 << DomainShift,
+ DomainGeneral = 0 << DomainShift,
+ DomainVFP = 1 << DomainShift,
+ DomainNEON = 2 << DomainShift,
+ DomainNEONA8 = 4 << DomainShift,
+
+ //===------------------------------------------------------------------===//
+    // Field shifts - such shifts are used to set fields while generating
+    // machine instructions.
+ //
+ // FIXME: This list will need adjusting/fixing as the MC code emitter
+ // takes shape and the ARMCodeEmitter.cpp bits go away.
+ ShiftTypeShift = 4,
+
+ M_BitShift = 5,
+ ShiftImmShift = 5,
+ ShiftShift = 7,
+ N_BitShift = 7,
+ ImmHiShift = 8,
+ SoRotImmShift = 8,
+ RegRsShift = 8,
+ ExtRotImmShift = 10,
+ RegRdLoShift = 12,
+ RegRdShift = 12,
+ RegRdHiShift = 16,
+ RegRnShift = 16,
+ S_BitShift = 20,
+ W_BitShift = 21,
+ AM3_I_BitShift = 22,
+ D_BitShift = 22,
+ U_BitShift = 23,
+ P_BitShift = 24,
+ I_BitShift = 25,
+ CondShift = 28
+ };
+
} // end namespace ARMII
} // end namespace llvm;
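
The ARMII masks and shifts above are meant to be applied to an instruction's TSFlags word. A freestanding sketch with a synthetic TSFlags value, reproducing only the enumerators it needs:

#include <cassert>
#include <cstdint>

namespace ARMII {
  enum {
    AddrModeMask = 0x1f,
    IndexModeShift = 5,
    IndexModeMask = 3 << IndexModeShift,
    FormShift = 7,
    FormMask = 0x3f << FormShift,
    MulFrm = 1 << FormShift
  };
}

int main() {
  // A made-up multiply instruction using addressing mode 3, no indexing.
  uint64_t TSFlags = uint64_t(ARMII::MulFrm) | 0x3;
  assert((TSFlags & ARMII::FormMask) == ARMII::MulFrm);
  assert((TSFlags & ARMII::AddrModeMask) == 0x3);
  assert(((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) == 0);
  return 0;
}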
diff --git a/lib/Target/ARM/ARMFixupKinds.h b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
index 350c92d..350c92d 100644
--- a/lib/Target/ARM/ARMFixupKinds.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMFixupKinds.h
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index 53b4c95..1c109e0 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -52,6 +52,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
AsmTransCBE = arm_asm_table;
Data64bitsDirective = 0;
CommentString = "@";
+ Code16Directive = ".code\t16";
+ Code32Directive = ".code\t32";
+
SupportsDebugInformation = true;
// Exceptions handling
@@ -64,12 +67,14 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
Data64bitsDirective = 0;
CommentString = "@";
-
- HasLEB128 = true;
PrivateGlobalPrefix = ".L";
+ Code16Directive = ".code\t16";
+ Code32Directive = ".code\t32";
+
WeakRefDirective = "\t.weak\t";
- HasLCOMMDirective = true;
+ LCOMMDirectiveType = LCOMM::NoAlignment;
+ HasLEB128 = true;
SupportsDebugInformation = true;
// Exceptions handling
diff --git a/lib/Target/ARM/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 39be3f0..865c3e2 100644
--- a/lib/Target/ARM/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -12,17 +12,18 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mccodeemitter"
-#include "ARM.h"
-#include "ARMAddressingModes.h"
-#include "ARMFixupKinds.h"
-#include "ARMInstrInfo.h"
-#include "ARMMCExpr.h"
-#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "MCTargetDesc/ARMMCExpr.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/raw_ostream.h"
@@ -112,11 +113,13 @@ public:
/// immediate Thumb2 direct branch target.
uint32_t getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const;
-
+
/// getARMBranchTargetOpValue - Return encoding info for 24-bit immediate
/// branch target.
uint32_t getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const;
+ uint32_t getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const;
/// getAdrLabelOpValue - Return encoding info for 12-bit immediate
/// ADR label target.
@@ -142,6 +145,16 @@ public:
uint32_t getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const;
+ /// getT2AddrModeImm0_1020s4OpValue - Return encoding info for 'reg + imm8<<2'
+ /// operand.
+ uint32_t getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ /// getT2Imm8s4OpValue - Return encoding info for '+/- imm8<<2'
+ /// operand.
+ uint32_t getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
/// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
/// operand as needed by load/store instructions.
@@ -183,6 +196,10 @@ public:
uint32_t getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const;
+ /// getPostIdxRegOpValue - Return encoding for postidx_reg operands.
+ uint32_t getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
/// getAddrMode3OffsetOpValue - Return encoding for am3offset operands.
uint32_t getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const;
@@ -251,27 +268,13 @@ public:
SmallVectorImpl<MCFixup> &Fixups) const;
/// getSORegOpValue - Return an encoded so_reg shifted register value.
- unsigned getSORegOpValue(const MCInst &MI, unsigned Op,
+ unsigned getSORegRegOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getSORegImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getT2SORegOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const;
- unsigned getRotImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
- switch (MI.getOperand(Op).getImm()) {
- default: assert (0 && "Not a valid rot_imm value!");
- case 0: return 0;
- case 8: return 1;
- case 16: return 2;
- case 24: return 3;
- }
- }
-
- unsigned getImmMinusOneOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
- return MI.getOperand(Op).getImm() - 1;
- }
-
unsigned getNEONVcvtImm32OpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const {
return 64 - MI.getOperand(Op).getImm();
@@ -280,12 +283,6 @@ public:
unsigned getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const;
- unsigned getMsbOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
- unsigned getSsatBitPosValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const;
-
unsigned getRegisterListOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
@@ -306,6 +303,9 @@ public:
unsigned getShiftRight64Imm(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const;
+ unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
unsigned EncodedValue) const;
unsigned NEONThumb2LoadStorePostEncoder(const MCInst &MI,
@@ -439,8 +439,10 @@ EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
bool isAdd = true;
// Special value for #-0
- if (SImm == INT32_MIN)
+ if (SImm == INT32_MIN) {
SImm = 0;
+ isAdd = false;
+ }
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
if (SImm < 0) {
@@ -470,11 +472,34 @@ static uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
return 0;
}
+// Thumb BL and BLX use an unusual offset encoding in which bits 22 and 21
+// (J1 and J2) are derived by inverting them and XOR'ing the result with
+// bit 23 (the sign bit, S).
+static int32_t encodeThumbBLOffset(int32_t offset) {
+ offset >>= 1;
+ uint32_t S = (offset & 0x800000) >> 23;
+ uint32_t J1 = (offset & 0x400000) >> 22;
+ uint32_t J2 = (offset & 0x200000) >> 21;
+ J1 = (~J1 & 0x1);
+ J2 = (~J2 & 0x1);
+ J1 ^= S;
+ J2 ^= S;
+
+ offset &= ~0x600000;
+ offset |= J1 << 22;
+ offset |= J2 << 21;
+
+ return offset;
+}
+
/// getThumbBLTargetOpValue - Return encoding info for immediate branch target.
uint32_t ARMMCCodeEmitter::
getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl, Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bl,
+ Fixups);
+ return encodeThumbBLOffset(MO.getImm());
}
/// getThumbBLXTargetOpValue - Return encoding info for Thumb immediate
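
Tracing encodeThumbBLOffset on a small forward branch makes the J1/J2 rewrite concrete. A freestanding sketch (like the original, it assumes an arithmetic right shift for negative offsets):

#include <cassert>
#include <cstdint>

static int32_t encodeThumbBLOffset(int32_t offset) {
  offset >>= 1; // byte offset to halfword offset
  uint32_t S  = (offset & 0x800000) >> 23;
  uint32_t J1 = ((~((offset & 0x400000) >> 22)) & 0x1) ^ S;
  uint32_t J2 = ((~((offset & 0x200000) >> 21)) & 0x1) ^ S;
  offset &= ~0x600000;
  offset |= J1 << 22;
  offset |= J2 << 21;
  return offset;
}

int main() {
  // Branch forward 4 bytes: imm = 2, S = 0, so J1 = J2 = NOT(0) ^ 0 = 1.
  assert(encodeThumbBLOffset(4) == 0x600002);
  return 0;
}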
@@ -482,28 +507,43 @@ getThumbBLTargetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getThumbBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx, Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_blx,
+ Fixups);
+ return encodeThumbBLOffset(MO.getImm());
}
/// getThumbBRTargetOpValue - Return encoding info for Thumb branch target.
uint32_t ARMMCCodeEmitter::
getThumbBRTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br, Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_br,
+ Fixups);
+ return (MO.getImm() >> 1);
}
/// getThumbBCCTargetOpValue - Return encoding info for Thumb branch target.
uint32_t ARMMCCodeEmitter::
getThumbBCCTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc, Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_bcc,
+ Fixups);
+ return (MO.getImm() >> 1);
}
/// getThumbCBTargetOpValue - Return encoding info for Thumb branch target.
uint32_t ARMMCCodeEmitter::
getThumbCBTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cb, Fixups);
+ return (MO.getImm() >> 1);
}
/// Return true if this branch has a non-always predication
@@ -513,9 +553,9 @@ static bool HasConditionalBranch(const MCInst &MI) {
for (int i = 0; i < NumOp-1; ++i) {
const MCOperand &MCOp1 = MI.getOperand(i);
const MCOperand &MCOp2 = MI.getOperand(i + 1);
- if (MCOp1.isImm() && MCOp2.isReg() &&
+ if (MCOp1.isImm() && MCOp2.isReg() &&
(MCOp2.getReg() == 0 || MCOp2.getReg() == ARM::CPSR)) {
- if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL)
+ if (ARMCC::CondCodes(MCOp1.getImm()) != ARMCC::AL)
return true;
}
}
@@ -541,15 +581,32 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getARMBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- if (HasConditionalBranch(MI))
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr()) {
+ if (HasConditionalBranch(MI))
+ return ::getBranchTargetOpValue(MI, OpIdx,
+ ARM::fixup_arm_condbranch, Fixups);
return ::getBranchTargetOpValue(MI, OpIdx,
- ARM::fixup_arm_condbranch, Fixups);
- return ::getBranchTargetOpValue(MI, OpIdx,
- ARM::fixup_arm_uncondbranch, Fixups);
-}
+ ARM::fixup_arm_uncondbranch, Fixups);
+ }
+ return MO.getImm() >> 2;
+}
+uint32_t ARMMCCodeEmitter::
+getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr()) {
+ if (HasConditionalBranch(MI))
+ return ::getBranchTargetOpValue(MI, OpIdx,
+ ARM::fixup_arm_condbranch, Fixups);
+ return ::getBranchTargetOpValue(MI, OpIdx,
+ ARM::fixup_arm_uncondbranch, Fixups);
+ }
+ return MO.getImm() >> 1;
+}
/// getUnconditionalBranchTargetOpValue - Return encoding info for 24-bit
/// immediate branch target.
@@ -579,9 +636,18 @@ getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
- Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
+ Fixups);
+ int32_t offset = MO.getImm();
+ uint32_t Val = 0x2000;
+ if (offset < 0) {
+ Val = 0x1000;
+ offset *= -1;
+ }
+ Val |= offset;
+ return Val;
}
/// getAdrLabelOpValue - Return encoding info for 12-bit immediate ADR label
@@ -589,9 +655,16 @@ getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
- Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_adr_pcrel_12,
+ Fixups);
+ int32_t Val = MO.getImm();
+ if (Val < 0) {
+ Val *= -1;
+ Val |= 0x1000;
+ }
+ return Val;
}
/// getAdrLabelOpValue - Return encoding info for 8-bit immediate ADR label
@@ -599,9 +672,11 @@ getT2AdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getThumbAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- assert(MI.getOperand(OpIdx).isExpr() && "Unexpected adr target type!");
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
- Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_thumb_adr_pcrel_10,
+ Fixups);
+ return MO.getImm();
}
/// getThumbAddrModeRegRegOpValue - Return encoding info for 'reg + reg'
@@ -635,17 +710,26 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
Imm12 = 0;
isAdd = false ; // 'U' bit is set as part of the fixup.
- assert(MO.isExpr() && "Unexpected machine operand type!");
- const MCExpr *Expr = MO.getExpr();
-
- MCFixupKind Kind;
- if (isThumb2())
- Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
- else
- Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
- Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+ if (MO.isExpr()) {
+ const MCExpr *Expr = MO.getExpr();
- ++MCNumCPRelocations;
+ MCFixupKind Kind;
+ if (isThumb2())
+ Kind = MCFixupKind(ARM::fixup_t2_ldst_pcrel_12);
+ else
+ Kind = MCFixupKind(ARM::fixup_arm_ldst_pcrel_12);
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+
+ ++MCNumCPRelocations;
+ } else {
+ Reg = ARM::PC;
+ int32_t Offset = MO.getImm();
+ if (Offset < 0) {
+ Offset *= -1;
+ isAdd = false;
+ }
+ Imm12 = Offset;
+ }
} else
isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm12, Fixups);
@@ -657,6 +741,37 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
return Binary;
}
+/// getT2Imm8s4OpValue - Return encoding info for
+/// '+/- imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ // FIXME: The immediate operand should have already been encoded like this
+ // before ever getting here. The encoder method should just need to combine
+ // the MI operands for the register and the offset into a single
+ // representation for the complex operand in the .td file. This isn't just
+ // style, unfortunately. As-is, we can't represent the distinct encoding
+ // for #-0.
+
+ // {8} = (U)nsigned (add == '1', sub == '0')
+ // {7-0} = imm8
+ int32_t Imm8 = MI.getOperand(OpIdx).getImm();
+ bool isAdd = Imm8 >= 0;
+
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (Imm8 < 0)
+ Imm8 = -Imm8;
+
+ // Scaled by 4.
+ Imm8 /= 4;
+
+ uint32_t Binary = Imm8 & 0xff;
+ // Immediate is always encoded as positive. The 'U' bit controls add vs sub.
+ if (isAdd)
+ Binary |= (1 << 8);
+ return Binary;
+}
+
/// getT2AddrModeImm8s4OpValue - Return encoding info for
/// 'reg +/- imm8<<2' operand.
uint32_t ARMMCCodeEmitter::
@@ -683,6 +798,12 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
} else
isAdd = EncodeAddrModeOpValues(MI, OpIdx, Reg, Imm8, Fixups);
+ // FIXME: The immediate operand should have already been encoded like this
+ // before ever getting here. The encoder method should just need to combine
+ // the MI operands for the register and the offset into a single
+ // representation for the complex operand in the .td file. This isn't just
+ // style, unfortunately. As-is, we can't represent the distinct encoding
+ // for #-0.
uint32_t Binary = (Imm8 >> 2) & 0xff;
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
if (isAdd)
@@ -691,6 +812,20 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
return Binary;
}
+/// getT2AddrModeImm0_1020s4OpValue - Return encoding info for
+/// 'reg + imm8<<2' operand.
+uint32_t ARMMCCodeEmitter::
+getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ // {11-8} = reg
+ // {7-0} = imm8
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+ unsigned Reg = getARMRegisterNumbering(MO.getReg());
+ unsigned Imm8 = MO1.getImm();
+ return (Reg << 8) | Imm8;
+}
+
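The 'reg + imm8<<2' form above is a straight pack of the two sub-operands. A sketch with made-up operand values:

    #include <cassert>
    #include <cstdint>

    // Sketch: {11-8} = reg number, {7-0} = imm8.
    static uint32_t packRegImm8(unsigned Reg, unsigned Imm8) {
      return (Reg << 8) | (Imm8 & 0xff);
    }

    int main() {
      assert(packRegImm8(3, 5) == 0x305); // reg 3, imm8 5
    }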
// FIXME: This routine assumes that a binary
// expression will always result in a PCRel expression
// In reality, its only true if one or more subexpressions
@@ -818,6 +953,17 @@ getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
}
uint32_t ARMMCCodeEmitter::
+getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ // {4} isAdd
+ // {3-0} Rm
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx+1);
+ bool isAdd = MO1.getImm() != 0;
+ return getARMRegisterNumbering(MO.getReg()) | (isAdd << 4);
+}
+
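getPostIdxRegOpValue packs the post-indexed register and its direction into five bits. A worked sketch (register numbers are illustrative):

    #include <cassert>
    #include <cstdint>

    // Sketch: {4} = U (post-index add), {3-0} = Rm register number.
    static uint32_t encodePostIdxReg(unsigned RmNum, bool isAdd) {
      return (RmNum & 0xf) | (static_cast<uint32_t>(isAdd) << 4);
    }

    int main() {
      assert(encodePostIdxReg(2, true)  == 0x12); // r2, add
      assert(encodePostIdxReg(2, false) == 0x02); // r2, subtract
    }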
+uint32_t ARMMCCodeEmitter::
getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
// {9} 1 == imm8, 0 == Rm
@@ -891,7 +1037,10 @@ getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
uint32_t ARMMCCodeEmitter::
getAddrModePCOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
- return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups);
+ const MCOperand MO = MI.getOperand(OpIdx);
+ if (MO.isExpr())
+ return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_thumb_cp, Fixups);
+ return (MO.getImm() >> 2);
}
/// getAddrMode5OpValue - Return encoding info for 'reg +/- imm10' operand.
@@ -934,20 +1083,17 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
}
unsigned ARMMCCodeEmitter::
-getSORegOpValue(const MCInst &MI, unsigned OpIdx,
+getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
SmallVectorImpl<MCFixup> &Fixups) const {
// Sub-operands are [reg, reg, imm]. The first register is Rm, the reg to be
- // shifted. The second is either Rs, the amount to shift by, or reg0 in which
- // case the imm contains the amount to shift by.
+ // shifted. The second is Rs, the amount to shift by, and the third specifies
+ // the type of the shift.
//
// {3-0} = Rm.
- // {4} = 1 if reg shift, 0 if imm shift
+ // {4} = 1
// {6-5} = type
- // If reg shift:
- // {11-8} = Rs
- // {7} = 0
- // else (imm shift)
- // {11-7} = imm
+ // {11-8} = Rs
+ // {7} = 0
const MCOperand &MO = MI.getOperand(OpIdx);
const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
@@ -966,45 +1112,70 @@ getSORegOpValue(const MCInst &MI, unsigned OpIdx,
// LSR - 0011
// ASR - 0101
// ROR - 0111
- // RRX - 0110 and bit[11:8] clear.
switch (SOpc) {
default: llvm_unreachable("Unknown shift opc!");
case ARM_AM::lsl: SBits = 0x1; break;
case ARM_AM::lsr: SBits = 0x3; break;
case ARM_AM::asr: SBits = 0x5; break;
case ARM_AM::ror: SBits = 0x7; break;
- case ARM_AM::rrx: SBits = 0x6; break;
- }
- } else {
- // Set shift operand (bit[6:4]).
- // LSL - 000
- // LSR - 010
- // ASR - 100
- // ROR - 110
- switch (SOpc) {
- default: llvm_unreachable("Unknown shift opc!");
- case ARM_AM::lsl: SBits = 0x0; break;
- case ARM_AM::lsr: SBits = 0x2; break;
- case ARM_AM::asr: SBits = 0x4; break;
- case ARM_AM::ror: SBits = 0x6; break;
}
}
Binary |= SBits << 4;
- if (SOpc == ARM_AM::rrx)
- return Binary;
- // Encode the shift operation Rs or shift_imm (except rrx).
- if (Rs) {
- // Encode Rs bit[11:8].
- assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
- return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
+ // Encode the shift operation Rs.
+ // Encode Rs bit[11:8].
+ assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+ return Binary | (getARMRegisterNumbering(Rs) << ARMII::RegRsShift);
+}
+
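A worked instance of the register-shift encoding above, with bit 4 set to mark the register form; the register numbers are made up, and the shift table mirrors the switch above.

    #include <cassert>
    #include <cstdint>

    // Sketch of the register-shift form: {3-0} = Rm, {4} = 1,
    // {6-5} = shift type, {7} = 0, {11-8} = Rs.
    enum ShiftOp { LSL, LSR, ASR, ROR };

    static uint32_t encodeSORegReg(unsigned Rm, ShiftOp Op, unsigned Rs) {
      static const uint32_t SBits[] = {0x1, 0x3, 0x5, 0x7}; // lsl/lsr/asr/ror
      return (Rm & 0xf) | (SBits[Op] << 4) | ((Rs & 0xf) << 8);
    }

    int main() {
      // r2 shifted left by the amount in r3 encodes as 0x312.
      assert(encodeSORegReg(2, LSL, 3) == 0x312);
    }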
+unsigned ARMMCCodeEmitter::
+getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ // Sub-operands are [reg, imm]. The first register is Rm, the reg to be
+ // shifted. The second is the amount to shift by.
+ //
+ // {3-0} = Rm.
+ // {4} = 0
+ // {6-5} = type
+ // {11-7} = imm
+
+ const MCOperand &MO = MI.getOperand(OpIdx);
+ const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+ ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
+
+ // Encode Rm.
+ unsigned Binary = getARMRegisterNumbering(MO.getReg());
+
+ // Encode the shift opcode.
+ unsigned SBits = 0;
+
+ // Set shift operand (bit[6:4]).
+ // LSL - 000
+ // LSR - 010
+ // ASR - 100
+ // ROR - 110
+ // RRX - 110 and bit[11:8] clear.
+ switch (SOpc) {
+ default: llvm_unreachable("Unknown shift opc!");
+ case ARM_AM::lsl: SBits = 0x0; break;
+ case ARM_AM::lsr: SBits = 0x2; break;
+ case ARM_AM::asr: SBits = 0x4; break;
+ case ARM_AM::ror: SBits = 0x6; break;
+ case ARM_AM::rrx:
+ Binary |= 0x60;
+ return Binary;
}
// Encode shift_imm bit[11:7].
- return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
+ Binary |= SBits << 4;
+ unsigned Offset = ARM_AM::getSORegOffset(MO1.getImm());
+ assert(Offset && "Offset must be in range 1-32!");
+ if (Offset == 32) Offset = 0;
+ return Binary | (Offset << 7);
}
+
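The immediate-shift form above has two quirks worth making explicit: a shift amount of 32 is encoded as 0, and RRX reuses the ROR type bits with the immediate field clear. A hedged sketch:

    #include <cassert>
    #include <cstdint>

    // Sketch: {3-0} = Rm, {4} = 0, {6-5} = type, {11-7} = imm.
    enum ShiftOp { LSL, LSR, ASR, ROR, RRX };

    static uint32_t encodeSORegImm(unsigned Rm, ShiftOp Op, unsigned Imm) {
      uint32_t Binary = Rm & 0xf;
      if (Op == RRX)                 // RRX: type bits 110, imm field clear.
        return Binary | 0x60;
      static const uint32_t SBits[] = {0x0, 0x2, 0x4, 0x6};
      if (Imm == 32)                 // Shift-by-32 is encoded as 0.
        Imm = 0;
      return Binary | (SBits[Op] << 4) | ((Imm & 0x1f) << 7);
    }

    int main() {
      assert(encodeSORegImm(1, ASR, 32) == 0x041); // asr #32 -> imm field 0
      assert(encodeSORegImm(1, LSR, 4)  == 0x221);
      assert(encodeSORegImm(1, RRX, 0)  == 0x061);
    }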
unsigned ARMMCCodeEmitter::
getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
SmallVectorImpl<MCFixup> &Fixups) const {
@@ -1106,6 +1277,7 @@ getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
case ARM_AM::lsl: SBits = 0x0; break;
case ARM_AM::lsr: SBits = 0x2; break;
case ARM_AM::asr: SBits = 0x4; break;
+ case ARM_AM::rrx: // FALLTHROUGH
case ARM_AM::ror: SBits = 0x6; break;
}
@@ -1131,24 +1303,6 @@ getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
}
unsigned ARMMCCodeEmitter::
-getMsbOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
- // MSB - 5 bits.
- uint32_t lsb = MI.getOperand(Op-1).getImm();
- uint32_t width = MI.getOperand(Op).getImm();
- uint32_t msb = lsb+width-1;
- assert (width != 0 && msb < 32 && "Illegal bit width!");
- return msb;
-}
-
-unsigned ARMMCCodeEmitter::
-getSsatBitPosValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups) const {
- // For ssat instructions, the bit position should be encoded decremented by 1
- return MI.getOperand(Op).getImm()-1;
-}
-
-unsigned ARMMCCodeEmitter::
getRegisterListOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups) const {
// VLDM/VSTM:
@@ -1158,8 +1312,8 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
// LDM/STM:
// {15-0} = Bitfield of GPRs.
unsigned Reg = MI.getOperand(Op).getReg();
- bool SPRRegs = ARM::SPRRegClass.contains(Reg);
- bool DPRRegs = ARM::DPRRegClass.contains(Reg);
+ bool SPRRegs = llvm::ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg);
+ bool DPRRegs = llvm::ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg);
unsigned Binary = 0;
@@ -1299,7 +1453,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
Size = Desc.getSize();
else
llvm_unreachable("Unexpected instruction size!");
-
+
uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
// Thumb 32-bit wide instructions need to emit the high order halfword
// first.
diff --git a/lib/Target/ARM/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index 2727ba8..2727ba8 100644
--- a/lib/Target/ARM/ARMMCExpr.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
diff --git a/lib/Target/ARM/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index 0a2e883..0a2e883 100644
--- a/lib/Target/ARM/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index f8fcf2b..a55c410 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -13,10 +13,16 @@
#include "ARMMCTargetDesc.h"
#include "ARMMCAsmInfo.h"
+#include "ARMBaseInfo.h"
+#include "InstPrinter/ARMInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_REGINFO_MC_DESC
#include "ARMGenRegisterInfo.inc"
@@ -35,7 +41,7 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
unsigned Len = TT.size();
unsigned Idx = 0;
- // FIXME: Enahnce Triple helper class to extract ARM version.
+ // FIXME: Enhance Triple helper class to extract ARM version.
bool isThumb = false;
if (Len >= 5 && TT.substr(0, 4) == "armv")
Idx = 4;
@@ -50,18 +56,21 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
unsigned SubVer = TT[Idx];
if (SubVer >= '7' && SubVer <= '9') {
if (Len >= Idx+2 && TT[Idx+1] == 'm') {
- // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv
- ARMArchFeature = "+v7,+noarm,+db,+hwdiv";
+ // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
+ ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
} else if (Len >= Idx+3 && TT[Idx+1] == 'e'&& TT[Idx+2] == 'm') {
// v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2,
- // FeatureT2XtPk
- ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,t2xtpk";
+ // FeatureT2XtPk, FeatureMClass
+      ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,+t2xtpk,+mclass";
} else
- // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2
- ARMArchFeature = "+v7,+neon,+db,+t2dsp";
+ // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+ ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk";
} else if (SubVer == '6') {
if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
ARMArchFeature = "+v6t2";
+ else if (Len >= Idx+2 && TT[Idx+1] == 'm')
+ // v6m: FeatureNoARM, FeatureMClass
+ ARMArchFeature = "+v6t2,+noarm,+mclass";
else
ARMArchFeature = "+v6";
} else if (SubVer == '5') {
@@ -80,6 +89,14 @@ std::string ARM_MC::ParseARMTriple(StringRef TT) {
ARMArchFeature += ",+thumb-mode";
}
+ Triple TheTriple(TT);
+ if (TheTriple.getOS() == Triple::NativeClient) {
+ if (ARMArchFeature.empty())
+ ARMArchFeature = "+nacl-mode";
+ else
+ ARMArchFeature += ",+nacl-mode";
+ }
+
return ARMArchFeature;
}
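Tracing the branches above gives the following triple-to-feature mapping; the strings are derived by reading the code as shown, so treat them as expected values rather than documented guarantees:

    #include <string>
    #include "ARMMCTargetDesc.h"

    // Expected results of ParseARMTriple for a few triples, per the logic
    // above (illustrative; derived from the code, not from a test suite):
    //   "armv7m-none-eabi"  -> "+v7,+noarm,+db,+hwdiv,+mclass"
    //   "armv6m-none-eabi"  -> "+v6t2,+noarm,+mclass"
    //   "thumbv7-apple-ios" -> "+v7,+neon,+db,+t2dsp,+t2xtpk,+thumb-mode"
    std::string Features = llvm::ARM_MC::ParseARMTriple("armv7m-none-eabi");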
@@ -98,36 +115,18 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-// Force static initialization.
-extern "C" void LLVMInitializeARMMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheARMTarget,
- ARM_MC::createARMMCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(TheThumbTarget,
- ARM_MC::createARMMCSubtargetInfo);
-}
-
static MCInstrInfo *createARMMCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitARMMCInstrInfo(X);
return X;
}
-extern "C" void LLVMInitializeARMMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheARMTarget, createARMMCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(TheThumbTarget, createARMMCInstrInfo);
-}
-
-static MCRegisterInfo *createARMMCRegisterInfo() {
+static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
MCRegisterInfo *X = new MCRegisterInfo();
- InitARMMCRegisterInfo(X);
+ InitARMMCRegisterInfo(X, ARM::LR);
return X;
}
-extern "C" void LLVMInitializeARMMCRegInfo() {
- TargetRegistry::RegisterMCRegInfo(TheARMTarget, createARMMCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(TheThumbTarget, createARMMCRegisterInfo);
-}
-
static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) {
Triple TheTriple(TT);
@@ -137,8 +136,128 @@ static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) {
return new ARMELFMCAsmInfo();
}
-extern "C" void LLVMInitializeARMMCAsmInfo() {
- // Register the target asm info.
+static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ if (RM == Reloc::Default) {
+ Triple TheTriple(TT);
+ // Default relocation model on Darwin is PIC, not DynamicNoPIC.
+ RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC;
+ }
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &MAB,
+ raw_ostream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll,
+ bool NoExecStack) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin())
+ return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+
+ if (TheTriple.isOSWindows()) {
+ llvm_unreachable("ARM does not support Windows COFF format");
+ return NULL;
+ }
+
+ return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createARMMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) {
+ if (SyntaxVariant == 0)
+ return new ARMInstPrinter(MAI, STI);
+ return 0;
+}
+
+namespace {
+
+class ARMMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ ARMMCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
+
+ virtual bool isUnconditionalBranch(const MCInst &Inst) const {
+ // BCCs with the "always" predicate are unconditional branches.
+ if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+ return true;
+ return MCInstrAnalysis::isUnconditionalBranch(Inst);
+ }
+
+ virtual bool isConditionalBranch(const MCInst &Inst) const {
+ // BCCs with the "always" predicate are unconditional branches.
+ if (Inst.getOpcode() == ARM::Bcc && Inst.getOperand(1).getImm()==ARMCC::AL)
+ return false;
+ return MCInstrAnalysis::isConditionalBranch(Inst);
+ }
+
+ uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size) const {
+ // We only handle PCRel branches for now.
+ if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+ return -1ULL;
+
+ int64_t Imm = Inst.getOperand(0).getImm();
+ // FIXME: This is not right for thumb.
+ return Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+ }
+};
+
+}
+
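evaluateBranch above leans on the ARM-mode PC bias: a PC-relative immediate is relative to the instruction address plus 8. A worked sketch with made-up addresses:

    #include <cassert>
    #include <cstdint>

    // Sketch of the ARM-mode branch target computation used above.
    static uint64_t evalARMBranch(uint64_t Addr, int64_t Imm) {
      return Addr + Imm + 8; // PC reads 8 bytes past the instruction in ARM mode.
    }

    int main() {
      assert(evalARMBranch(0x8000, 0x10) == 0x8018);
      assert(evalARMBranch(0x8000, -16) == 0x7ff8);
    }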
+static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
+ return new ARMMCInstrAnalysis(Info);
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMTargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfoFn A(TheARMTarget, createARMMCAsmInfo);
RegisterMCAsmInfoFn B(TheThumbTarget, createARMMCAsmInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheARMTarget, createARMMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheThumbTarget, createARMMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheARMTarget, createARMMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheThumbTarget, createARMMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheARMTarget, createARMMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheThumbTarget, createARMMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheARMTarget,
+ ARM_MC::createARMMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheThumbTarget,
+ ARM_MC::createARMMCSubtargetInfo);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(TheARMTarget,
+ createARMMCInstrAnalysis);
+ TargetRegistry::RegisterMCInstrAnalysis(TheThumbTarget,
+ createARMMCInstrAnalysis);
+
+ // Register the MC Code Emitter
+ TargetRegistry::RegisterMCCodeEmitter(TheARMTarget, createARMMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheThumbTarget, createARMMCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(TheARMTarget, createARMAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheThumbTarget, createARMAsmBackend);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
}
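Once LLVMInitializeARMTargetMC has run, clients reach these factories through the registry rather than calling them directly. A minimal sketch, assuming the LLVM 3.0-era TargetRegistry API:

    #include <string>
    #include "llvm/MC/MCInstrInfo.h"
    #include "llvm/Support/TargetRegistry.h"

    const llvm::MCInstrInfo *getARMInstrInfo() {
      std::string Err;
      const llvm::Target *T =
        llvm::TargetRegistry::lookupTarget("arm-unknown-linux-gnueabi", Err);
      if (!T)
        return 0; // ARM target not linked in / not initialized.
      // Dispatches to createARMMCInstrInfo registered above.
      return T->createMCInstrInfo();
    }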
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 74701e3..9b3d3bd 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -14,12 +14,19 @@
#ifndef ARMMCTARGETDESC_H
#define ARMMCTARGETDESC_H
+#include "llvm/Support/DataTypes.h"
#include <string>
namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
class MCSubtargetInfo;
-class Target;
class StringRef;
+class Target;
+class raw_ostream;
extern Target TheARMTarget, TheThumbTarget;
@@ -33,6 +40,18 @@ namespace ARM_MC {
StringRef FS);
}
+MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+
+MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT);
+
+/// createARMMachObjectWriter - Construct an ARM Mach-O object writer.
+MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
} // End llvm namespace
// Defines symbolic names for ARM registers. This defines a mapping from
diff --git a/lib/Target/ARM/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index a36e47d..352c73e 100644
--- a/lib/Target/ARM/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "ARM.h"
-#include "ARMFixupKinds.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "MCTargetDesc/ARMFixupKinds.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCAsmLayout.h"
@@ -19,7 +19,6 @@
#include "llvm/MC/MCValue.h"
#include "llvm/Object/MachOFormat.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetAsmBackend.h"
using namespace llvm;
using namespace llvm::object;
diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index 68daf42..adc37cb 100644
--- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -1,7 +1,19 @@
add_llvm_library(LLVMARMDesc
- ARMMCTargetDesc.cpp
+ ARMAsmBackend.cpp
ARMMCAsmInfo.cpp
+ ARMMCCodeEmitter.cpp
+ ARMMCExpr.cpp
+ ARMMCTargetDesc.cpp
+ ARMMachObjectWriter.cpp
)
+add_dependencies(LLVMARMDesc ARMCommonTableGen)
# Hack: we need to include 'main' target directory to grab private headers
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
+
+add_llvm_library_dependencies(LLVMARMDesc
+ LLVMARMInfo
+ LLVMARMAsmPrinter
+ LLVMMC
+ LLVMSupport
+ )
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
index eb8c603..3e48ed1 100644
--- a/lib/Target/ARM/Makefile
+++ b/lib/Target/ARM/Makefile
@@ -16,9 +16,8 @@ BUILT_SOURCES = ARMGenRegisterInfo.inc ARMGenInstrInfo.inc \
ARMGenAsmWriter.inc ARMGenAsmMatcher.inc \
ARMGenDAGISel.inc ARMGenSubtargetInfo.inc \
ARMGenCodeEmitter.inc ARMGenCallingConv.inc \
- ARMGenDecoderTables.inc ARMGenEDInfo.inc \
- ARMGenFastISel.inc ARMGenMCCodeEmitter.inc \
- ARMGenMCPseudoLowering.inc
+ ARMGenEDInfo.inc ARMGenFastISel.inc ARMGenMCCodeEmitter.inc \
+ ARMGenMCPseudoLowering.inc ARMGenDisassemblerTables.inc
DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp
deleted file mode 100644
index c85d1e9..0000000
--- a/lib/Target/ARM/NEONMoveFix.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-//===-- NEONMoveFix.cpp - Convert vfp reg-reg moves into neon ---*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "neon-mov-fix"
-#include "ARM.h"
-#include "ARMMachineFunctionInfo.h"
-#include "ARMInstrInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumVMovs, "Number of reg-reg moves converted");
-
-namespace {
- struct NEONMoveFixPass : public MachineFunctionPass {
- static char ID;
- NEONMoveFixPass() : MachineFunctionPass(ID) {}
-
- virtual bool runOnMachineFunction(MachineFunction &Fn);
-
- virtual const char *getPassName() const {
- return "NEON reg-reg move conversion";
- }
-
- private:
- const TargetRegisterInfo *TRI;
- const ARMBaseInstrInfo *TII;
- bool isA8;
-
- typedef DenseMap<unsigned, const MachineInstr*> RegMap;
-
- bool InsertMoves(MachineBasicBlock &MBB);
- };
- char NEONMoveFixPass::ID = 0;
-}
-
-static bool inNEONDomain(unsigned Domain, bool isA8) {
- return (Domain & ARMII::DomainNEON) ||
- (isA8 && (Domain & ARMII::DomainNEONA8));
-}
-
-bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
- RegMap Defs;
- bool Modified = false;
-
- // Walk over MBB tracking the def points of the registers.
- MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
- MachineBasicBlock::iterator NextMII;
- for (; MII != E; MII = NextMII) {
- NextMII = llvm::next(MII);
- MachineInstr *MI = &*MII;
-
- if (MI->getOpcode() == ARM::VMOVD &&
- !TII->isPredicated(MI)) {
- unsigned SrcReg = MI->getOperand(1).getReg();
- // If we do not find an instruction defining the reg, this means the
- // register should be live-in for this BB. It's always to better to use
-    // register should be live-in for this BB. It's always better to use
- unsigned Domain = ARMII::DomainNEON;
- RegMap::iterator DefMI = Defs.find(SrcReg);
- if (DefMI != Defs.end()) {
- Domain = DefMI->second->getDesc().TSFlags & ARMII::DomainMask;
- // Instructions in general domain are subreg accesses.
- // Map them to NEON reg-reg moves.
- if (Domain == ARMII::DomainGeneral)
- Domain = ARMII::DomainNEON;
- }
-
- if (inNEONDomain(Domain, isA8)) {
- // Convert VMOVD to VORRd
- unsigned DestReg = MI->getOperand(0).getReg();
-
- DEBUG({errs() << "vmov convert: "; MI->dump();});
-
- // It's safe to ignore imp-defs / imp-uses here, since:
-      //   - We're running late, no intelligent codegen passes should be run
- // afterwards
- // - The imp-defs / imp-uses are superregs only, we don't care about
- // them.
- AddDefaultPred(BuildMI(MBB, *MI, MI->getDebugLoc(),
- TII->get(ARM::VORRd), DestReg)
- .addReg(SrcReg).addReg(SrcReg));
- MBB.erase(MI);
- MachineBasicBlock::iterator I = prior(NextMII);
- MI = &*I;
-
- DEBUG({errs() << " into: "; MI->dump();});
-
- Modified = true;
- ++NumVMovs;
- } else {
- assert((Domain & ARMII::DomainVFP) && "Invalid domain!");
- // Do nothing.
- }
- }
-
- // Update def information.
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand& MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isDef())
- continue;
- unsigned MOReg = MO.getReg();
-
- Defs[MOReg] = MI;
- // Catch aliases as well.
- for (const unsigned *R = TRI->getAliasSet(MOReg); *R; ++R)
- Defs[*R] = MI;
- }
- }
-
- return Modified;
-}
-
-bool NEONMoveFixPass::runOnMachineFunction(MachineFunction &Fn) {
- ARMFunctionInfo *AFI = Fn.getInfo<ARMFunctionInfo>();
- const TargetMachine &TM = Fn.getTarget();
-
- if (AFI->isThumb1OnlyFunction())
- return false;
-
- TRI = TM.getRegisterInfo();
- TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
- isA8 = TM.getSubtarget<ARMSubtarget>().isCortexA8();
-
- bool Modified = false;
- for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
- ++MFI) {
- MachineBasicBlock &MBB = *MFI;
- Modified |= InsertMoves(MBB);
- }
-
- return Modified;
-}
-
-/// createNEONMoveFixPass - Returns an instance of the NEON reg-reg moves fix
-/// pass.
-FunctionPass *llvm::createNEONMoveFixPass() {
- return new NEONMoveFixPass();
-}
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
index 163a0a9..500e3de 100644
--- a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "ARM.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheARMTarget, llvm::TheThumbTarget;
diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt
index 3910bb0..8b38b13 100644
--- a/lib/Target/ARM/TargetInfo/CMakeLists.txt
+++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMARMInfo
ARMTargetInfo.cpp
)
-add_dependencies(LLVMARMInfo ARMCodeGenTable_gen)
+add_dependencies(LLVMARMInfo ARMCommonTableGen)
+
+add_llvm_library_dependencies(LLVMARMInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index c258870..d848177 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -21,7 +21,7 @@
using namespace llvm;
-bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+bool Thumb1FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const{
const MachineFrameInfo *FFI = MF.getFrameInfo();
unsigned CFSize = FFI->getMaxCallFrameSize();
// It's not always a good idea to include the call frame as part of the
@@ -133,9 +133,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
  // Adjust FP so it points to the stack slot that contains the previous FP.
if (hasFP(MF)) {
- BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
.addFrameIndex(FramePtrSpillFI).addImm(0)
- .setMIFlags(MachineInstr::FrameSetup);
+ .setMIFlags(MachineInstr::FrameSetup));
if (NumBytes > 508)
// If offset is > 508 then sp cannot be adjusted in a single instruction,
// try restoring from fp instead.
@@ -155,6 +155,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+ // Thumb1 does not currently support dynamic stack realignment. Report a
+  // fatal error rather than silently generating bad code.
+ if (RegInfo->needsStackRealignment(MF))
+ report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+
// If we need a base pointer, set it up here. It's whatever the value
// of the stack pointer is at this point. Any variable size objects
// will be allocated after this, so we can still use the base pointer
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 4eb0b6c..e8ed482 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -13,12 +13,12 @@
//===----------------------------------------------------------------------===//
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMBaseInstrInfo.h"
#include "ARMMachineFunctionInfo.h"
#include "ARMSubtarget.h"
#include "Thumb1InstrInfo.h"
#include "Thumb1RegisterInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -182,7 +181,6 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
int Opc = 0;
int ExtraOpc = 0;
bool NeedCC = false;
- bool NeedPred = false;
if (DestReg == BaseReg && BaseReg == ARM::SP) {
assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
@@ -217,7 +215,7 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
} else {
Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
NumBits = 8;
- NeedPred = NeedCC = true;
+ NeedCC = true;
}
isTwoAddr = true;
}
@@ -241,7 +239,8 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
Bytes -= ThisVal;
const MCInstrDesc &MCID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
const MachineInstrBuilder MIB =
- AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg).setMIFlags(MIFlags));
+ AddDefaultT1CC(BuildMI(MBB, MBBI, dl, MCID, DestReg)
+ .setMIFlags(MIFlags));
AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
} else {
AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
@@ -262,18 +261,15 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
if (NeedCC)
MIB = AddDefaultT1CC(MIB);
MIB.addReg(DestReg).addImm(ThisVal);
- if (NeedPred)
- MIB = AddDefaultPred(MIB);
+ MIB = AddDefaultPred(MIB);
MIB.setMIFlags(MIFlags);
- }
- else {
+ } else {
bool isKill = BaseReg != ARM::SP;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
if (NeedCC)
MIB = AddDefaultT1CC(MIB);
MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
- if (NeedPred)
- MIB = AddDefaultPred(MIB);
+ MIB = AddDefaultPred(MIB);
MIB.setMIFlags(MIFlags);
BaseReg = DestReg;
@@ -285,7 +281,7 @@ void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
Scale = 1;
Chunk = ((1 << NumBits) - 1) * Scale;
Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
- NeedPred = NeedCC = isTwoAddr = true;
+ NeedCC = isTwoAddr = true;
}
}
}
@@ -405,7 +401,6 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
unsigned Scale = 1;
if (FrameReg != ARM::SP) {
Opcode = ARM::tADDi3;
- MI.setDesc(TII.get(Opcode));
NumBits = 3;
} else {
NumBits = 8;
@@ -419,10 +414,9 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
// Turn it into a move.
MI.setDesc(TII.get(ARM::tMOVr));
MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
- // Remove offset and add predicate operands.
+    // Remove the offset operand.
MI.RemoveOperand(FrameRegIdx+1);
MachineInstrBuilder MIB(&MI);
- AddDefaultPred(MIB);
return true;
}
@@ -431,6 +425,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
if (((Offset / Scale) & ~Mask) == 0) {
// Replace the FrameIndex with sp / fp
if (Opcode == ARM::tADDi3) {
+ MI.setDesc(TII.get(Opcode));
removeOperands(MI, FrameRegIdx);
MachineInstrBuilder MIB(&MI);
AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
@@ -459,6 +454,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
// r0 = add sp, 255*4
// r0 = add r0, (imm - 255*4)
if (Opcode == ARM::tADDi3) {
+ MI.setDesc(TII.get(Opcode));
removeOperands(MI, FrameRegIdx);
MachineInstrBuilder MIB(&MI);
AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
@@ -479,10 +475,6 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
MI.setDesc(TII.get(ARM::tADDhirr));
MI.getOperand(FrameRegIdx).ChangeToRegister(DestReg, false, false, true);
MI.getOperand(FrameRegIdx+1).ChangeToRegister(FrameReg, false);
- if (Opcode == ARM::tADDi3) {
- MachineInstrBuilder MIB(&MI);
- AddDefaultPred(MIB);
- }
}
return true;
} else {
@@ -545,9 +537,9 @@ Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
++i;
assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
}
- bool Done = false;
- Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
+ bool Done = rewriteFrameIndex(MI, i, BaseReg, Off, TII);
assert (Done && "Unable to resolve frame index!");
+ (void)Done;
}
/// saveScavengerRegister - Spill the register so it can be used by the
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 360ec00..b627400 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -124,6 +124,27 @@ Thumb2ITBlockPass::MoveCopyOutOfITBlock(MachineInstr *MI,
if (Uses.count(DstReg) || Defs.count(SrcReg))
return false;
+ // If the CPSR is defined by this copy, then we don't want to move it. E.g.,
+ // if we have:
+ //
+ // movs r1, r1
+ // rsb r1, 0
+ // movs r2, r2
+ // rsb r2, 0
+ //
+ // we don't want this to be converted to:
+ //
+ // movs r1, r1
+ // movs r2, r2
+ // itt mi
+ // rsb r1, 0
+ // rsb r2, 0
+ //
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MCID.hasOptionalDef() &&
+ MI->getOperand(MCID.getNumOperands() - 1).getReg() == ARM::CPSR)
+ return false;
+
// Then peek at the next instruction to see if it's predicated on CC or OCC.
// If not, then there is nothing to be gained by moving the copy.
MachineBasicBlock::iterator I = MI; ++I;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index 51b56aa..cf040c82 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -14,9 +14,9 @@
#include "Thumb2InstrInfo.h"
#include "ARM.h"
#include "ARMConstantPoolValue.h"
-#include "ARMAddressingModes.h"
#include "ARMMachineFunctionInfo.h"
#include "Thumb2InstrInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -122,7 +122,8 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass ||
- RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass) {
+ RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass ||
+ RC == ARM::GPRnopcRegisterClass) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -149,7 +150,8 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
if (RC == ARM::GPRRegisterClass || RC == ARM::tGPRRegisterClass ||
- RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass) {
+ RC == ARM::tcGPRRegisterClass || RC == ARM::rGPRRegisterClass ||
+ RC == ARM::GPRnopcRegisterClass) {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -233,9 +235,8 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
- // FIXME: Fix Thumb1 immediate encoding.
- BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
- .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags);
+ AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+ .addReg(BaseReg).addImm(ThisVal/4).setMIFlags(MIFlags));
NumBytes = 0;
continue;
}
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index c741a6e..89a155c 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -9,11 +9,11 @@
#define DEBUG_TYPE "t2-reduce-size"
#include "ARM.h"
-#include "ARMAddressingModes.h"
#include "ARMBaseRegisterInfo.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
#include "Thumb2InstrInfo.h"
+#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -97,11 +97,11 @@ namespace {
{ ARM::t2SUBrr, ARM::tSUBrr, 0, 0, 0, 1, 0, 0,0, 0,0 },
{ ARM::t2SUBSri,ARM::tSUBi3, ARM::tSUBi8, 3, 8, 1, 1, 2,2, 0,0 },
{ ARM::t2SUBSrr,ARM::tSUBrr, 0, 0, 0, 1, 0, 2,0, 0,0 },
- { ARM::t2SXTBr, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,0 },
- { ARM::t2SXTHr, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,0 },
+ { ARM::t2SXTB, ARM::tSXTB, 0, 0, 0, 1, 0, 1,0, 0,1 },
+ { ARM::t2SXTH, ARM::tSXTH, 0, 0, 0, 1, 0, 1,0, 0,1 },
{ ARM::t2TSTrr, ARM::tTST, 0, 0, 0, 1, 0, 2,0, 0,0 },
- { ARM::t2UXTBr, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,0 },
- { ARM::t2UXTHr, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,0 },
+ { ARM::t2UXTB, ARM::tUXTB, 0, 0, 0, 1, 0, 1,0, 0,1 },
+ { ARM::t2UXTH, ARM::tUXTH, 0, 0, 0, 1, 0, 1,0, 0,1 },
// FIXME: Clean this up after splitting each Thumb load / store opcode
// into multiple ones.
@@ -507,6 +507,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
.addOperand(MI->getOperand(0))
.addOperand(MI->getOperand(1))
.addImm(Imm / 4); // The tADDrSPi has an implied scale by four.
+ AddDefaultPred(MIB);
// Transfer MI flags.
MIB.setMIFlags(MI->getFlags());
@@ -546,6 +547,10 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
}
case ARM::t2RSBri:
case ARM::t2RSBSri:
+ case ARM::t2SXTB:
+ case ARM::t2SXTH:
+ case ARM::t2UXTB:
+ case ARM::t2UXTH:
if (MI->getOperand(2).getImm() == 0)
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef);
break;
@@ -742,7 +747,11 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
if (i < NumOps && MCID.OpInfo[i].isOptionalDef())
continue;
if ((MCID.getOpcode() == ARM::t2RSBSri ||
- MCID.getOpcode() == ARM::t2RSBri) && i == 2)
+ MCID.getOpcode() == ARM::t2RSBri ||
+ MCID.getOpcode() == ARM::t2SXTB ||
+ MCID.getOpcode() == ARM::t2SXTH ||
+ MCID.getOpcode() == ARM::t2UXTB ||
+ MCID.getOpcode() == ARM::t2UXTH) && i == 2)
// Skip the zero immediate operand, it's now implicit.
continue;
bool isPred = (i < NumOps && MCID.OpInfo[i].isPredicate());
diff --git a/lib/Target/Alpha/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AlphaAsmPrinter.cpp
index 46ae286..5dce06a 100644
--- a/lib/Target/Alpha/AlphaAsmPrinter.cpp
+++ b/lib/Target/Alpha/AlphaAsmPrinter.cpp
@@ -26,8 +26,8 @@
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
index 7b91fea..f877c65 100644
--- a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
+++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
@@ -80,7 +80,7 @@ namespace {
        // Otherwise we don't know that it's okay to zapnot this entire
// byte. Only do this iff we can prove that the missing bits are
// already null, so the bytezap doesn't need to really null them.
- BitsToCheck |= ~Constant & (0xFF << 8*i);
+ BitsToCheck |= ~Constant & (0xFFULL << 8*i);
}
}
}
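The new ULL suffix above is the substance of the change: 0xFF is a plain int, so for larger i the shift count meets or exceeds the width of int, which is undefined behavior; widening to 64 bits first keeps the mask well defined. A minimal illustration:

    #include <cassert>
    #include <cstdint>

    int main() {
      unsigned i = 7;
      // 0xFF << 8*i would shift a 32-bit int by 56 bits: undefined behavior.
      uint64_t Mask = 0xFFULL << (8 * i); // Widen to 64 bits before shifting.
      assert(Mask == 0xFF00000000000000ULL);
    }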
@@ -114,9 +114,8 @@ namespace {
if (!x) return 0;
unsigned at = CountLeadingZeros_64(x);
uint64_t complow = 1ULL << (63 - at);
- uint64_t comphigh = 1ULL << (64 - at);
- //cerr << x << ":" << complow << ":" << comphigh << "\n";
- if (abs64(complow - x) <= abs64(comphigh - x))
+ uint64_t comphigh = complow << 1;
+ if (x - complow <= comphigh - x)
return complow;
else
return comphigh;
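The rewrite above computes comphigh as complow << 1 (avoiding a shift by 64 when the top bit is set) and drops abs64, since complow <= x <= comphigh makes both differences nonnegative. A self-contained sketch of the same computation; the clz helper is an illustrative stand-in for CountLeadingZeros_64:

    #include <cassert>
    #include <cstdint>

    // Illustrative stand-in for CountLeadingZeros_64.
    static unsigned clz64(uint64_t x) {
      unsigned n = 0;
      for (uint64_t bit = 1ULL << 63; bit && !(x & bit); bit >>= 1)
        ++n;
      return n;
    }

    // Round x (nonzero) to whichever neighboring power of two is nearer.
    static uint64_t nearestPow2(uint64_t x) {
      uint64_t complow = 1ULL << (63 - clz64(x)); // Highest power of two <= x.
      uint64_t comphigh = complow << 1;           // The next one up.
      return (x - complow <= comphigh - x) ? complow : comphigh;
    }

    int main() {
      assert(nearestPow2(100) == 128); // 100-64 = 36 > 128-100 = 28
      assert(nearestPow2(68) == 64);   // 68-64 = 4 <= 128-68 = 60
    }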
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
index de003fb..3057eb8 100644
--- a/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -49,6 +49,7 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
// Set up the TargetLowering object.
  // I am having problems with shr n i8 1
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
addRegisterClass(MVT::f64, Alpha::F8RCRegisterClass);
@@ -153,6 +154,9 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
setStackPointerRegisterToSaveRestore(Alpha::R30);
setJumpBufSize(272);
@@ -160,10 +164,12 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
setMinFunctionAlignment(4);
+ setInsertFencesForAtomic(true);
+
computeRegisterProperties();
}
-MVT::SimpleValueType AlphaTargetLowering::getSetCCResultType(EVT VT) const {
+EVT AlphaTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i64;
}
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
index 13383f4..80f8efa 100644
--- a/lib/Target/Alpha/AlphaISelLowering.h
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -66,7 +66,7 @@ namespace llvm {
virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i64; }
/// getSetCCResultType - Get the SETCC result ValueType
- virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(EVT VT) const;
/// LowerOperation - Provide custom lowering hooks for some operations.
///
diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp
index 4dcec8f..8df2ed7 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.cpp
+++ b/lib/Target/Alpha/AlphaInstrInfo.cpp
@@ -16,7 +16,6 @@
#include "AlphaMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
index b201712..c8c9377 100644
--- a/lib/Target/Alpha/AlphaInstrInfo.td
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -607,6 +607,8 @@ def : Pat<(membarrier (i64 imm), (i64 imm), (i64 imm), (i64 1), (i64 imm)),
def : Pat<(membarrier (i64 imm), (i64 imm), (i64 imm), (i64 imm), (i64 imm)),
(MB)>;
+def : Pat<(atomic_fence (imm), (imm)), (MB)>;
+
//Basic Floating point ops
//Floats
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
index df8f157..8b6230f 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.cpp
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -21,7 +21,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -40,8 +39,7 @@
using namespace llvm;
AlphaRegisterInfo::AlphaRegisterInfo(const TargetInstrInfo &tii)
- : AlphaGenRegisterInfo(),
- TII(tii) {
+ : AlphaGenRegisterInfo(Alpha::R26), TII(tii) {
}
static long getUpper16(long l) {
@@ -178,10 +176,6 @@ AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-unsigned AlphaRegisterInfo::getRARegister() const {
- return Alpha::R26;
-}
-
unsigned AlphaRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -198,16 +192,6 @@ unsigned AlphaRegisterInfo::getEHHandlerRegister() const {
return 0;
}
-int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- llvm_unreachable("What is the dwarf register number");
- return -1;
-}
-
-int AlphaRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum, bool isEH) const {
- llvm_unreachable("What is the dwarf register number");
- return -1;
-}
-
std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
{
std::string s(AlphaRegDesc[reg].Name);
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
index 1072bf7..e35be27 100644
--- a/lib/Target/Alpha/AlphaRegisterInfo.h
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -42,16 +42,12 @@ struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
int SPAdj, RegScavenger *RS = NULL) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
static std::string getPrettyName(unsigned reg);
};
diff --git a/lib/Target/Alpha/AlphaSubtarget.cpp b/lib/Target/Alpha/AlphaSubtarget.cpp
index 624a5e2..bd55ce9 100644
--- a/lib/Target/Alpha/AlphaSubtarget.cpp
+++ b/lib/Target/Alpha/AlphaSubtarget.cpp
@@ -13,7 +13,6 @@
#include "AlphaSubtarget.h"
#include "Alpha.h"
-#include "llvm/Target/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/Alpha/AlphaTargetMachine.cpp b/lib/Target/Alpha/AlphaTargetMachine.cpp
index 3b65d41..fc9a677 100644
--- a/lib/Target/Alpha/AlphaTargetMachine.cpp
+++ b/lib/Target/Alpha/AlphaTargetMachine.cpp
@@ -14,7 +14,7 @@
#include "AlphaTargetMachine.h"
#include "llvm/PassManager.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
extern "C" void LLVMInitializeAlphaTarget() {
@@ -22,19 +22,17 @@ extern "C" void LLVMInitializeAlphaTarget() {
RegisterTargetMachine<AlphaTargetMachine> X(TheAlphaTarget);
}
-AlphaTargetMachine::AlphaTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : LLVMTargetMachine(T, TT, CPU, FS),
+AlphaTargetMachine::AlphaTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
DataLayout("e-f128:128:128-n64"),
FrameLowering(Subtarget),
Subtarget(TT, CPU, FS),
TLInfo(*this),
TSInfo(*this) {
- setRelocationModel(Reloc::PIC_);
}
-
//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Alpha/AlphaTargetMachine.h b/lib/Target/Alpha/AlphaTargetMachine.h
index cf00e58..48bb948 100644
--- a/lib/Target/Alpha/AlphaTargetMachine.h
+++ b/lib/Target/Alpha/AlphaTargetMachine.h
@@ -36,8 +36,9 @@ class AlphaTargetMachine : public LLVMTargetMachine {
AlphaSelectionDAGInfo TSInfo;
public:
- AlphaTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ AlphaTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const AlphaInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetFrameLowering *getFrameLowering() const {
diff --git a/lib/Target/Alpha/CMakeLists.txt b/lib/Target/Alpha/CMakeLists.txt
index a6027bb..a6d5516 100644
--- a/lib/Target/Alpha/CMakeLists.txt
+++ b/lib/Target/Alpha/CMakeLists.txt
@@ -1,11 +1,12 @@
set(LLVM_TARGET_DEFINITIONS Alpha.td)
-tablegen(AlphaGenRegisterInfo.inc -gen-register-info)
-tablegen(AlphaGenInstrInfo.inc -gen-instr-info)
-tablegen(AlphaGenAsmWriter.inc -gen-asm-writer)
-tablegen(AlphaGenDAGISel.inc -gen-dag-isel)
-tablegen(AlphaGenCallingConv.inc -gen-callingconv)
-tablegen(AlphaGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(AlphaGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(AlphaGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(AlphaGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(AlphaGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(AlphaGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(AlphaGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(AlphaCommonTableGen)
add_llvm_target(AlphaCodeGen
AlphaAsmPrinter.cpp
@@ -21,5 +22,17 @@ add_llvm_target(AlphaCodeGen
AlphaSelectionDAGInfo.cpp
)
+add_llvm_library_dependencies(LLVMAlphaCodeGen
+ LLVMAlphaDesc
+ LLVMAlphaInfo
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp b/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp
index 562052b..4ad021c 100644
--- a/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp
+++ b/lib/Target/Alpha/MCTargetDesc/AlphaMCTargetDesc.cpp
@@ -13,10 +13,11 @@
#include "AlphaMCTargetDesc.h"
#include "AlphaMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "AlphaGenInstrInfo.inc"
@@ -36,8 +37,10 @@ static MCInstrInfo *createAlphaMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeAlphaMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheAlphaTarget, createAlphaMCInstrInfo);
+static MCRegisterInfo *createAlphaMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitAlphaMCRegisterInfo(X, Alpha::R26);
+ return X;
}
static MCSubtargetInfo *createAlphaMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -47,11 +50,29 @@ static MCSubtargetInfo *createAlphaMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializeAlphaMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheAlphaTarget,
- createAlphaMCSubtargetInfo);
+static MCCodeGenInfo *createAlphaMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(Reloc::PIC_, CM);
+ return X;
}
-extern "C" void LLVMInitializeAlphaMCAsmInfo() {
+// Force static initialization.
+extern "C" void LLVMInitializeAlphaTargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfo<AlphaMCAsmInfo> X(TheAlphaTarget);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheAlphaTarget,
+ createAlphaMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheAlphaTarget, createAlphaMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheAlphaTarget, createAlphaMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheAlphaTarget,
+ createAlphaMCSubtargetInfo);
}
diff --git a/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt b/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt
index ad0dd26..f745ecb 100644
--- a/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Alpha/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,10 @@ add_llvm_library(LLVMAlphaDesc
AlphaMCTargetDesc.cpp
AlphaMCAsmInfo.cpp
)
+
+add_llvm_library_dependencies(LLVMAlphaDesc
+ LLVMAlphaInfo
+ LLVMMC
+ )
+
+add_dependencies(LLVMAlphaDesc AlphaCommonTableGen)
diff --git a/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
index f7099b9..bdc69e7 100644
--- a/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
+++ b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "Alpha.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
llvm::Target llvm::TheAlphaTarget;
diff --git a/lib/Target/Alpha/TargetInfo/CMakeLists.txt b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
index 2a7291b..cac3178 100644
--- a/lib/Target/Alpha/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMAlphaInfo
AlphaTargetInfo.cpp
)
-add_dependencies(LLVMAlphaInfo AlphaCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMAlphaInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMAlphaInfo AlphaCommonTableGen)
diff --git a/lib/Target/Blackfin/BlackfinAsmPrinter.cpp b/lib/Target/Blackfin/BlackfinAsmPrinter.cpp
index 6ba258b..ed9844e 100644
--- a/lib/Target/Blackfin/BlackfinAsmPrinter.cpp
+++ b/lib/Target/Blackfin/BlackfinAsmPrinter.cpp
@@ -29,9 +29,9 @@
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/Blackfin/BlackfinFrameLowering.h b/lib/Target/Blackfin/BlackfinFrameLowering.h
index 726fa2c..169aa8e 100644
--- a/lib/Target/Blackfin/BlackfinFrameLowering.h
+++ b/lib/Target/Blackfin/BlackfinFrameLowering.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef ALPHA_FRAMEINFO_H
-#define ALPHA_FRAMEINFO_H
+#ifndef BLACKFIN_FRAMEINFO_H
+#define BLACKFIN_FRAMEINFO_H
#include "Blackfin.h"
#include "BlackfinSubtarget.h"
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
index d572832..7d4c45f 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.cpp
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -42,6 +42,7 @@ using namespace llvm;
BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
: TargetLowering(TM, new TargetLoweringObjectFileELF()) {
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
setStackPointerRegisterToSaveRestore(BF::SP);
setIntDivIsCheap(false);
@@ -99,6 +100,7 @@ BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
// Blackfin has no intrinsics for these particular operations.
setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
@@ -134,7 +136,7 @@ const char *BlackfinTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-MVT::SimpleValueType BlackfinTargetLowering::getSetCCResultType(EVT VT) const {
+EVT BlackfinTargetLowering::getSetCCResultType(EVT VT) const {
// SETCC always sets the CC register. Technically that is an i1 register, but
// that type is not legal, so we treat it as an i32 register.
return MVT::i32;
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h
index b65775b..90908ba 100644
--- a/lib/Target/Blackfin/BlackfinISelLowering.h
+++ b/lib/Target/Blackfin/BlackfinISelLowering.h
@@ -33,7 +33,7 @@ namespace llvm {
public:
BlackfinTargetLowering(TargetMachine &TM);
virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i16; }
- virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(EVT VT) const;
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
virtual void ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
index d190ae7..c06a919 100644
--- a/lib/Target/Blackfin/BlackfinInstrInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
@@ -16,10 +16,10 @@
#include "Blackfin.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_CTOR
#include "BlackfinGenInstrInfo.inc"
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
index ae8ee9e..7135676 100644
--- a/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
@@ -34,7 +34,7 @@ namespace bfinIntrinsic {
}
-std::string BlackfinIntrinsicInfo::getName(unsigned IntrID, const Type **Tys,
+std::string BlackfinIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
unsigned numTys) const {
static const char *const names[] = {
#define GET_INTRINSIC_NAME_TABLE
@@ -81,8 +81,8 @@ bool BlackfinIntrinsicInfo::isOverloaded(unsigned IntrID) const {
#include "BlackfinGenIntrinsics.inc"
#undef GET_INTRINSIC_ATTRIBUTES
-static const FunctionType *getType(LLVMContext &Context, unsigned id) {
- const Type *ResultTy = NULL;
+static FunctionType *getType(LLVMContext &Context, unsigned id) {
+ Type *ResultTy = NULL;
std::vector<Type*> ArgTys;
bool IsVarArg = false;
@@ -94,7 +94,7 @@ static const FunctionType *getType(LLVMContext &Context, unsigned id) {
}
Function *BlackfinIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- const Type **Tys,
+ Type **Tys,
unsigned numTy) const {
assert(!isOverloaded(IntrID) && "Blackfin intrinsics are not overloaded");
AttrListPtr AList = getAttributes((bfinIntrinsic::ID) IntrID);
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.h b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
index 7c4b5a9..f05db5a 100644
--- a/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
@@ -19,11 +19,11 @@ namespace llvm {
class BlackfinIntrinsicInfo : public TargetIntrinsicInfo {
public:
- std::string getName(unsigned IntrID, const Type **Tys = 0,
+ std::string getName(unsigned IntrID, Type **Tys = 0,
unsigned numTys = 0) const;
unsigned lookupName(const char *Name, unsigned Len) const;
bool isOverloaded(unsigned IID) const;
- Function *getDeclaration(Module *M, unsigned ID, const Type **Tys = 0,
+ Function *getDeclaration(Module *M, unsigned ID, Type **Tys = 0,
unsigned numTys = 0) const;
};
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
index 3a7c104..0d415c5 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -37,7 +36,7 @@ using namespace llvm;
BlackfinRegisterInfo::BlackfinRegisterInfo(BlackfinSubtarget &st,
const TargetInstrInfo &tii)
- : BlackfinGenRegisterInfo(), Subtarget(st), TII(tii) {}
+ : BlackfinGenRegisterInfo(BF::RETS), Subtarget(st), TII(tii) {}
const unsigned*
BlackfinRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
@@ -327,10 +326,6 @@ BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-unsigned BlackfinRegisterInfo::getRARegister() const {
- return BF::RETS;
-}
-
unsigned
BlackfinRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -347,14 +342,3 @@ unsigned BlackfinRegisterInfo::getEHHandlerRegister() const {
llvm_unreachable("What is the exception handler register");
return 0;
}
-
-int BlackfinRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- llvm_unreachable("What is the dwarf register number");
- return -1;
-}
-
-int BlackfinRegisterInfo::getLLVMRegNum(unsigned DwarfRegNum,
- bool isEH) const {
- llvm_unreachable("What is the dwarf register number");
- return -1;
-}
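
These deletions pair with the BlackfinGenRegisterInfo(BF::RETS) constructor change earlier in the file: the return-address register and the DWARF register mappings now live in the TableGen'd MCRegisterInfo base class, so the hand-written overrides are redundant. A sketch of what callers get for free, assuming the generated base from this commit:

    BlackfinGenRegisterInfo RI(BF::RETS); // RA register recorded at construction
    unsigned RA = RI.getRARegister();     // answered by MCRegisterInfo now
    int Dwarf = RI.getDwarfRegNum(RA, /*isEH=*/false); // generated mapping table
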
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h
index 86f45c1..6ac22af 100644
--- a/lib/Target/Blackfin/BlackfinRegisterInfo.h
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h
@@ -53,15 +53,11 @@ namespace llvm {
int SPAdj, RegScavenger *RS = NULL) const;
unsigned getFrameRegister(const MachineFunction &MF) const;
- unsigned getRARegister() const;
// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
// Utility functions
void adjustRegister(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
diff --git a/lib/Target/Blackfin/BlackfinSubtarget.cpp b/lib/Target/Blackfin/BlackfinSubtarget.cpp
index ec919cd..0bdce09 100644
--- a/lib/Target/Blackfin/BlackfinSubtarget.cpp
+++ b/lib/Target/Blackfin/BlackfinSubtarget.cpp
@@ -13,7 +13,7 @@
#include "BlackfinSubtarget.h"
#include "Blackfin.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.cpp b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
index a1c9f1c..a4ae46b 100644
--- a/lib/Target/Blackfin/BlackfinTargetMachine.cpp
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
@@ -13,7 +13,7 @@
#include "BlackfinTargetMachine.h"
#include "Blackfin.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -22,10 +22,12 @@ extern "C" void LLVMInitializeBlackfinTarget() {
}
BlackfinTargetMachine::BlackfinTargetMachine(const Target &T,
- const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : LLVMTargetMachine(T, TT, CPU, FS),
+ StringRef TT,
+ StringRef CPU,
+ StringRef FS,
+ Reloc::Model RM,
+ CodeModel::Model CM)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
DataLayout("e-p:32:32-i64:32-f64:32-n32"),
Subtarget(TT, CPU, FS),
TLInfo(*this),
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.h b/lib/Target/Blackfin/BlackfinTargetMachine.h
index bd7dc84..c85337fe2 100644
--- a/lib/Target/Blackfin/BlackfinTargetMachine.h
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.h
@@ -35,8 +35,9 @@ namespace llvm {
BlackfinFrameLowering FrameLowering;
BlackfinIntrinsicInfo IntrinsicInfo;
public:
- BlackfinTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ BlackfinTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const BlackfinInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetFrameLowering *getFrameLowering() const {
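
The constructor rewrite above is the 3.0 pattern applied to every target in this import: triples and feature strings travel as StringRef, and the relocation and code models are supplied at construction rather than through a later setRelocationModel() call. A hedged construction sketch; the triple string is illustrative:

    std::string Error;
    const Target *T = TargetRegistry::lookupTarget("bfin-unknown-elf", Error);
    if (T) {
      TargetMachine *TM =
          T->createTargetMachine("bfin-unknown-elf", /*CPU=*/"", /*Features=*/"",
                                 Reloc::Static, CodeModel::Small);
      // TM carries its reloc/code model from birth; nothing to set afterwards.
    }
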
diff --git a/lib/Target/Blackfin/CMakeLists.txt b/lib/Target/Blackfin/CMakeLists.txt
index d3f33a9..94d05fb 100644
--- a/lib/Target/Blackfin/CMakeLists.txt
+++ b/lib/Target/Blackfin/CMakeLists.txt
@@ -1,12 +1,13 @@
set(LLVM_TARGET_DEFINITIONS Blackfin.td)
-tablegen(BlackfinGenRegisterInfo.inc -gen-register-info)
-tablegen(BlackfinGenInstrInfo.inc -gen-instr-info)
-tablegen(BlackfinGenAsmWriter.inc -gen-asm-writer)
-tablegen(BlackfinGenDAGISel.inc -gen-dag-isel)
-tablegen(BlackfinGenSubtargetInfo.inc -gen-subtarget)
-tablegen(BlackfinGenCallingConv.inc -gen-callingconv)
-tablegen(BlackfinGenIntrinsics.inc -gen-tgt-intrinsic)
+llvm_tablegen(BlackfinGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(BlackfinGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(BlackfinGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(BlackfinGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(BlackfinGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(BlackfinGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(BlackfinGenIntrinsics.inc -gen-tgt-intrinsic)
+add_public_tablegen_target(BlackfinCommonTableGen)
add_llvm_target(BlackfinCodeGen
BlackfinAsmPrinter.cpp
@@ -21,5 +22,17 @@ add_llvm_target(BlackfinCodeGen
BlackfinSelectionDAGInfo.cpp
)
+add_llvm_library_dependencies(LLVMBlackfinCodeGen
+ LLVMAsmPrinter
+ LLVMBlackfinDesc
+ LLVMBlackfinInfo
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp b/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp
index 0fa1471..272e3c2 100644
--- a/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp
+++ b/lib/Target/Blackfin/MCTargetDesc/BlackfinMCTargetDesc.cpp
@@ -13,10 +13,11 @@
#include "BlackfinMCTargetDesc.h"
#include "BlackfinMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "BlackfinGenInstrInfo.inc"
@@ -36,12 +37,12 @@ static MCInstrInfo *createBlackfinMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeBlackfinMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheBlackfinTarget,
- createBlackfinMCInstrInfo);
+static MCRegisterInfo *createBlackfinMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitBlackfinMCRegisterInfo(X, BF::RETS);
+ return X;
}
-
static MCSubtargetInfo *createBlackfinMCSubtargetInfo(StringRef TT,
StringRef CPU,
StringRef FS) {
@@ -50,11 +51,31 @@ static MCSubtargetInfo *createBlackfinMCSubtargetInfo(StringRef TT,
return X;
}
-extern "C" void LLVMInitializeBlackfinMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheBlackfinTarget,
- createBlackfinMCSubtargetInfo);
+static MCCodeGenInfo *createBlackfinMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
}
-extern "C" void LLVMInitializeBlackfinMCAsmInfo() {
+// Force static initialization.
+extern "C" void LLVMInitializeBlackfinTargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfo<BlackfinMCAsmInfo> X(TheBlackfinTarget);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheBlackfinTarget,
+ createBlackfinMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheBlackfinTarget,
+ createBlackfinMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheBlackfinTarget,
+ createBlackfinMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheBlackfinTarget,
+ createBlackfinMCSubtargetInfo);
}
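
This hunk is the recurring MC-registration cleanup of the 3.0 cycle: the separate LLVMInitialize*MCInstrInfo, *MCSubtargetInfo, and *MCAsmInfo entry points collapse into a single LLVMInitialize<Target>TargetMC that registers every MC-level factory. Client code now makes one call per target:

    extern "C" void LLVMInitializeBlackfinTargetMC();

    void initBlackfinMC() {
      // One call registers asm info, codegen info, instruction info,
      // register info, and subtarget info (see the body above).
      LLVMInitializeBlackfinTargetMC();
    }
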
diff --git a/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt b/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt
index 8cd924f..73315d8 100644
--- a/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Blackfin/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,10 @@ add_llvm_library(LLVMBlackfinDesc
BlackfinMCTargetDesc.cpp
BlackfinMCAsmInfo.cpp
)
+
+add_llvm_library_dependencies(LLVMBlackfinDesc
+ LLVMBlackfinInfo
+ LLVMMC
+ )
+
+add_dependencies(LLVMBlackfinDesc BlackfinCommonTableGen)
diff --git a/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
index 402e0af..57f1d3e 100644
--- a/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
+++ b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "Blackfin.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/Blackfin/TargetInfo/CMakeLists.txt b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
index 5ca8060..771f092 100644
--- a/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMBlackfinInfo
BlackfinTargetInfo.cpp
)
-add_dependencies(LLVMBlackfinInfo BlackfinCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMBlackfinInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMBlackfinInfo BlackfinCommonTableGen)
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
index 415beb1..69d8c46 100644
--- a/lib/Target/CBackend/CBackend.cpp
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -37,10 +37,11 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/ErrorHandling.h"
@@ -48,6 +49,7 @@
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/Host.h"
#include "llvm/Config/config.h"
#include <algorithm>
@@ -62,12 +64,6 @@ extern "C" void LLVMInitializeCBackendTarget() {
RegisterTargetMachine<CTargetMachine> X(TheCBackendTarget);
}
-extern "C" void LLVMInitializeCBackendMCAsmInfo() {}
-
-extern "C" void LLVMInitializeCBackendMCInstrInfo() {}
-
-extern "C" void LLVMInitializeCBackendMCSubtargetInfo() {}
-
namespace {
class CBEMCAsmInfo : public MCAsmInfo {
public:
@@ -86,6 +82,8 @@ namespace {
LoopInfo *LI;
const Module *TheModule;
const MCAsmInfo* TAsm;
+ const MCRegisterInfo *MRI;
+ const MCObjectFileInfo *MOFI;
MCContext *TCtx;
const TargetData* TD;
@@ -99,14 +97,14 @@ namespace {
/// UnnamedStructIDs - This contains a unique ID for each struct that is
/// either anonymous or has no name.
- DenseMap<const StructType*, unsigned> UnnamedStructIDs;
+ DenseMap<StructType*, unsigned> UnnamedStructIDs;
public:
static char ID;
explicit CWriter(formatted_raw_ostream &o)
: FunctionPass(ID), Out(o), IL(0), Mang(0), LI(0),
- TheModule(0), TAsm(0), TCtx(0), TD(0), OpaqueCounter(0),
- NextAnonValueNumber(0) {
+ TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
+ OpaqueCounter(0), NextAnonValueNumber(0) {
initializeLoopInfoPass(*PassRegistry::getPassRegistry());
FPCounter = 0;
}
@@ -145,6 +143,8 @@ namespace {
delete Mang;
delete TCtx;
delete TAsm;
+ delete MRI;
+ delete MOFI;
FPConstantMap.clear();
ByValParams.clear();
intrinsicPrototypesAlreadyGenerated.clear();
@@ -152,20 +152,20 @@ namespace {
return false;
}
- raw_ostream &printType(raw_ostream &Out, const Type *Ty,
+ raw_ostream &printType(raw_ostream &Out, Type *Ty,
bool isSigned = false,
const std::string &VariableName = "",
bool IgnoreName = false,
const AttrListPtr &PAL = AttrListPtr());
- raw_ostream &printSimpleType(raw_ostream &Out, const Type *Ty,
+ raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty,
bool isSigned,
const std::string &NameSoFar = "");
void printStructReturnPointerFunctionType(raw_ostream &Out,
const AttrListPtr &PAL,
- const PointerType *Ty);
+ PointerType *Ty);
- std::string getStructName(const StructType *ST);
+ std::string getStructName(StructType *ST);
/// writeOperandDeref - Print the result of dereferencing the specified
/// operand with '*'. This is equivalent to printing '*' then using
@@ -188,7 +188,7 @@ namespace {
void writeOperandWithCast(Value* Operand, const ICmpInst &I);
bool writeInstructionCast(const Instruction &I);
- void writeMemoryAccess(Value *Operand, const Type *OperandType,
+ void writeMemoryAccess(Value *Operand, Type *OperandType,
bool IsVolatile, unsigned Alignment);
private :
@@ -200,7 +200,7 @@ namespace {
void printIntrinsicDefinition(const Function &F, raw_ostream &Out);
void printModuleTypes();
- void printContainedStructs(const Type *Ty, SmallPtrSet<const Type *, 16> &);
+ void printContainedStructs(Type *Ty, SmallPtrSet<Type *, 16> &);
void printFloatingPointConstants(Function &F);
void printFloatingPointConstants(const Constant *C);
void printFunctionSignature(const Function *F, bool Prototype);
@@ -209,7 +209,7 @@ namespace {
void printBasicBlock(BasicBlock *BB);
void printLoop(Loop *L);
- void printCast(unsigned opcode, const Type *SrcTy, const Type *DstTy);
+ void printCast(unsigned opcode, Type *SrcTy, Type *DstTy);
void printConstant(Constant *CPV, bool Static);
void printConstantWithCast(Constant *CPV, unsigned Opcode);
bool printConstExprCast(const ConstantExpr *CE, bool Static);
@@ -288,10 +288,12 @@ namespace {
void visitInvokeInst(InvokeInst &I) {
llvm_unreachable("Lowerinvoke pass didn't work!");
}
-
void visitUnwindInst(UnwindInst &I) {
llvm_unreachable("Lowerinvoke pass didn't work!");
}
+ void visitResumeInst(ResumeInst &I) {
+ llvm_unreachable("DwarfEHPrepare pass didn't work!");
+ }
void visitUnreachableInst(UnreachableInst &I);
void visitPHINode(PHINode &I);
@@ -360,8 +362,8 @@ static std::string CBEMangle(const std::string &S) {
return Result;
}
-std::string CWriter::getStructName(const StructType *ST) {
- if (!ST->isAnonymous() && !ST->getName().empty())
+std::string CWriter::getStructName(StructType *ST) {
+ if (!ST->isLiteral() && !ST->getName().empty())
return CBEMangle("l_"+ST->getName().str());
return "l_unnamed_" + utostr(UnnamedStructIDs[ST]);
@@ -373,20 +375,20 @@ std::string CWriter::getStructName(const StructType *ST) {
/// print it as "Struct (*)(...)", for struct return functions.
void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out,
const AttrListPtr &PAL,
- const PointerType *TheTy) {
- const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
+ PointerType *TheTy) {
+ FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
std::string tstr;
raw_string_ostream FunctionInnards(tstr);
FunctionInnards << " (*) (";
bool PrintedType = false;
FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end();
- const Type *RetTy = cast<PointerType>(*I)->getElementType();
+ Type *RetTy = cast<PointerType>(*I)->getElementType();
unsigned Idx = 1;
for (++I, ++Idx; I != E; ++I, ++Idx) {
if (PrintedType)
FunctionInnards << ", ";
- const Type *ArgTy = *I;
+ Type *ArgTy = *I;
if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
assert(ArgTy->isPointerTy());
ArgTy = cast<PointerType>(ArgTy)->getElementType();
@@ -408,7 +410,7 @@ void CWriter::printStructReturnPointerFunctionType(raw_ostream &Out,
}
raw_ostream &
-CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
+CWriter::printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned,
const std::string &NameSoFar) {
assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) &&
"Invalid type for printSimpleType");
@@ -444,7 +446,7 @@ CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
" __attribute__((vector_size(64))) " + NameSoFar);
case Type::VectorTyID: {
- const VectorType *VTy = cast<VectorType>(Ty);
+ VectorType *VTy = cast<VectorType>(Ty);
return printSimpleType(Out, VTy->getElementType(), isSigned,
" __attribute__((vector_size(" +
utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar);
@@ -461,7 +463,7 @@ CWriter::printSimpleType(raw_ostream &Out, const Type *Ty, bool isSigned,
// Pass the Type* and the variable name and this prints out the variable
// declaration.
//
-raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
+raw_ostream &CWriter::printType(raw_ostream &Out, Type *Ty,
bool isSigned, const std::string &NameSoFar,
bool IgnoreName, const AttrListPtr &PAL) {
if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) {
@@ -471,14 +473,14 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
switch (Ty->getTypeID()) {
case Type::FunctionTyID: {
- const FunctionType *FTy = cast<FunctionType>(Ty);
+ FunctionType *FTy = cast<FunctionType>(Ty);
std::string tstr;
raw_string_ostream FunctionInnards(tstr);
FunctionInnards << " (" << NameSoFar << ") (";
unsigned Idx = 1;
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I) {
- const Type *ArgTy = *I;
+ Type *ArgTy = *I;
if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
assert(ArgTy->isPointerTy());
ArgTy = cast<PointerType>(ArgTy)->getElementType();
@@ -502,7 +504,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
return Out;
}
case Type::StructTyID: {
- const StructType *STy = cast<StructType>(Ty);
+ StructType *STy = cast<StructType>(Ty);
// Check to see if the type is named.
if (!IgnoreName)
@@ -523,7 +525,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
}
case Type::PointerTyID: {
- const PointerType *PTy = cast<PointerType>(Ty);
+ PointerType *PTy = cast<PointerType>(Ty);
std::string ptrName = "*" + NameSoFar;
if (PTy->getElementType()->isArrayTy() ||
@@ -537,7 +539,7 @@ raw_ostream &CWriter::printType(raw_ostream &Out, const Type *Ty,
}
case Type::ArrayTyID: {
- const ArrayType *ATy = cast<ArrayType>(Ty);
+ ArrayType *ATy = cast<ArrayType>(Ty);
unsigned NumElements = ATy->getNumElements();
if (NumElements == 0) NumElements = 1;
// Arrays are wrapped in structs to allow them to have normal
@@ -560,7 +562,7 @@ void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
// As a special case, print the array as a string if it is an array of
// ubytes or an array of sbytes with positive values.
//
- const Type *ETy = CPA->getType()->getElementType();
+ Type *ETy = CPA->getType()->getElementType();
  bool isString = (ETy == Type::getInt8Ty(CPA->getContext()));
@@ -682,7 +684,7 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) {
/// Print out the casting for a cast operation. This does the double casting
/// necessary for conversion to the destination type, if necessary.
/// @brief Print a cast
-void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) {
+void CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) {
// Print the destination type cast
switch (opc) {
case Instruction::UIToFP:
@@ -917,7 +919,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
}
if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
- const Type* Ty = CI->getType();
+ Type* Ty = CI->getType();
if (Ty == Type::getInt1Ty(CPV->getContext()))
Out << (CI->getZExtValue() ? '1' : '0');
else if (Ty == Type::getInt32Ty(CPV->getContext()))
@@ -1027,7 +1029,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
printConstantArray(CA, Static);
} else {
assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
- const ArrayType *AT = cast<ArrayType>(CPV->getType());
+ ArrayType *AT = cast<ArrayType>(CPV->getType());
Out << '{';
if (AT->getNumElements()) {
Out << ' ';
@@ -1054,7 +1056,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
printConstantVector(CV, Static);
} else {
assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
- const VectorType *VT = cast<VectorType>(CPV->getType());
+ VectorType *VT = cast<VectorType>(CPV->getType());
Out << "{ ";
Constant *CZ = Constant::getNullValue(VT->getElementType());
printConstant(CZ, Static);
@@ -1074,7 +1076,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
Out << ")";
}
if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
- const StructType *ST = cast<StructType>(CPV->getType());
+ StructType *ST = cast<StructType>(CPV->getType());
Out << '{';
if (ST->getNumElements()) {
Out << ' ';
@@ -1123,7 +1125,7 @@ void CWriter::printConstant(Constant *CPV, bool Static) {
// care of detecting that case and printing the cast for the ConstantExpr.
bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) {
bool NeedsExplicitCast = false;
- const Type *Ty = CE->getOperand(0)->getType();
+ Type *Ty = CE->getOperand(0)->getType();
bool TypeIsSigned = false;
switch (CE->getOpcode()) {
case Instruction::Add:
@@ -1175,7 +1177,7 @@ bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) {
void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
// Extract the operand's type, we'll need it.
- const Type* OpTy = CPV->getType();
+ Type* OpTy = CPV->getType();
// Indicate whether to do the cast or not.
bool shouldCast = false;
@@ -1267,7 +1269,7 @@ std::string CWriter::GetValueName(const Value *Operand) {
void CWriter::writeInstComputationInline(Instruction &I) {
// We can't currently support integer types other than 1, 8, 16, 32, 64.
// Validate this.
- const Type *Ty = I.getType();
+ Type *Ty = I.getType();
if (Ty->isIntegerTy() && (Ty!=Type::getInt1Ty(I.getContext()) &&
Ty!=Type::getInt8Ty(I.getContext()) &&
Ty!=Type::getInt16Ty(I.getContext()) &&
@@ -1330,7 +1332,7 @@ void CWriter::writeOperand(Value *Operand, bool Static) {
// This function takes care of detecting that case and printing the cast
// for the Instruction.
bool CWriter::writeInstructionCast(const Instruction &I) {
- const Type *Ty = I.getOperand(0)->getType();
+ Type *Ty = I.getOperand(0)->getType();
switch (I.getOpcode()) {
case Instruction::Add:
case Instruction::Sub:
@@ -1362,7 +1364,7 @@ bool CWriter::writeInstructionCast(const Instruction &I) {
void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
// Extract the operand's type, we'll need it.
- const Type* OpTy = Operand->getType();
+ Type* OpTy = Operand->getType();
// Indicate whether to do the cast or not.
bool shouldCast = false;
@@ -1430,7 +1432,7 @@ void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
bool castIsSigned = Cmp.isSigned();
// If the operand was a pointer, convert to a large integer type.
- const Type* OpTy = Operand->getType();
+ Type* OpTy = Operand->getType();
if (OpTy->isPointerTy())
OpTy = TD->getIntPtrType(Operand->getContext());
@@ -1665,7 +1667,8 @@ bool CWriter::doInitialization(Module &M) {
TAsm = Match->createMCAsmInfo(Triple);
#endif
TAsm = new CBEMCAsmInfo();
- TCtx = new MCContext(*TAsm, NULL);
+ MRI = new MCRegisterInfo();
+ TCtx = new MCContext(*TAsm, *MRI, NULL);
Mang = new Mangler(*TCtx, *TD);
// Keep track of which functions are static ctors/dtors so they can have
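
The extra constructor argument reflects MCContext growing a mandatory MCRegisterInfo parameter in this release (plus an optional MCObjectFileInfo), which is why CWriter now owns and deletes an MRI/MOFI pair. The construction order the hunk implies, in isolation:

    MCAsmInfo *TAsm = new CBEMCAsmInfo();       // defined earlier in this file
    MCRegisterInfo *MRI = new MCRegisterInfo(); // an empty table suffices here
    MCContext *TCtx = new MCContext(*TAsm, *MRI, /*MOFI=*/0);
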
@@ -2049,7 +2052,7 @@ void CWriter::printModuleTypes() {
for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
StructType *ST = StructTypes[i];
- if (ST->isAnonymous() || ST->getName().empty())
+ if (ST->isLiteral() || ST->getName().empty())
UnnamedStructIDs[ST] = NextTypeID++;
std::string Name = getStructName(ST);
@@ -2060,7 +2063,7 @@ void CWriter::printModuleTypes() {
Out << '\n';
// Keep track of which structures have been printed so far.
- SmallPtrSet<const Type *, 16> StructPrinted;
+ SmallPtrSet<Type *, 16> StructPrinted;
// Loop over all structures then push them into the stack so they are
// printed in the correct order.
@@ -2077,8 +2080,8 @@ void CWriter::printModuleTypes() {
//
// TODO: Make this work properly with vector types
//
-void CWriter::printContainedStructs(const Type *Ty,
- SmallPtrSet<const Type *, 16> &StructPrinted) {
+void CWriter::printContainedStructs(Type *Ty,
+ SmallPtrSet<Type *, 16> &StructPrinted) {
// Don't walk through pointers.
if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy())
return;
@@ -2088,7 +2091,7 @@ void CWriter::printContainedStructs(const Type *Ty,
E = Ty->subtype_end(); I != E; ++I)
printContainedStructs(*I, StructPrinted);
- if (const StructType *ST = dyn_cast<StructType>(Ty)) {
+ if (StructType *ST = dyn_cast<StructType>(Ty)) {
// Check to see if we have already printed this struct.
if (!StructPrinted.insert(Ty)) return;
@@ -2120,7 +2123,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
}
// Loop over the arguments, printing them...
- const FunctionType *FT = cast<FunctionType>(F->getFunctionType());
+ FunctionType *FT = cast<FunctionType>(F->getFunctionType());
const AttrListPtr &PAL = F->getAttributes();
std::string tstr;
@@ -2150,7 +2153,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
ArgName = GetValueName(I);
else
ArgName = "";
- const Type *ArgTy = I->getType();
+ Type *ArgTy = I->getType();
if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
ArgTy = cast<PointerType>(ArgTy)->getElementType();
ByValParams.insert(I);
@@ -2177,7 +2180,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
for (; I != E; ++I) {
if (PrintedArg) FunctionInnards << ", ";
- const Type *ArgTy = *I;
+ Type *ArgTy = *I;
if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
assert(ArgTy->isPointerTy());
ArgTy = cast<PointerType>(ArgTy)->getElementType();
@@ -2205,7 +2208,7 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
FunctionInnards << ')';
  // Get the return type for the function.
- const Type *RetTy;
+ Type *RetTy;
if (!isStructReturn)
RetTy = F->getReturnType();
else {
@@ -2222,8 +2225,8 @@ void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
static inline bool isFPIntBitCast(const Instruction &I) {
if (!isa<BitCastInst>(I))
return false;
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DstTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DstTy = I.getType();
return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
(DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
}
@@ -2237,7 +2240,7 @@ void CWriter::printFunction(Function &F) {
// If this is a struct return function, handle the result with magic.
if (isStructReturn) {
- const Type *StructTy =
+ Type *StructTy =
cast<PointerType>(F.arg_begin()->getType())->getElementType();
Out << " ";
printType(Out, StructTy, false, "StructReturn");
@@ -2380,22 +2383,29 @@ void CWriter::visitReturnInst(ReturnInst &I) {
void CWriter::visitSwitchInst(SwitchInst &SI) {
+ Value* Cond = SI.getCondition();
+
Out << " switch (";
- writeOperand(SI.getOperand(0));
+ writeOperand(Cond);
Out << ") {\n default:\n";
printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
Out << ";\n";
- for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2) {
+
+ unsigned NumCases = SI.getNumCases();
+ // Skip the first item since that's the default case.
+ for (unsigned i = 1; i < NumCases; ++i) {
+ ConstantInt* CaseVal = SI.getCaseValue(i);
+ BasicBlock* Succ = SI.getSuccessor(i);
Out << " case ";
- writeOperand(SI.getOperand(i));
+ writeOperand(CaseVal);
Out << ":\n";
- BasicBlock *Succ = cast<BasicBlock>(SI.getOperand(i+1));
printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
printBranchToBlock(SI.getParent(), Succ, 2);
if (Function::iterator(Succ) == llvm::next(Function::iterator(SI.getParent())))
Out << " break;\n";
}
+
Out << " }\n";
}
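
The switch-lowering rewrite tracks a SwitchInst accessor change: cases are no longer addressed as raw operand pairs but through getNumCases(), getCaseValue(), and getSuccessor(), with slot 0 reserved for the default destination. The iteration idiom, extracted:

    // Slot 0 is the default, so explicit cases start at index 1.
    for (unsigned i = 1, e = SI.getNumCases(); i != e; ++i) {
      ConstantInt *CaseVal = SI.getCaseValue(i);
      BasicBlock *Succ = SI.getSuccessor(i);
      // ... emit "case <CaseVal>: goto <Succ>;" as visitSwitchInst does.
    }
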
@@ -2656,7 +2666,7 @@ void CWriter::visitFCmpInst(FCmpInst &I) {
Out << ")";
}
-static const char * getFloatBitCastField(const Type *Ty) {
+static const char * getFloatBitCastField(Type *Ty) {
switch (Ty->getTypeID()) {
default: llvm_unreachable("Invalid Type");
case Type::FloatTyID: return "Float";
@@ -2672,8 +2682,8 @@ static const char * getFloatBitCastField(const Type *Ty) {
}
void CWriter::visitCastInst(CastInst &I) {
- const Type *DstTy = I.getType();
- const Type *SrcTy = I.getOperand(0)->getType();
+ Type *DstTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
if (isFPIntBitCast(I)) {
Out << '(';
// These int<->float and long<->double casts need to be handled specially
@@ -2719,7 +2729,7 @@ void CWriter::visitSelectInst(SelectInst &I) {
// Returns the macro name or value of the max or min of an integer type
// (as defined in limits.h).
-static void printLimitValue(const IntegerType &Ty, bool isSigned, bool isMax,
+static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax,
raw_ostream &Out) {
const char* type;
const char* sprefix = "";
@@ -2745,16 +2755,16 @@ static void printLimitValue(const IntegerType &Ty, bool isSigned, bool isMax,
}
#ifndef NDEBUG
-static bool isSupportedIntegerSize(const IntegerType &T) {
+static bool isSupportedIntegerSize(IntegerType &T) {
return T.getBitWidth() == 8 || T.getBitWidth() == 16 ||
T.getBitWidth() == 32 || T.getBitWidth() == 64;
}
#endif
void CWriter::printIntrinsicDefinition(const Function &F, raw_ostream &Out) {
- const FunctionType *funT = F.getFunctionType();
- const Type *retT = F.getReturnType();
- const IntegerType *elemT = cast<IntegerType>(funT->getParamType(1));
+ FunctionType *funT = F.getFunctionType();
+ Type *retT = F.getReturnType();
+ IntegerType *elemT = cast<IntegerType>(funT->getParamType(1));
assert(isSupportedIntegerSize(*elemT) &&
"CBackend does not support arbitrary size integers.");
@@ -2829,7 +2839,6 @@ void CWriter::lowerIntrinsics(Function &F) {
if (Function *F = CI->getCalledFunction())
switch (F->getIntrinsicID()) {
case Intrinsic::not_intrinsic:
- case Intrinsic::memory_barrier:
case Intrinsic::vastart:
case Intrinsic::vacopy:
case Intrinsic::vaend:
@@ -2908,8 +2917,8 @@ void CWriter::visitCallInst(CallInst &I) {
Value *Callee = I.getCalledValue();
- const PointerType *PTy = cast<PointerType>(Callee->getType());
- const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
// If this is a call to a struct-return function, assign to the first
// parameter instead of passing it to the call.
@@ -3020,9 +3029,6 @@ bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
WroteCallee = true;
return false;
}
- case Intrinsic::memory_barrier:
- Out << "__sync_synchronize()";
- return true;
case Intrinsic::vastart:
Out << "0; ";
@@ -3217,7 +3223,7 @@ void CWriter::visitInlineAsm(CallInst &CI) {
std::vector<std::pair<Value*, int> > ResultVals;
if (CI.getType() == Type::getVoidTy(CI.getContext()))
;
- else if (const StructType *ST = dyn_cast<StructType>(CI.getType())) {
+ else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
ResultVals.push_back(std::make_pair(&CI, (int)i));
} else {
@@ -3348,7 +3354,7 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
// Find out if the last index is into a vector. If so, we have to print this
// specially. Since vectors can't have elements of indexable type, only the
// last index could possibly be of a vector element.
- const VectorType *LastIndexIsVector = 0;
+ VectorType *LastIndexIsVector = 0;
{
for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
LastIndexIsVector = dyn_cast<VectorType>(*TmpI);
@@ -3421,7 +3427,7 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
Out << ")";
}
-void CWriter::writeMemoryAccess(Value *Operand, const Type *OperandType,
+void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType,
bool IsVolatile, unsigned Alignment) {
bool IsUnaligned = Alignment &&
@@ -3463,7 +3469,7 @@ void CWriter::visitStoreInst(StoreInst &I) {
Out << " = ";
Value *Operand = I.getOperand(0);
Constant *BitMask = 0;
- if (const IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
+ if (IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
if (!ITy->isPowerOf2ByteWidth())
// We have a bit width that doesn't match an even power-of-2 byte
// size. Consequently we must & the value with the type's bit mask
@@ -3492,7 +3498,7 @@ void CWriter::visitVAArgInst(VAArgInst &I) {
}
void CWriter::visitInsertElementInst(InsertElementInst &I) {
- const Type *EltTy = I.getType()->getElementType();
+ Type *EltTy = I.getType()->getElementType();
writeOperand(I.getOperand(0));
Out << ";\n ";
Out << "((";
@@ -3507,7 +3513,7 @@ void CWriter::visitInsertElementInst(InsertElementInst &I) {
void CWriter::visitExtractElementInst(ExtractElementInst &I) {
// We know that our operand is not inlined.
Out << "((";
- const Type *EltTy =
+ Type *EltTy =
cast<VectorType>(I.getOperand(0)->getType())->getElementType();
printType(Out, PointerType::getUnqual(EltTy));
Out << ")(&" << GetValueName(I.getOperand(0)) << "))[";
@@ -3519,9 +3525,9 @@ void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Out << "(";
printType(Out, SVI.getType());
Out << "){ ";
- const VectorType *VT = SVI.getType();
+ VectorType *VT = SVI.getType();
unsigned NumElts = VT->getNumElements();
- const Type *EltTy = VT->getElementType();
+ Type *EltTy = VT->getElementType();
for (unsigned i = 0; i != NumElts; ++i) {
if (i) Out << ", ";
@@ -3557,9 +3563,9 @@ void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
Out << GetValueName(&IVI);
for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end();
i != e; ++i) {
- const Type *IndexedTy =
+ Type *IndexedTy =
ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(),
- ArrayRef<unsigned>(b, i+1));
+ makeArrayRef(b, i+1));
if (IndexedTy->isArrayTy())
Out << ".array[" << *i << "]";
else
@@ -3579,9 +3585,9 @@ void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
Out << GetValueName(EVI.getOperand(0));
for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
i != e; ++i) {
- const Type *IndexedTy =
+ Type *IndexedTy =
ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(),
- ArrayRef<unsigned>(b, i+1));
+ makeArrayRef(b, i+1));
if (IndexedTy->isArrayTy())
Out << ".array[" << *i << "]";
else
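
makeArrayRef, used in the two hunks above, is a small helper from llvm/ADT/ArrayRef.h that deduces the element type from its arguments, replacing the explicit ArrayRef<unsigned>(b, i+1) spelling. For example:

    const unsigned Idx[] = { 0, 2, 1 };
    ArrayRef<unsigned> Prefix = makeArrayRef(Idx, Idx + 2); // first two entries
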
diff --git a/lib/Target/CBackend/CMakeLists.txt b/lib/Target/CBackend/CMakeLists.txt
index a23ff85..96ae49f 100644
--- a/lib/Target/CBackend/CMakeLists.txt
+++ b/lib/Target/CBackend/CMakeLists.txt
@@ -2,4 +2,16 @@ add_llvm_target(CBackend
CBackend.cpp
)
+add_llvm_library_dependencies(LLVMCBackend
+ LLVMAnalysis
+ LLVMCBackendInfo
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMScalarOpts
+ LLVMSupport
+ LLVMTarget
+ LLVMTransformUtils
+ )
+
add_subdirectory(TargetInfo)
diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h
index e64216b..4f1ca97 100644
--- a/lib/Target/CBackend/CTargetMachine.h
+++ b/lib/Target/CBackend/CTargetMachine.h
@@ -20,8 +20,9 @@
namespace llvm {
struct CTargetMachine : public TargetMachine {
- CTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS)
+ CTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
: TargetMachine(T, TT, CPU, FS) {}
virtual bool addPassesToEmitFile(PassManagerBase &PM,
diff --git a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
index f7e8ff2..e8274ff 100644
--- a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
+++ b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "CTargetMachine.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheCBackendTarget;
@@ -17,3 +17,5 @@ Target llvm::TheCBackendTarget;
extern "C" void LLVMInitializeCBackendTargetInfo() {
RegisterTarget<> X(TheCBackendTarget, "c", "C backend");
}
+
+extern "C" void LLVMInitializeCBackendTargetMC() {}
diff --git a/lib/Target/CBackend/TargetInfo/CMakeLists.txt b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
index 5b35fa7..8e616be 100644
--- a/lib/Target/CBackend/TargetInfo/CMakeLists.txt
+++ b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
@@ -4,3 +4,8 @@ add_llvm_library(LLVMCBackendInfo
CBackendTargetInfo.cpp
)
+add_llvm_library_dependencies(LLVMCBackendInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index f982316..030f808 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -1,8 +1,6 @@
add_llvm_library(LLVMTarget
Mangler.cpp
Target.cpp
- TargetAsmInfo.cpp
- TargetAsmLexer.cpp
TargetData.cpp
TargetELFWriterInfo.cpp
TargetFrameLowering.cpp
@@ -15,6 +13,12 @@ add_llvm_library(LLVMTarget
TargetSubtargetInfo.cpp
)
+add_llvm_library_dependencies(LLVMTarget
+ LLVMCore
+ LLVMMC
+ LLVMSupport
+ )
+
set(LLVM_ENUM_ASM_PRINTERS "")
set(LLVM_ENUM_ASM_PARSERS "")
set(LLVM_ENUM_DISASSEMBLERS "")
diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt
index 0b94e0c..158fb3e 100644
--- a/lib/Target/CellSPU/CMakeLists.txt
+++ b/lib/Target/CellSPU/CMakeLists.txt
@@ -1,12 +1,13 @@
set(LLVM_TARGET_DEFINITIONS SPU.td)
-tablegen(SPUGenAsmWriter.inc -gen-asm-writer)
-tablegen(SPUGenCodeEmitter.inc -gen-emitter)
-tablegen(SPUGenRegisterInfo.inc -gen-register-info)
-tablegen(SPUGenInstrInfo.inc -gen-instr-info)
-tablegen(SPUGenDAGISel.inc -gen-dag-isel)
-tablegen(SPUGenSubtargetInfo.inc -gen-subtarget)
-tablegen(SPUGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(SPUGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(SPUGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(SPUGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(SPUGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(SPUGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(SPUGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(SPUGenCallingConv.inc -gen-callingconv)
+add_public_tablegen_target(CellSPUCommonTableGen)
add_llvm_target(CellSPUCodeGen
SPUAsmPrinter.cpp
@@ -22,5 +23,17 @@ add_llvm_target(CellSPUCodeGen
SPUNopFiller.cpp
)
+add_llvm_library_dependencies(LLVMCellSPUCodeGen
+ LLVMAsmPrinter
+ LLVMCellSPUDesc
+ LLVMCellSPUInfo
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt b/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt
index 85fb258..d41fe93 100644
--- a/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/CellSPU/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,10 @@ add_llvm_library(LLVMCellSPUDesc
SPUMCTargetDesc.cpp
SPUMCAsmInfo.cpp
)
+
+add_llvm_library_dependencies(LLVMCellSPUDesc
+ LLVMCellSPUInfo
+ LLVMMC
+ )
+
+add_dependencies(LLVMCellSPUDesc CellSPUCommonTableGen)
diff --git a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp
index 26c5a4b..d5af2a8 100644
--- a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp
+++ b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp
@@ -13,10 +13,12 @@
#include "SPUMCTargetDesc.h"
#include "SPUMCAsmInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "SPUGenInstrInfo.inc"
@@ -35,8 +37,10 @@ static MCInstrInfo *createSPUMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeCellSPUMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheCellSPUTarget, createSPUMCInstrInfo);
+static MCRegisterInfo *createCellSPUMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitSPUMCRegisterInfo(X, SPU::R0);
+ return X;
}
static MCSubtargetInfo *createSPUMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -46,11 +50,43 @@ static MCSubtargetInfo *createSPUMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializeCellSPUMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheCellSPUTarget,
- createSPUMCSubtargetInfo);
+static MCAsmInfo *createSPUMCAsmInfo(const Target &T, StringRef TT) {
+ MCAsmInfo *MAI = new SPULinuxMCAsmInfo(T, TT);
+
+ // Initial state of the frame pointer is R1.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(SPU::R1, 0);
+ MAI->addInitialFrameState(0, Dst, Src);
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createSPUMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ // For the time being, use static relocations, since there's really no
+ // support for PIC yet.
+ X->InitMCCodeGenInfo(Reloc::Static, CM);
+ return X;
}
-extern "C" void LLVMInitializeCellSPUMCAsmInfo() {
- RegisterMCAsmInfo<SPULinuxMCAsmInfo> X(TheCellSPUTarget);
+// Force static initialization.
+extern "C" void LLVMInitializeCellSPUTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(TheCellSPUTarget, createSPUMCAsmInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheCellSPUTarget,
+ createSPUMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheCellSPUTarget, createSPUMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheCellSPUTarget,
+ createCellSPUMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheCellSPUTarget,
+ createSPUMCSubtargetInfo);
}
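
Besides the registration consolidation, this hunk moves CellSPU's initial frame state from a FrameLowering hook (deleted below in SPUFrameLowering.cpp) onto the MCAsmInfo, seeded by the factory registered through RegisterMCAsmInfoFn. The essential motion, isolated from the factory above:

    // The frame pointer starts out as R1; record that on the MCAsmInfo
    // instead of answering a getInitialFrameState() query later.
    MachineLocation Dst(MachineLocation::VirtualFP);
    MachineLocation Src(SPU::R1, 0);
    MAI->addInitialFrameState(0, Dst, Src); // label 0: function entry
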
diff --git a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h
index c5c037d..a3717b0 100644
--- a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h
+++ b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.h
@@ -1,4 +1,4 @@
-//===-- SPUMCTargetDesc.h - Alpha Target Descriptions ---------*- C++ -*-===//
+//===-- SPUMCTargetDesc.h - CellSPU Target Descriptions ---------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file provides Alpha specific target descriptions.
+// This file provides CellSPU specific target descriptions.
//
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp
index fd96694..90b5270 100644
--- a/lib/Target/CellSPU/SPUAsmPrinter.cpp
+++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp
@@ -29,10 +29,10 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/CellSPU/SPUFrameLowering.cpp b/lib/Target/CellSPU/SPUFrameLowering.cpp
index a3e7e73..093f99f 100644
--- a/lib/Target/CellSPU/SPUFrameLowering.cpp
+++ b/lib/Target/CellSPU/SPUFrameLowering.cpp
@@ -181,18 +181,6 @@ void SPUFrameLowering::emitPrologue(MachineFunction &MF) const {
MachineLocation FPSrc(MachineLocation::VirtualFP);
Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc));
}
- } else {
- // This is a leaf function -- insert a branch hint iff there is a
- // sufficient number of instructions in the basic block. Note that
- // this is just a best guess based on the basic block's size.
- if (MBB.size() >= (unsigned) SPUFrameLowering::branchHintPenalty()) {
- MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
- dl = MBBI->getDebugLoc();
-
- // Insert terminator label
- BuildMI(MBB, MBBI, dl, TII.get(SPU::PROLOG_LABEL))
- .addSym(MMI.getContext().CreateTempSymbol());
- }
}
}
@@ -249,14 +237,6 @@ void SPUFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-void SPUFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves)
- const {
- // Initial state of the frame pointer is R1.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(SPU::R1, 0);
- Moves.push_back(MachineMove(0, Dst, Src));
-}
-
void SPUFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS) const{
// Mark LR and SP unused, since the prolog spills them to stack and
diff --git a/lib/Target/CellSPU/SPUFrameLowering.h b/lib/Target/CellSPU/SPUFrameLowering.h
index 4fee72d..b837f2c 100644
--- a/lib/Target/CellSPU/SPUFrameLowering.h
+++ b/lib/Target/CellSPU/SPUFrameLowering.h
@@ -43,9 +43,6 @@ namespace llvm {
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
- //! Perform target-specific stack frame setup.
- void getInitialFrameState(std::vector<MachineMove> &Moves) const;
-
//! Return a function's saved spill slots
/*!
For CellSPU, a function's saved spill slots is just the link register.
@@ -77,17 +74,6 @@ namespace llvm {
static int FItoStackOffset(int frame_index) {
return frame_index * stackSlotSize();
}
- //! Number of instructions required to overcome hint-for-branch latency
- /*!
- HBR (hint-for-branch) instructions can be inserted when, for example,
- we know that a given function is going to be called, such as printf(),
- in the control flow graph. HBRs are only inserted if a sufficient number
- of instructions occurs between the HBR and the target. Currently, HBRs
- take 6 cycles, ergo, the magic number 6.
- */
- static int branchHintPenalty() {
- return 6;
- }
};
}
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index f0ceee2..ac33111 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -69,7 +69,7 @@ namespace {
TargetLowering::ArgListEntry Entry;
for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
EVT ArgVT = Op.getOperand(i).getValueType();
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Node = Op.getOperand(i);
Entry.Ty = ArgTy;
Entry.isSExt = isSigned;
@@ -80,7 +80,7 @@ namespace {
TLI.getPointerTy());
// Splice the libcall in wherever FindInputOutputChains tells us to.
- const Type *RetTy =
+ Type *RetTy =
Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
std::pair<SDValue, SDValue> CallInfo =
TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
@@ -174,6 +174,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
// SPU has no intrinsics for these particular operations:
setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
// SPU has no division/remainder instructions
setOperationAction(ISD::SREM, MVT::i8, Expand);
@@ -401,6 +402,9 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
+ // Set operation actions to legal types only.
+ if (!isTypeLegal(VT)) continue;
+
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
@@ -438,6 +442,7 @@ SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setBooleanContents(ZeroOrNegativeOneBooleanContent);
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // FIXME: Is this correct?
setStackPointerRegisterToSaveRestore(SPU::R1);
@@ -497,7 +502,7 @@ SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
// Return the Cell SPU's SETCC result type
//===----------------------------------------------------------------------===//
-MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
+EVT SPUTargetLowering::getSetCCResultType(EVT VT) const {
// i8, i16 and i32 are valid SETCC result types
MVT::SimpleValueType retval;
@@ -2727,6 +2732,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
// the type to extend from needs to be i64 or i32.
assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
"LowerSIGN_EXTEND: input and/or output operand have wrong size");
+ (void)OpVT;
// Create shuffle mask
unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
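
The added (void)OpVT is the standard fix for a variable whose only use sits inside an assert: with NDEBUG defined the assert compiles away and the compiler would warn about the unused variable. A self-contained, generic illustration:

    #include <cassert>

    void check(unsigned size) {
      const unsigned limit = 128;
      assert(size <= limit && "size out of range");
      (void)limit; // keeps -Wunused-variable quiet in NDEBUG builds
    }
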
@@ -3216,7 +3222,7 @@ SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode.
bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
- const Type *Ty) const {
+ Type *Ty) const {
// SPU's addresses are 256K:
return (V > -(1 << 18) && V < (1 << 18) - 1);
}
@@ -3239,7 +3245,7 @@ bool SPUTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
bool
SPUTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type * ) const{
+ Type * ) const{
// A-form: 18bit absolute address.
if (AM.BaseGV && !AM.HasBaseReg && AM.Scale == 0 && AM.BaseOffs == 0)
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
index d23f6cc..aa4a168 100644
--- a/lib/Target/CellSPU/SPUISelLowering.h
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -107,7 +107,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - Return the ValueType for ISD::SETCC
- virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(EVT VT) const;
virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
@@ -147,7 +147,7 @@ namespace llvm {
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode.
- virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+ virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const;
virtual bool isLegalAddressImmediate(GlobalValue *) const;
virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
@@ -179,7 +179,7 @@ namespace llvm {
virtual bool isLegalICmpImmediate(int64_t Imm) const;
virtual bool isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const;
+ Type *Ty) const;
};
}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
index e67b10c..007bc0e 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.cpp
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -17,9 +17,9 @@
#include "SPUHazardRecognizers.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCContext.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#define GET_INSTRINFO_CTOR
@@ -290,6 +290,8 @@ static void removeHBR( MachineBasicBlock &MBB) {
if (I->getOpcode() == SPU::HBRA ||
I->getOpcode() == SPU::HBR_LABEL){
I=MBB.erase(I);
+ if (I == MBB.end())
+ break;
}
}
}
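
The two added lines stop removeHBR from advancing an end() iterator: MachineBasicBlock::erase returns the iterator after the erased instruction, and the enclosing for loop increments it again. A self-contained sketch of the fully idiomatic form of this pattern (a generic illustration on std::list, not the LLVM code):

    #include <list>

    void removeEven(std::list<int> &L) {
      for (std::list<int>::iterator I = L.begin(); I != L.end(); /* in body */) {
        if (*I % 2 == 0)
          I = L.erase(I); // erase() already positions I on the next element
        else
          ++I;            // advance only when nothing was erased
      }
    }
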
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
index e103c9b..f76ebd7 100644
--- a/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -1594,8 +1594,8 @@ multiclass BitwiseOrImm
{
def v4i32: ORIVecInst<v4i32, v4i32Uns10Imm>;
- def r32: ORIInst<(outs R32C:$rT), (ins R32C:$rA, u10imm_i32:$val),
- [(set R32C:$rT, (or R32C:$rA, i32ImmUns10:$val))]>;
+ def r32: ORIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+ [(set R32C:$rT, (or R32C:$rA, i32ImmSExt10:$val))]>;
// i16i32: hacked version of the ori instruction to extend 16-bit quantities
// to 32-bit quantities. used exclusively to match "anyext" conversions (vide
@@ -3467,8 +3467,10 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
[/* no pattern */]>;
// Indirect branch
- def BI:
- BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>;
+ let isIndirectBranch = 1 in {
+ def BI:
+ BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>;
+ }
}
// Conditional branches:
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
index 19896c0..bbac6fd 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.cpp
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -187,7 +186,7 @@ unsigned SPURegisterInfo::getRegisterNumbering(unsigned RegEnum) {
SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget,
const TargetInstrInfo &tii) :
- SPUGenRegisterInfo(), Subtarget(subtarget), TII(tii)
+ SPUGenRegisterInfo(SPU::R0), Subtarget(subtarget), TII(tii)
{
}
@@ -311,28 +310,12 @@ SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
}
unsigned
-SPURegisterInfo::getRARegister() const
-{
- return SPU::R0;
-}
-
-unsigned
SPURegisterInfo::getFrameRegister(const MachineFunction &MF) const
{
return SPU::R1;
}
int
-SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- // FIXME: Most probably dwarf numbers differs for Linux and Darwin
- return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int SPURegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
- return SPUGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
-}
-
-int
SPURegisterInfo::convertDFormToXForm(int dFormOpcode) const
{
switch(dFormOpcode)
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
index 5e014f8..b7818a4 100644
--- a/lib/Target/CellSPU/SPURegisterInfo.h
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -74,8 +74,6 @@ namespace llvm {
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
RegScavenger *RS = NULL) const;
- //! Get return address register (LR, aka R0)
- unsigned getRARegister() const;
//! Get the stack frame register (SP, aka R1)
unsigned getFrameRegister(const MachineFunction &MF) const;
@@ -83,10 +81,6 @@ namespace llvm {
// New methods added:
//------------------------------------------------------------------------
- //! Get DWARF debugging register number
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
//! Convert D-form load/store to X-form load/store
/*!
Converts a register displacement load/store into a register-indexed
diff --git a/lib/Target/CellSPU/SPUSubtarget.cpp b/lib/Target/CellSPU/SPUSubtarget.cpp
index 856dc82..43335ab 100644
--- a/lib/Target/CellSPU/SPUSubtarget.cpp
+++ b/lib/Target/CellSPU/SPUSubtarget.cpp
@@ -14,7 +14,7 @@
#include "SPUSubtarget.h"
#include "SPU.h"
#include "SPURegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/SmallVector.h"
#define GET_SUBTARGETINFO_TARGET_DESC
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
index 3542a2b..93a7f6e 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.cpp
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -16,7 +16,8 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -31,9 +32,10 @@ SPUFrameLowering::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
return &LR[0];
}
-SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,const std::string &FS)
- : LLVMTargetMachine(T, TT, CPU, FS),
+SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS),
DataLayout(Subtarget.getTargetDataString()),
InstrInfo(*this),
@@ -41,9 +43,6 @@ SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT,
TLInfo(*this),
TSInfo(*this),
InstrItins(Subtarget.getInstrItineraryData()) {
- // For the time being, use static relocations, since there's really no
- // support for PIC yet.
- setRelocationModel(Reloc::Static);
}
//===----------------------------------------------------------------------===//
@@ -59,8 +58,16 @@ bool SPUTargetMachine::addInstSelector(PassManagerBase &PM,
// passes to run just before printing the assembly
bool SPUTargetMachine::
-addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
-{
+addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
+  // Load the TCE instruction scheduler, if one is available via
+  // loaded plugins.
+ typedef llvm::FunctionPass* (*BuilderFunc)(const char*);
+ BuilderFunc schedulerCreator =
+ (BuilderFunc)(intptr_t)sys::DynamicLibrary::SearchForAddressOfSymbol(
+ "createTCESchedulerPass");
+ if (schedulerCreator != NULL)
+ PM.add(schedulerCreator("cellspu"));
+
  // Align instructions with nops/lnops for dual issue
PM.add(createSPUNopFillerPass(*this));
return true;
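
The hunk above resolves an optional scheduler pass from an already-loaded
plugin at run time. A minimal standalone sketch of that lookup, assuming only
that some plugin exports a C symbol named createTCESchedulerPass with this
builder signature (lookupPluginPass itself is an illustrative helper, not part
of the patch):

#include "llvm/Pass.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/DynamicLibrary.h"

// Hedged sketch: look up an optional pass factory exported by a plugin that
// was loaded earlier (e.g. with -load). Returns NULL when no loaded plugin
// provides the symbol, so the caller can simply skip PM.add() in that case.
static llvm::FunctionPass *lookupPluginPass(const char *Symbol,
                                            const char *TargetName) {
  typedef llvm::FunctionPass *(*BuilderFunc)(const char *);
  BuilderFunc Create = (BuilderFunc)(intptr_t)
      llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(Symbol);
  return Create ? Create(TargetName) : 0;
}

addPreEmitPass then amounts to: if lookupPluginPass("createTCESchedulerPass",
"cellspu") returns a pass, add it ahead of the NOP filler.
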
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
index d96f86d..fffe77c 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.h
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -38,8 +38,9 @@ class SPUTargetMachine : public LLVMTargetMachine {
SPUSelectionDAGInfo TSInfo;
InstrItineraryData InstrItins;
public:
- SPUTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ SPUTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
/// Return the subtarget implementation object
virtual const SPUSubtarget *getSubtargetImpl() const {
diff --git a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
index 928d0fe..3f2d6b0 100644
--- a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
+++ b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMCellSPUInfo
CellSPUTargetInfo.cpp
)
-add_dependencies(LLVMCellSPUInfo CellSPUCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMCellSPUInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMCellSPUInfo CellSPUCommonTableGen)
diff --git a/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
index 049ea23..84aadfa 100644
--- a/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
+++ b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "SPU.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheCellSPUTarget;
diff --git a/lib/Target/CppBackend/CMakeLists.txt b/lib/Target/CppBackend/CMakeLists.txt
index e937559..95b6058 100644
--- a/lib/Target/CppBackend/CMakeLists.txt
+++ b/lib/Target/CppBackend/CMakeLists.txt
@@ -2,4 +2,11 @@ add_llvm_target(CppBackend
CPPBackend.cpp
)
+add_llvm_library_dependencies(LLVMCppBackend
+ LLVMCore
+ LLVMCppBackendInfo
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(TargetInfo)
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 10d18f6..ae0e3c4 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -29,7 +29,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/config.h"
#include <algorithm>
@@ -77,22 +77,12 @@ extern "C" void LLVMInitializeCppBackendTarget() {
RegisterTargetMachine<CPPTargetMachine> X(TheCppBackendTarget);
}
-extern "C" void LLVMInitializeCppBackendMCAsmInfo() {}
-
-extern "C" void LLVMInitializeCppBackendMCInstrInfo() {
- RegisterMCInstrInfo<MCInstrInfo> X(TheCppBackendTarget);
-}
-
-extern "C" void LLVMInitializeCppBackendMCSubtargetInfo() {
- RegisterMCSubtargetInfo<MCSubtargetInfo> X(TheCppBackendTarget);
-}
-
namespace {
- typedef std::vector<const Type*> TypeList;
- typedef std::map<const Type*,std::string> TypeMap;
+ typedef std::vector<Type*> TypeList;
+ typedef std::map<Type*,std::string> TypeMap;
typedef std::map<const Value*,std::string> ValueMap;
typedef std::set<std::string> NameSet;
- typedef std::set<const Type*> TypeSet;
+ typedef std::set<Type*> TypeSet;
typedef std::set<const Value*> ValueSet;
typedef std::map<const Value*,std::string> ForwardRefMap;
@@ -143,14 +133,14 @@ namespace {
void printEscapedString(const std::string& str);
void printCFP(const ConstantFP* CFP);
- std::string getCppName(const Type* val);
- inline void printCppName(const Type* val);
+ std::string getCppName(Type* val);
+ inline void printCppName(Type* val);
std::string getCppName(const Value* val);
inline void printCppName(const Value* val);
void printAttributes(const AttrListPtr &PAL, const std::string &name);
- void printType(const Type* Ty);
+ void printType(Type* Ty);
void printTypes(const Module* M);
void printConstant(const Constant *CPV);
@@ -164,7 +154,7 @@ namespace {
void printFunctionHead(const Function *F);
void printFunctionBody(const Function *F);
void printInstruction(const Instruction *I, const std::string& bbname);
- std::string getOpName(Value*);
+ std::string getOpName(const Value*);
void printModuleBody();
};
@@ -184,7 +174,7 @@ static inline void sanitize(std::string &str) {
str[i] = '_';
}
-static std::string getTypePrefix(const Type *Ty) {
+static std::string getTypePrefix(Type *Ty) {
switch (Ty->getTypeID()) {
case Type::VoidTyID: return "void_";
case Type::IntegerTyID:
@@ -339,7 +329,7 @@ void CppWriter::printEscapedString(const std::string &Str) {
}
}
-std::string CppWriter::getCppName(const Type* Ty) {
+std::string CppWriter::getCppName(Type* Ty) {
  // First, handle the primitive types ... easy
if (Ty->isPrimitiveType() || Ty->isIntegerTy()) {
switch (Ty->getTypeID()) {
@@ -379,7 +369,7 @@ std::string CppWriter::getCppName(const Type* Ty) {
// See if the type has a name in the symboltable and build accordingly
std::string name;
- if (const StructType *STy = dyn_cast<StructType>(Ty))
+ if (StructType *STy = dyn_cast<StructType>(Ty))
if (STy->hasName())
name = STy->getName();
@@ -393,7 +383,7 @@ std::string CppWriter::getCppName(const Type* Ty) {
return TypeNames[Ty] = name;
}
-void CppWriter::printCppName(const Type* Ty) {
+void CppWriter::printCppName(Type* Ty) {
printEscapedString(getCppName(Ty));
}
@@ -480,6 +470,9 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
HANDLE_ATTR(NoImplicitFloat);
HANDLE_ATTR(Naked);
HANDLE_ATTR(InlineHint);
+ HANDLE_ATTR(ReturnsTwice);
+ HANDLE_ATTR(UWTable);
+ HANDLE_ATTR(NonLazyBind);
#undef HANDLE_ATTR
if (attrs & Attribute::StackAlignment)
Out << " | Attribute::constructStackAlignmentFromInt("
@@ -499,7 +492,7 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
}
}
-void CppWriter::printType(const Type* Ty) {
+void CppWriter::printType(Type* Ty) {
// We don't print definitions for primitive types
if (Ty->isPrimitiveType() || Ty->isIntegerTy())
return;
@@ -514,13 +507,13 @@ void CppWriter::printType(const Type* Ty) {
// Print the type definition
switch (Ty->getTypeID()) {
case Type::FunctionTyID: {
- const FunctionType* FT = cast<FunctionType>(Ty);
+ FunctionType* FT = cast<FunctionType>(Ty);
Out << "std::vector<Type*>" << typeName << "_args;";
nl(Out);
FunctionType::param_iterator PI = FT->param_begin();
FunctionType::param_iterator PE = FT->param_end();
for (; PI != PE; ++PI) {
- const Type* argTy = static_cast<const Type*>(*PI);
+ Type* argTy = static_cast<Type*>(*PI);
printType(argTy);
std::string argName(getCppName(argTy));
Out << typeName << "_args.push_back(" << argName;
@@ -539,13 +532,21 @@ void CppWriter::printType(const Type* Ty) {
break;
}
case Type::StructTyID: {
- const StructType* ST = cast<StructType>(Ty);
- if (!ST->isAnonymous()) {
- Out << "StructType *" << typeName << " = ";
- Out << "StructType::createNamed(mod->getContext(), \"";
+ StructType* ST = cast<StructType>(Ty);
+ if (!ST->isLiteral()) {
+ Out << "StructType *" << typeName << " = mod->getTypeByName(\"";
+ printEscapedString(ST->getName());
+ Out << "\");";
+ nl(Out);
+ Out << "if (!" << typeName << ") {";
+ nl(Out);
+ Out << typeName << " = ";
+ Out << "StructType::create(mod->getContext(), \"";
printEscapedString(ST->getName());
Out << "\");";
nl(Out);
+ Out << "}";
+ nl(Out);
// Indicate that this type is now defined.
DefinedTypes.insert(Ty);
}
@@ -555,7 +556,7 @@ void CppWriter::printType(const Type* Ty) {
StructType::element_iterator EI = ST->element_begin();
StructType::element_iterator EE = ST->element_end();
for (; EI != EE; ++EI) {
- const Type* fieldTy = static_cast<const Type*>(*EI);
+ Type* fieldTy = static_cast<Type*>(*EI);
printType(fieldTy);
std::string fieldName(getCppName(fieldTy));
Out << typeName << "_fields.push_back(" << fieldName;
@@ -563,21 +564,27 @@ void CppWriter::printType(const Type* Ty) {
nl(Out);
}
- if (ST->isAnonymous()) {
+ if (ST->isLiteral()) {
Out << "StructType *" << typeName << " = ";
Out << "StructType::get(" << "mod->getContext(), ";
} else {
+ Out << "if (" << typeName << "->isOpaque()) {";
+ nl(Out);
Out << typeName << "->setBody(";
}
Out << typeName << "_fields, /*isPacked=*/"
<< (ST->isPacked() ? "true" : "false") << ");";
nl(Out);
+ if (!ST->isLiteral()) {
+ Out << "}";
+ nl(Out);
+ }
break;
}
case Type::ArrayTyID: {
- const ArrayType* AT = cast<ArrayType>(Ty);
- const Type* ET = AT->getElementType();
+ ArrayType* AT = cast<ArrayType>(Ty);
+ Type* ET = AT->getElementType();
printType(ET);
if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
std::string elemName(getCppName(ET));
@@ -589,8 +596,8 @@ void CppWriter::printType(const Type* Ty) {
break;
}
case Type::PointerTyID: {
- const PointerType* PT = cast<PointerType>(Ty);
- const Type* ET = PT->getElementType();
+ PointerType* PT = cast<PointerType>(Ty);
+ Type* ET = PT->getElementType();
printType(ET);
if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
std::string elemName(getCppName(ET));
@@ -602,8 +609,8 @@ void CppWriter::printType(const Type* Ty) {
break;
}
case Type::VectorTyID: {
- const VectorType* PT = cast<VectorType>(Ty);
- const Type* ET = PT->getElementType();
+ VectorType* PT = cast<VectorType>(Ty);
+ Type* ET = PT->getElementType();
printType(ET);
if (DefinedTypes.find(Ty) == DefinedTypes.end()) {
std::string elemName(getCppName(ET));
@@ -766,9 +773,7 @@ void CppWriter::printConstant(const Constant *CV) {
Out << "Constant* " << constName
<< " = ConstantExpr::getGetElementPtr("
<< getCppName(CE->getOperand(0)) << ", "
- << "&" << constName << "_indices[0], "
- << constName << "_indices.size()"
- << ");";
+ << constName << "_indices);";
} else if (CE->isCast()) {
printConstant(CE->getOperand(0));
Out << "Constant* " << constName << " = ConstantExpr::getCast(";
@@ -988,7 +993,7 @@ void CppWriter::printVariableBody(const GlobalVariable *GV) {
}
}
-std::string CppWriter::getOpName(Value* V) {
+std::string CppWriter::getOpName(const Value* V) {
if (!isa<Instruction>(V) || DefinedValues.find(V) != DefinedValues.end())
return getCppName(V);
@@ -1053,14 +1058,17 @@ void CppWriter::printInstruction(const Instruction *I,
case Instruction::Switch: {
const SwitchInst *SI = cast<SwitchInst>(I);
Out << "SwitchInst* " << iName << " = SwitchInst::Create("
- << opNames[0] << ", "
- << opNames[1] << ", "
+ << getOpName(SI->getCondition()) << ", "
+ << getOpName(SI->getDefaultDest()) << ", "
<< SI->getNumCases() << ", " << bbname << ");";
nl(Out);
- for (unsigned i = 2; i != SI->getNumOperands(); i += 2) {
+ unsigned NumCases = SI->getNumCases();
+ for (unsigned i = 1; i < NumCases; ++i) {
+ const ConstantInt* CaseVal = SI->getCaseValue(i);
+ const BasicBlock* BB = SI->getSuccessor(i);
Out << iName << "->addCase("
- << opNames[i] << ", "
- << opNames[i+1] << ");";
+ << getOpName(CaseVal) << ", "
+ << getOpName(BB) << ");";
nl(Out);
}
break;
@@ -1076,6 +1084,11 @@ void CppWriter::printInstruction(const Instruction *I,
}
break;
}
+ case Instruction::Resume: {
+ Out << "ResumeInst::Create(mod->getContext(), " << opNames[0]
+ << ", " << bbname << ");";
+ break;
+ }
case Instruction::Invoke: {
const InvokeInst* inv = cast<InvokeInst>(I);
Out << "std::vector<Value*> " << iName << "_params;";
@@ -1090,8 +1103,7 @@ void CppWriter::printInstruction(const Instruction *I,
<< getOpName(inv->getCalledFunction()) << ", "
<< getOpName(inv->getNormalDest()) << ", "
<< getOpName(inv->getUnwindDest()) << ", "
- << iName << "_params.begin(), "
- << iName << "_params.end(), \"";
+ << iName << "_params, \"";
printEscapedString(inv->getName());
Out << "\", " << bbname << ");";
nl(Out) << iName << "->setCallingConv(";
@@ -1252,8 +1264,7 @@ void CppWriter::printInstruction(const Instruction *I,
nl(Out);
}
Out << "Instruction* " << iName << " = GetElementPtrInst::Create("
- << opNames[0] << ", " << iName << "_indices.begin(), "
- << iName << "_indices.end()";
+ << opNames[0] << ", " << iName << "_indices";
}
Out << ", \"";
printEscapedString(gep->getName());
@@ -1304,7 +1315,7 @@ void CppWriter::printInstruction(const Instruction *I,
case Instruction::PtrToInt: Out << "PtrToIntInst"; break;
case Instruction::IntToPtr: Out << "IntToPtrInst"; break;
case Instruction::BitCast: Out << "BitCastInst"; break;
- default: assert(!"Unreachable"); break;
+ default: assert(0 && "Unreachable"); break;
}
Out << "(" << opNames[0] << ", "
<< getCppName(cst->getType()) << ", \"";
@@ -1331,8 +1342,7 @@ void CppWriter::printInstruction(const Instruction *I,
}
Out << "CallInst* " << iName << " = CallInst::Create("
<< opNames[call->getNumArgOperands()] << ", "
- << iName << "_params.begin(), "
- << iName << "_params.end(), \"";
+ << iName << "_params, \"";
} else if (call->getNumArgOperands() == 1) {
Out << "CallInst* " << iName << " = CallInst::Create("
<< opNames[call->getNumArgOperands()] << ", " << opNames[0] << ", \"";
@@ -1415,7 +1425,7 @@ void CppWriter::printInstruction(const Instruction *I,
Out << "ExtractValueInst* " << getCppName(evi)
<< " = ExtractValueInst::Create(" << opNames[0]
<< ", "
- << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+ << iName << "_indices, \"";
printEscapedString(evi->getName());
Out << "\", " << bbname << ");";
break;
@@ -1432,7 +1442,7 @@ void CppWriter::printInstruction(const Instruction *I,
Out << "InsertValueInst* " << getCppName(ivi)
<< " = InsertValueInst::Create(" << opNames[0]
<< ", " << opNames[1] << ", "
- << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+ << iName << "_indices, \"";
printEscapedString(ivi->getName());
Out << "\", " << bbname << ");";
break;
@@ -1542,13 +1552,12 @@ void CppWriter::printFunctionUses(const Function* F) {
void CppWriter::printFunctionHead(const Function* F) {
nl(Out) << "Function* " << getCppName(F);
- if (is_inline) {
- Out << " = mod->getFunction(\"";
- printEscapedString(F->getName());
- Out << "\", " << getCppName(F->getFunctionType()) << ");";
- nl(Out) << "if (!" << getCppName(F) << ") {";
- nl(Out) << getCppName(F);
- }
+ Out << " = mod->getFunction(\"";
+ printEscapedString(F->getName());
+ Out << "\");";
+ nl(Out) << "if (!" << getCppName(F) << ") {";
+ nl(Out) << getCppName(F);
+
Out<< " = Function::Create(";
nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
nl(Out) << "/*Linkage=*/";
@@ -1585,10 +1594,8 @@ void CppWriter::printFunctionHead(const Function* F) {
Out << "->setGC(\"" << F->getGC() << "\");";
nl(Out);
}
- if (is_inline) {
- Out << "}";
- nl(Out);
- }
+ Out << "}";
+ nl(Out);
printAttributes(F->getAttributes(), getCppName(F));
printCppName(F);
Out << "->setAttributes(" << getCppName(F) << "_PAL);";
@@ -1873,7 +1880,7 @@ void CppWriter::printVariable(const std::string& fname,
void CppWriter::printType(const std::string &fname,
const std::string &typeName) {
- const Type* Ty = TheModule->getTypeByName(typeName);
+ Type* Ty = TheModule->getTypeByName(typeName);
if (!Ty) {
error(std::string("Type '") + typeName + "' not found in input module");
return;
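
The churn in printType tracks the LLVM 3.0 type-system rewrite: Type is no
longer handled as const, named structs are fetched with Module::getTypeByName,
created opaque with StructType::create, and given a body exactly once via
setBody while still opaque. A condensed sketch of the get-or-create idiom the
backend now emits (the struct name pair_t and the i32 fields are illustrative):

#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
#include <vector>
using namespace llvm;

// Sketch mirroring the code CppWriter::printType prints for non-literal
// (named) structs under the 3.0 type system.
static StructType *getOrCreatePair(Module *mod) {
  StructType *STy = mod->getTypeByName("pair_t");
  if (!STy)
    STy = StructType::create(mod->getContext(), "pair_t");
  if (STy->isOpaque()) {
    std::vector<Type*> fields(2, Type::getInt32Ty(mod->getContext()));
    STy->setBody(fields, /*isPacked=*/false);
  }
  return STy;
}
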
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index 7322e3e..287e537 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -22,8 +22,9 @@ namespace llvm {
class formatted_raw_ostream;
struct CPPTargetMachine : public TargetMachine {
- CPPTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS)
+ CPPTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
: TargetMachine(T, TT, CPU, FS) {}
virtual bool addPassesToEmitFile(PassManagerBase &PM,
diff --git a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
index edaf5d3..7165d8f 100644
--- a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
+++ b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
@@ -4,3 +4,7 @@ add_llvm_library(LLVMCppBackendInfo
CppBackendTargetInfo.cpp
)
+add_llvm_library_dependencies(LLVMCppBackendInfo
+ LLVMMC
+ LLVMTarget
+ )
diff --git a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
index d0aeb12..a8ac0a2 100644
--- a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
+++ b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "CPPTargetMachine.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheCppBackendTarget;
@@ -24,3 +24,5 @@ extern "C" void LLVMInitializeCppBackendTargetInfo() {
"C++ backend",
&CppBackend_TripleMatchQuality);
}
+
+extern "C" void LLVMInitializeCppBackendTargetMC() {}
diff --git a/lib/Target/MBlaze/AsmParser/CMakeLists.txt b/lib/Target/MBlaze/AsmParser/CMakeLists.txt
index 87e7cb5..ec8f52a 100644
--- a/lib/Target/MBlaze/AsmParser/CMakeLists.txt
+++ b/lib/Target/MBlaze/AsmParser/CMakeLists.txt
@@ -6,3 +6,11 @@ add_llvm_library(LLVMMBlazeAsmParser
MBlazeAsmParser.cpp
)
+add_llvm_library_dependencies(LLVMMBlazeAsmParser
+ LLVMMBlazeInfo
+ LLVMMC
+ LLVMMCParser
+ LLVMSupport
+ )
+
+add_dependencies(LLVMMBlazeAsmParser MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp
index 1596596..2d357bb 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmLexer.cpp
@@ -7,8 +7,7 @@
//
//===----------------------------------------------------------------------===//
-#include "MBlaze.h"
-#include "MBlazeTargetMachine.h"
+#include "MCTargetDesc/MBlazeBaseInfo.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallVector.h"
@@ -17,10 +16,10 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
-#include "llvm/Target/TargetAsmLexer.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#include <string>
#include <map>
@@ -29,7 +28,7 @@ using namespace llvm;
namespace {
- class MBlazeBaseAsmLexer : public TargetAsmLexer {
+ class MBlazeBaseAsmLexer : public MCTargetAsmLexer {
const MCAsmInfo &AsmInfo;
const AsmToken &lexDefinite() {
@@ -42,7 +41,7 @@ namespace {
rmap_ty RegisterMap;
- void InitRegisterMap(const TargetRegisterInfo *info) {
+ void InitRegisterMap(const MCRegisterInfo *info) {
unsigned numRegs = info->getNumRegs();
for (unsigned i = 0; i < numRegs; ++i) {
@@ -76,20 +75,16 @@ namespace {
}
public:
MBlazeBaseAsmLexer(const Target &T, const MCAsmInfo &MAI)
- : TargetAsmLexer(T), AsmInfo(MAI) {
+ : MCTargetAsmLexer(T), AsmInfo(MAI) {
}
};
class MBlazeAsmLexer : public MBlazeBaseAsmLexer {
public:
- MBlazeAsmLexer(const Target &T, const MCAsmInfo &MAI)
+ MBlazeAsmLexer(const Target &T, const MCRegisterInfo &MRI,
+ const MCAsmInfo &MAI)
: MBlazeBaseAsmLexer(T, MAI) {
- std::string tripleString("mblaze-unknown-unknown");
- std::string featureString;
- std::string CPU;
- OwningPtr<const TargetMachine>
- targetMachine(T.createTargetMachine(tripleString, CPU, featureString));
- InitRegisterMap(targetMachine->getRegisterInfo());
+ InitRegisterMap(&MRI);
}
};
}
@@ -123,6 +118,6 @@ AsmToken MBlazeBaseAsmLexer::LexTokenUAL() {
}
extern "C" void LLVMInitializeMBlazeAsmLexer() {
- RegisterAsmLexer<MBlazeAsmLexer> X(TheMBlazeTarget);
+ RegisterMCAsmLexer<MBlazeAsmLexer> X(TheMBlazeTarget);
}
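
The constructor change is the point of this hunk: the lexer now takes an
MCRegisterInfo directly rather than building a whole TargetMachine just to
reach the register names. A hedged sketch of the map setup this enables (the
std::map type here is an assumption; the real rmap_ty may differ):

#include "llvm/MC/MCRegisterInfo.h"
#include <map>
#include <string>

// Sketch: build a register-name -> register-number map straight from the MC
// register tables; no TargetMachine or TargetRegisterInfo is needed anymore.
static void initRegisterMap(const llvm::MCRegisterInfo &MRI,
                            std::map<std::string, unsigned> &RegisterMap) {
  for (unsigned i = 0, e = MRI.getNumRegs(); i != e; ++i) {
    const char *Name = MRI.getName(i);
    if (Name && Name[0])
      RegisterMap[Name] = i;
  }
}
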
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
index eebd9d8..97d311c 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -7,19 +7,16 @@
//
//===----------------------------------------------------------------------===//
-#include "MBlaze.h"
-#include "MBlazeSubtarget.h"
-#include "MBlazeRegisterInfo.h"
-#include "MBlazeISelLowering.h"
+#include "MCTargetDesc/MBlazeBaseInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallVector.h"
@@ -30,7 +27,7 @@ using namespace llvm;
namespace {
struct MBlazeOperand;
-class MBlazeAsmParser : public TargetAsmParser {
+class MBlazeAsmParser : public MCTargetAsmParser {
MCAsmParser &Parser;
MCAsmParser &getParser() const { return Parser; }
@@ -64,7 +61,7 @@ class MBlazeAsmParser : public TargetAsmParser {
public:
MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
- : TargetAsmParser(), Parser(_Parser) {}
+ : MCTargetAsmParser(), Parser(_Parser) {}
virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
@@ -286,19 +283,19 @@ void MBlazeOperand::print(raw_ostream &OS) const {
break;
case Register:
OS << "<register R";
- OS << MBlazeRegisterInfo::getRegisterNumbering(getReg()) << ">";
+ OS << getMBlazeRegisterNumbering(getReg()) << ">";
break;
case Token:
OS << "'" << getToken() << "'";
break;
case Memory: {
OS << "<memory R";
- OS << MBlazeRegisterInfo::getRegisterNumbering(getMemBase());
+ OS << getMBlazeRegisterNumbering(getMemBase());
OS << ", ";
unsigned RegOff = getMemOffReg();
if (RegOff)
- OS << "R" << MBlazeRegisterInfo::getRegisterNumbering(RegOff);
+ OS << "R" << getMBlazeRegisterNumbering(RegOff);
else
OS << getMemOff();
OS << ">";
@@ -326,6 +323,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
unsigned ErrorInfo;
switch (MatchInstructionImpl(Operands, Inst, ErrorInfo)) {
+ default: break;
case Match_Success:
Out.EmitInstruction(Inst);
return false;
@@ -521,7 +519,7 @@ ParseInstruction(StringRef Name, SMLoc NameLoc,
return false;
}
-/// ParseDirective parses the arm specific directives
+/// ParseDirective parses the MBlaze specific directives
bool MBlazeAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
if (IDVal == ".word")
@@ -558,7 +556,7 @@ extern "C" void LLVMInitializeMBlazeAsmLexer();
/// Force static initialization.
extern "C" void LLVMInitializeMBlazeAsmParser() {
- RegisterAsmParser<MBlazeAsmParser> X(TheMBlazeTarget);
+ RegisterMCAsmParser<MBlazeAsmParser> X(TheMBlazeTarget);
LLVMInitializeMBlazeAsmLexer();
}
diff --git a/lib/Target/MBlaze/AsmParser/Makefile b/lib/Target/MBlaze/AsmParser/Makefile
index 611a0f4..1e68766 100644
--- a/lib/Target/MBlaze/AsmParser/Makefile
+++ b/lib/Target/MBlaze/AsmParser/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
+##===- lib/Target/MBlaze/AsmParser/Makefile ----------------*- Makefile -*-===##
#
# The LLVM Compiler Infrastructure
#
diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt
index 0bc5b78..47b0db2 100644
--- a/lib/Target/MBlaze/CMakeLists.txt
+++ b/lib/Target/MBlaze/CMakeLists.txt
@@ -1,15 +1,16 @@
set(LLVM_TARGET_DEFINITIONS MBlaze.td)
-tablegen(MBlazeGenRegisterInfo.inc -gen-register-info)
-tablegen(MBlazeGenInstrInfo.inc -gen-instr-info)
-tablegen(MBlazeGenCodeEmitter.inc -gen-emitter)
-tablegen(MBlazeGenAsmWriter.inc -gen-asm-writer)
-tablegen(MBlazeGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(MBlazeGenDAGISel.inc -gen-dag-isel)
-tablegen(MBlazeGenCallingConv.inc -gen-callingconv)
-tablegen(MBlazeGenSubtargetInfo.inc -gen-subtarget)
-tablegen(MBlazeGenIntrinsics.inc -gen-tgt-intrinsic)
-tablegen(MBlazeGenEDInfo.inc -gen-enhanced-disassembly-info)
+llvm_tablegen(MBlazeGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(MBlazeGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(MBlazeGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(MBlazeGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(MBlazeGenAsmMatcher.inc -gen-asm-matcher)
+llvm_tablegen(MBlazeGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(MBlazeGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(MBlazeGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(MBlazeGenIntrinsics.inc -gen-tgt-intrinsic)
+llvm_tablegen(MBlazeGenEDInfo.inc -gen-enhanced-disassembly-info)
+add_public_tablegen_target(MBlazeCommonTableGen)
add_llvm_target(MBlazeCodeGen
MBlazeDelaySlotFiller.cpp
@@ -24,10 +25,21 @@ add_llvm_target(MBlazeCodeGen
MBlazeIntrinsicInfo.cpp
MBlazeSelectionDAGInfo.cpp
MBlazeAsmPrinter.cpp
- MBlazeAsmBackend.cpp
MBlazeMCInstLower.cpp
MBlazeELFWriterInfo.cpp
- MBlazeMCCodeEmitter.cpp
+ )
+
+add_llvm_library_dependencies(LLVMMBlazeCodeGen
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMBlazeAsmPrinter
+ LLVMMBlazeDesc
+ LLVMMBlazeInfo
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
)
add_subdirectory(AsmParser)
diff --git a/lib/Target/MBlaze/Disassembler/CMakeLists.txt b/lib/Target/MBlaze/Disassembler/CMakeLists.txt
index 9376e68..112c64c 100644
--- a/lib/Target/MBlaze/Disassembler/CMakeLists.txt
+++ b/lib/Target/MBlaze/Disassembler/CMakeLists.txt
@@ -13,4 +13,12 @@ set_property(
)
endif()
-add_dependencies(LLVMMBlazeDisassembler MBlazeCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMBlazeDisassembler
+ LLVMMBlazeCodeGen
+ LLVMMBlazeDesc
+ LLVMMBlazeInfo
+ LLVMMC
+ LLVMSupport
+ )
+
+add_dependencies(LLVMMBlazeDisassembler MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
index 88d80a1..fd761f1 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
@@ -20,9 +20,9 @@
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
// #include "MBlazeGenDecoderTables.inc"
@@ -60,27 +60,27 @@ static unsigned mblazeBinary2Opcode[] = {
};
static unsigned getRD(uint32_t insn) {
- if (!MBlazeRegisterInfo::isRegister((insn>>21)&0x1F))
+ if (!isMBlazeRegister((insn>>21)&0x1F))
return UNSUPPORTED;
- return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>21)&0x1F);
+ return getMBlazeRegisterFromNumbering((insn>>21)&0x1F);
}
static unsigned getRA(uint32_t insn) {
- if (!MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F))
+ if (!getMBlazeRegisterFromNumbering((insn>>16)&0x1F))
return UNSUPPORTED;
- return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>16)&0x1F);
+ return getMBlazeRegisterFromNumbering((insn>>16)&0x1F);
}
static unsigned getRB(uint32_t insn) {
- if (!MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F))
+ if (!getMBlazeRegisterFromNumbering((insn>>11)&0x1F))
return UNSUPPORTED;
- return MBlazeRegisterInfo::getRegisterFromNumbering((insn>>11)&0x1F);
+ return getMBlazeRegisterFromNumbering((insn>>11)&0x1F);
}
static int64_t getRS(uint32_t insn) {
- if (!MBlazeRegisterInfo::isSpecialRegister(insn&0x3FFF))
+ if (!isSpecialMBlazeRegister(insn&0x3FFF))
return UNSUPPORTED;
- return MBlazeRegisterInfo::getSpecialRegisterFromNumbering(insn&0x3FFF);
+ return getSpecialMBlazeRegisterFromNumbering(insn&0x3FFF);
}
static int64_t getIMM(uint32_t insn) {
@@ -493,11 +493,12 @@ EDInstInfo *MBlazeDisassembler::getEDInfo() const {
// Public interface for the disassembler
//
-bool MBlazeDisassembler::getInstruction(MCInst &instr,
+MCDisassembler::DecodeStatus MBlazeDisassembler::getInstruction(MCInst &instr,
uint64_t &size,
const MemoryObject &region,
uint64_t address,
- raw_ostream &vStream) const {
+ raw_ostream &vStream,
+ raw_ostream &cStream) const {
// The machine instruction.
uint32_t insn;
uint64_t read;
@@ -508,7 +509,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
// We want to read exactly 4 bytes of data.
if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
- return false;
+ return Fail;
// Encoded as a big-endian 32-bit word in the stream.
insn = (bytes[0]<<24) | (bytes[1]<<16) | (bytes[2]<< 8) | (bytes[3]<<0);
@@ -517,7 +518,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
// that it is a valid instruction.
unsigned opcode = getOPCODE(insn);
if (opcode == UNSUPPORTED)
- return false;
+ return Fail;
instr.setOpcode(opcode);
@@ -529,11 +530,11 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
uint64_t tsFlags = MBlazeInsts[opcode].TSFlags;
switch ((tsFlags & MBlazeII::FormMask)) {
default:
- return false;
+ return Fail;
case MBlazeII::FRRRR:
if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateReg(RB));
instr.addOperand(MCOperand::CreateReg(RA));
@@ -541,7 +542,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
case MBlazeII::FRRR:
if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateReg(RA));
instr.addOperand(MCOperand::CreateReg(RB));
@@ -550,23 +551,23 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
case MBlazeII::FRI:
switch (opcode) {
default:
- return false;
+ return Fail;
case MBlaze::MFS:
if (RD == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
break;
case MBlaze::MTS:
if (RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
instr.addOperand(MCOperand::CreateReg(RA));
break;
case MBlaze::MSRSET:
case MBlaze::MSRCLR:
if (RD == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateImm(insn&0x7FFF));
break;
@@ -575,7 +576,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
case MBlazeII::FRRI:
if (RD == UNSUPPORTED || RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateReg(RA));
switch (opcode) {
@@ -592,35 +593,35 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
case MBlazeII::FCRR:
if (RA == UNSUPPORTED || RB == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RA));
instr.addOperand(MCOperand::CreateReg(RB));
break;
case MBlazeII::FCRI:
if (RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RA));
instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
break;
case MBlazeII::FRCR:
if (RD == UNSUPPORTED || RB == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateReg(RB));
break;
case MBlazeII::FRCI:
if (RD == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
break;
case MBlazeII::FCCR:
if (RB == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RB));
break;
@@ -630,7 +631,7 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
case MBlazeII::FRRCI:
if (RD == UNSUPPORTED || RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateReg(RA));
instr.addOperand(MCOperand::CreateImm(getSHT(insn)));
@@ -638,35 +639,35 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
case MBlazeII::FRRC:
if (RD == UNSUPPORTED || RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateReg(RA));
break;
case MBlazeII::FRCX:
if (RD == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
break;
case MBlazeII::FRCS:
if (RD == UNSUPPORTED || RS == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateReg(RS));
break;
case MBlazeII::FCRCS:
if (RS == UNSUPPORTED || RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RS));
instr.addOperand(MCOperand::CreateReg(RA));
break;
case MBlazeII::FCRCX:
if (RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RA));
instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
break;
@@ -677,13 +678,13 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
case MBlazeII::FCR:
if (RB == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RB));
break;
case MBlazeII::FRIR:
if (RD == UNSUPPORTED || RA == UNSUPPORTED)
- return false;
+ return Fail;
instr.addOperand(MCOperand::CreateReg(RD));
instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
instr.addOperand(MCOperand::CreateReg(RA));
@@ -693,11 +694,12 @@ bool MBlazeDisassembler::getInstruction(MCInst &instr,
// We always consume 4 bytes of data on success
size = 4;
- return true;
+ return Success;
}
-static MCDisassembler *createMBlazeDisassembler(const Target &T) {
- return new MBlazeDisassembler;
+static MCDisassembler *createMBlazeDisassembler(const Target &T,
+ const MCSubtargetInfo &STI) {
+ return new MBlazeDisassembler(STI);
}
extern "C" void LLVMInitializeMBlazeDisassembler() {
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
index d05eced..0ac0d89 100644
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
+++ b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
@@ -32,19 +32,20 @@ class MBlazeDisassembler : public MCDisassembler {
public:
/// Constructor - Initializes the disassembler.
///
- MBlazeDisassembler() :
- MCDisassembler() {
+ MBlazeDisassembler(const MCSubtargetInfo &STI) :
+ MCDisassembler(STI) {
}
~MBlazeDisassembler() {
}
/// getInstruction - See MCDisassembler.
- bool getInstruction(MCInst &instr,
+ MCDisassembler::DecodeStatus getInstruction(MCInst &instr,
uint64_t &size,
const MemoryObject &region,
uint64_t address,
- raw_ostream &vStream) const;
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
/// getEDInfo - See MCDisassembler.
EDInstInfo *getEDInfo() const;
diff --git a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
index 242a573..aff0b3d 100644
--- a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
+++ b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
@@ -2,7 +2,12 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/..
${CMAKE_CURRENT_SOURCE_DIR}/.. )
add_llvm_library(LLVMMBlazeAsmPrinter
- MBlazeInstPrinter.cpp
+ MBlazeInstPrinter.cpp
)
-add_dependencies(LLVMMBlazeAsmPrinter MBlazeCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMBlazeAsmPrinter
+ LLVMMC
+ LLVMSupport
+ )
+
+add_dependencies(LLVMMBlazeAsmPrinter MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
index a7fd287..a1f1dbc 100644
--- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
+++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
@@ -25,8 +25,10 @@ using namespace llvm;
// Include the auto-generated portion of the assembly writer.
#include "MBlazeGenAsmWriter.inc"
-void MBlazeInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void MBlazeInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
printInstruction(MI, O);
+ printAnnotation(O, Annot);
}
void MBlazeInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
index eacca41..570ab08 100644
--- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
+++ b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
@@ -24,7 +24,7 @@ namespace llvm {
MBlazeInstPrinter(const MCAsmInfo &MAI)
: MCInstPrinter(MAI) {}
- virtual void printInst(const MCInst *MI, raw_ostream &O);
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/MBlaze/MBlaze.h b/lib/Target/MBlaze/MBlaze.h
index 3390794..1399b85 100644
--- a/lib/Target/MBlaze/MBlaze.h
+++ b/lib/Target/MBlaze/MBlaze.h
@@ -15,6 +15,7 @@
#ifndef TARGET_MBLAZE_H
#define TARGET_MBLAZE_H
+#include "MCTargetDesc/MBlazeBaseInfo.h"
#include "MCTargetDesc/MBlazeMCTargetDesc.h"
#include "llvm/Target/TargetMachine.h"
@@ -22,17 +23,6 @@ namespace llvm {
class MBlazeTargetMachine;
class FunctionPass;
class MachineCodeEmitter;
- class MCCodeEmitter;
- class MCInstrInfo;
- class MCSubtargetInfo;
- class TargetAsmBackend;
- class formatted_raw_ostream;
-
- MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
-
- TargetAsmBackend *createMBlazeAsmBackend(const Target &, const std::string &);
FunctionPass *createMBlazeISelDag(MBlazeTargetMachine &TM);
FunctionPass *createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &TM);
diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
index 0016df5..97bd083 100644
--- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
+++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
@@ -38,10 +38,10 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
@@ -136,19 +136,17 @@ void MBlazeAsmPrinter::printSavedRegsBitmask() {
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- unsigned RegNum = MBlazeRegisterInfo::getRegisterNumbering(Reg);
+ unsigned RegNum = getMBlazeRegisterNumbering(Reg);
if (MBlaze::GPRRegisterClass->contains(Reg))
CPUBitmask |= (1 << RegNum);
}
// Return Address and Frame registers must also be set in CPUBitmask.
if (TFI->hasFP(*MF))
- CPUBitmask |= (1 << MBlazeRegisterInfo::
- getRegisterNumbering(RI.getFrameRegister(*MF)));
+ CPUBitmask |= (1 << getMBlazeRegisterNumbering(RI.getFrameRegister(*MF)));
if (MFI->adjustsStack())
- CPUBitmask |= (1 << MBlazeRegisterInfo::
- getRegisterNumbering(RI.getRARegister()));
+ CPUBitmask |= (1 << getMBlazeRegisterNumbering(RI.getRARegister()));
// Print CPUBitmask
OutStreamer.EmitRawText("\t.mask\t0x" + Twine::utohexstr(CPUBitmask));
@@ -318,18 +316,7 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
return I == Pred->end() || !I->getDesc().isBarrier();
}
-static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T,
- unsigned SyntaxVariant,
- const MCAsmInfo &MAI) {
- if (SyntaxVariant == 0)
- return new MBlazeInstPrinter(MAI);
- return 0;
-}
-
// Force static initialization.
extern "C" void LLVMInitializeMBlazeAsmPrinter() {
RegisterAsmPrinter<MBlazeAsmPrinter> X(TheMBlazeTarget);
- TargetRegistry::RegisterMCInstPrinter(TheMBlazeTarget,
- createMBlazeMCInstPrinter);
-
}
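
The instruction-printer factory deleted here does not go away; with the MC
split it is registered from the target's MC layer instead (for MBlaze,
presumably MBlazeMCTargetDesc.cpp). A sketch of the shape, reusing the removed
factory verbatim:

#include "InstPrinter/MBlazeInstPrinter.h" // assumed include path
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;

// Same factory as the one removed above, now owned by the MC layer.
static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T,
                                                unsigned SyntaxVariant,
                                                const MCAsmInfo &MAI) {
  if (SyntaxVariant == 0)
    return new MBlazeInstPrinter(MAI);
  return 0;
}

// Registered once from the MC initializer rather than the AsmPrinter:
//   TargetRegistry::RegisterMCInstPrinter(TheMBlazeTarget,
//                                         createMBlazeMCInstPrinter);
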
diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/lib/Target/MBlaze/MBlazeFrameLowering.cpp
index e763902..f28d5a7 100644
--- a/lib/Target/MBlaze/MBlazeFrameLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeFrameLowering.cpp
@@ -1,4 +1,4 @@
-//=======- MBlazeFrameLowering.cpp - MBlaze Frame Information ------*- C++ -*-====//
+//===- MBlazeFrameLowering.cpp - MBlaze Frame Information ------*- C++ -*-====//
//
// The LLVM Compiler Infrastructure
//
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
index 62dfdcc..8ec548f 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeISelLowering.cpp
@@ -59,6 +59,7 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
// MBlaze does not have i1 type, so use i32 for
// setcc operations results (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
// Set up the register classes
addRegisterClass(MVT::i32, MBlaze::GPRRegisterClass);
@@ -187,7 +188,7 @@ MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
computeRegisterProperties();
}
-MVT::SimpleValueType MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
+EVT MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i32;
}
@@ -964,13 +965,13 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
// The last register argument that must be saved is MBlaze::R10
TargetRegisterClass *RC = MBlaze::GPRRegisterClass;
- unsigned Begin = MBlazeRegisterInfo::getRegisterNumbering(MBlaze::R5);
- unsigned Start = MBlazeRegisterInfo::getRegisterNumbering(ArgRegEnd+1);
- unsigned End = MBlazeRegisterInfo::getRegisterNumbering(MBlaze::R10);
+ unsigned Begin = getMBlazeRegisterNumbering(MBlaze::R5);
+ unsigned Start = getMBlazeRegisterNumbering(ArgRegEnd+1);
+ unsigned End = getMBlazeRegisterNumbering(MBlaze::R10);
unsigned StackLoc = Start - Begin + 1;
for (; Start <= End; ++Start, ++StackLoc) {
- unsigned Reg = MBlazeRegisterInfo::getRegisterFromNumbering(Start);
+ unsigned Reg = getMBlazeRegisterFromNumbering(Start);
unsigned LiveReg = MF.addLiveIn(Reg, RC);
SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, LiveReg, MVT::i32);
@@ -1096,7 +1097,7 @@ MBlazeTargetLowering::getSingleConstraintMatchWeight(
// but allow it at the lowest weight.
if (CallOperandVal == NULL)
return CW_Default;
- const Type *type = CallOperandVal->getType();
+ Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h
index bb128da..8b49bc3 100644
--- a/lib/Target/MBlaze/MBlazeISelLowering.h
+++ b/lib/Target/MBlaze/MBlazeISelLowering.h
@@ -102,7 +102,7 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - get the ISD::SETCC result ValueType
- MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ EVT getSetCCResultType(EVT VT) const;
private:
// Subtarget Info
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
index 188f10a..7ae05b3 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
@@ -17,9 +17,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/STLExtras.h"
#define GET_INSTRINFO_CTOR
@@ -239,7 +239,8 @@ unsigned MBlazeInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
return 2;
}
-bool MBlazeInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+bool MBlazeInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand>
+ &Cond) const {
assert(Cond.size() == 2 && "Invalid MBlaze branch opcode!");
switch (Cond[0].getImm()) {
default: return true;
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h
index 79f962b..7174405 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.h
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.h
@@ -166,62 +166,6 @@ namespace MBlaze {
}
}
-/// MBlazeII - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace MBlazeII {
- enum {
- // PseudoFrm - This represents an instruction that is a pseudo instruction
- // or one that has not been implemented yet. It is illegal to code generate
- // it, but tolerated for intermediate implementation stages.
- FPseudo = 0,
- FRRR,
- FRRI,
- FCRR,
- FCRI,
- FRCR,
- FRCI,
- FCCR,
- FCCI,
- FRRCI,
- FRRC,
- FRCX,
- FRCS,
- FCRCS,
- FCRCX,
- FCX,
- FCR,
- FRIR,
- FRRRR,
- FRI,
- FC,
- FormMask = 63
-
- //===------------------------------------------------------------------===//
- // MBlaze Specific MachineOperand flags.
- // MO_NO_FLAG,
-
- /// MO_GOT - Represents the offset into the global offset table at which
- /// the address the relocation entry symbol resides during execution.
- // MO_GOT,
-
- /// MO_GOT_CALL - Represents the offset into the global offset table at
- /// which the address of a call site relocation entry symbol resides
- /// during execution. This is different from the above since this flag
- /// can only be present in call instructions.
- // MO_GOT_CALL,
-
- /// MO_GPREL - Represents the offset from the current gp value to be used
- /// for the relocatable object file being produced.
- // MO_GPREL,
-
- /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
- /// address.
- // MO_ABS_HILO
-
- };
-}
-
class MBlazeInstrInfo : public MBlazeGenInstrInfo {
MBlazeTargetMachine &TM;
const MBlazeRegisterInfo RI;
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td
index 950f2d7..1d8c987 100644
--- a/lib/Target/MBlaze/MBlazeInstrInfo.td
+++ b/lib/Target/MBlaze/MBlazeInstrInfo.td
@@ -442,17 +442,19 @@ let Predicates=[HasMul] in {
//===----------------------------------------------------------------------===//
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- def LBU : LoadM<0x30, 0x000, "lbu ">;
- def LBUR : LoadM<0x30, 0x200, "lbur ">;
+ let neverHasSideEffects = 1 in {
+ def LBU : LoadM<0x30, 0x000, "lbu ">;
+ def LBUR : LoadM<0x30, 0x200, "lbur ">;
- def LHU : LoadM<0x31, 0x000, "lhu ">;
- def LHUR : LoadM<0x31, 0x200, "lhur ">;
+ def LHU : LoadM<0x31, 0x000, "lhu ">;
+ def LHUR : LoadM<0x31, 0x200, "lhur ">;
- def LW : LoadM<0x32, 0x000, "lw ">;
- def LWR : LoadM<0x32, 0x200, "lwr ">;
+ def LW : LoadM<0x32, 0x000, "lw ">;
+ def LWR : LoadM<0x32, 0x200, "lwr ">;
- let Defs = [CARRY] in {
- def LWX : LoadM<0x32, 0x400, "lwx ">;
+ let Defs = [CARRY] in {
+ def LWX : LoadM<0x32, 0x400, "lwx ">;
+ }
}
def LBUI : LoadMI<0x38, "lbui ", zextloadi8>;
@@ -877,6 +879,9 @@ def : Pat<(zextloadi8 xaddr:$addr), (i32 (LBU xaddr:$addr))>;
// Peepholes
def : Pat<(store (i32 0), iaddr:$dst), (SWI (i32 R0), iaddr:$dst)>;
+// Atomic fence
+def : Pat<(atomic_fence (imm), (imm)), (MEMBARRIER)>;
+
//===----------------------------------------------------------------------===//
// Floating Point Support
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
index 32d67b2..ea81dd6 100644
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
@@ -37,7 +37,7 @@ namespace mblazeIntrinsic {
#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
}
-std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, const Type **Tys,
+std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
unsigned numTys) const {
static const char *const names[] = {
#define GET_INTRINSIC_NAME_TABLE
@@ -90,8 +90,8 @@ bool MBlazeIntrinsicInfo::isOverloaded(unsigned IntrID) const {
#include "MBlazeGenIntrinsics.inc"
#undef GET_INTRINSIC_ATTRIBUTES
-static const FunctionType *getType(LLVMContext &Context, unsigned id) {
- const Type *ResultTy = NULL;
+static FunctionType *getType(LLVMContext &Context, unsigned id) {
+ Type *ResultTy = NULL;
std::vector<Type*> ArgTys;
bool IsVarArg = false;
@@ -103,7 +103,7 @@ static const FunctionType *getType(LLVMContext &Context, unsigned id) {
}
Function *MBlazeIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
- const Type **Tys,
+ Type **Tys,
unsigned numTy) const {
assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded");
AttrListPtr AList = getAttributes((mblazeIntrinsic::ID) IntrID);
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.h b/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
index 9804c77..80760d8 100644
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
+++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
@@ -19,12 +19,12 @@ namespace llvm {
class MBlazeIntrinsicInfo : public TargetIntrinsicInfo {
public:
- std::string getName(unsigned IntrID, const Type **Tys = 0,
+ std::string getName(unsigned IntrID, Type **Tys = 0,
unsigned numTys = 0) const;
unsigned lookupName(const char *Name, unsigned Len) const;
unsigned lookupGCCName(const char *Name) const;
bool isOverloaded(unsigned IID) const;
- Function *getDeclaration(Module *M, unsigned ID, const Type **Tys = 0,
+ Function *getDeclaration(Module *M, unsigned ID, Type **Tys = 0,
unsigned numTys = 0) const;
};
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index f0b201a..9788ba9 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -25,7 +25,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -44,164 +43,7 @@ using namespace llvm;
MBlazeRegisterInfo::
MBlazeRegisterInfo(const MBlazeSubtarget &ST, const TargetInstrInfo &tii)
- : MBlazeGenRegisterInfo(), Subtarget(ST), TII(tii) {}
-
-/// getRegisterNumbering - Given the enum value for some register, e.g.
-/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
-unsigned MBlazeRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
- switch (RegEnum) {
- case MBlaze::R0 : return 0;
- case MBlaze::R1 : return 1;
- case MBlaze::R2 : return 2;
- case MBlaze::R3 : return 3;
- case MBlaze::R4 : return 4;
- case MBlaze::R5 : return 5;
- case MBlaze::R6 : return 6;
- case MBlaze::R7 : return 7;
- case MBlaze::R8 : return 8;
- case MBlaze::R9 : return 9;
- case MBlaze::R10 : return 10;
- case MBlaze::R11 : return 11;
- case MBlaze::R12 : return 12;
- case MBlaze::R13 : return 13;
- case MBlaze::R14 : return 14;
- case MBlaze::R15 : return 15;
- case MBlaze::R16 : return 16;
- case MBlaze::R17 : return 17;
- case MBlaze::R18 : return 18;
- case MBlaze::R19 : return 19;
- case MBlaze::R20 : return 20;
- case MBlaze::R21 : return 21;
- case MBlaze::R22 : return 22;
- case MBlaze::R23 : return 23;
- case MBlaze::R24 : return 24;
- case MBlaze::R25 : return 25;
- case MBlaze::R26 : return 26;
- case MBlaze::R27 : return 27;
- case MBlaze::R28 : return 28;
- case MBlaze::R29 : return 29;
- case MBlaze::R30 : return 30;
- case MBlaze::R31 : return 31;
- case MBlaze::RPC : return 0x0000;
- case MBlaze::RMSR : return 0x0001;
- case MBlaze::REAR : return 0x0003;
- case MBlaze::RESR : return 0x0005;
- case MBlaze::RFSR : return 0x0007;
- case MBlaze::RBTR : return 0x000B;
- case MBlaze::REDR : return 0x000D;
- case MBlaze::RPID : return 0x1000;
- case MBlaze::RZPR : return 0x1001;
- case MBlaze::RTLBX : return 0x1002;
- case MBlaze::RTLBLO : return 0x1003;
- case MBlaze::RTLBHI : return 0x1004;
- case MBlaze::RPVR0 : return 0x2000;
- case MBlaze::RPVR1 : return 0x2001;
- case MBlaze::RPVR2 : return 0x2002;
- case MBlaze::RPVR3 : return 0x2003;
- case MBlaze::RPVR4 : return 0x2004;
- case MBlaze::RPVR5 : return 0x2005;
- case MBlaze::RPVR6 : return 0x2006;
- case MBlaze::RPVR7 : return 0x2007;
- case MBlaze::RPVR8 : return 0x2008;
- case MBlaze::RPVR9 : return 0x2009;
- case MBlaze::RPVR10 : return 0x200A;
- case MBlaze::RPVR11 : return 0x200B;
- default: llvm_unreachable("Unknown register number!");
- }
- return 0; // Not reached
-}
-
-/// getRegisterFromNumbering - Given the enum value for some register, e.g.
-/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
-unsigned MBlazeRegisterInfo::getRegisterFromNumbering(unsigned Reg) {
- switch (Reg) {
- case 0 : return MBlaze::R0;
- case 1 : return MBlaze::R1;
- case 2 : return MBlaze::R2;
- case 3 : return MBlaze::R3;
- case 4 : return MBlaze::R4;
- case 5 : return MBlaze::R5;
- case 6 : return MBlaze::R6;
- case 7 : return MBlaze::R7;
- case 8 : return MBlaze::R8;
- case 9 : return MBlaze::R9;
- case 10 : return MBlaze::R10;
- case 11 : return MBlaze::R11;
- case 12 : return MBlaze::R12;
- case 13 : return MBlaze::R13;
- case 14 : return MBlaze::R14;
- case 15 : return MBlaze::R15;
- case 16 : return MBlaze::R16;
- case 17 : return MBlaze::R17;
- case 18 : return MBlaze::R18;
- case 19 : return MBlaze::R19;
- case 20 : return MBlaze::R20;
- case 21 : return MBlaze::R21;
- case 22 : return MBlaze::R22;
- case 23 : return MBlaze::R23;
- case 24 : return MBlaze::R24;
- case 25 : return MBlaze::R25;
- case 26 : return MBlaze::R26;
- case 27 : return MBlaze::R27;
- case 28 : return MBlaze::R28;
- case 29 : return MBlaze::R29;
- case 30 : return MBlaze::R30;
- case 31 : return MBlaze::R31;
- default: llvm_unreachable("Unknown register number!");
- }
- return 0; // Not reached
-}
-
-unsigned MBlazeRegisterInfo::getSpecialRegisterFromNumbering(unsigned Reg) {
- switch (Reg) {
- case 0x0000 : return MBlaze::RPC;
- case 0x0001 : return MBlaze::RMSR;
- case 0x0003 : return MBlaze::REAR;
- case 0x0005 : return MBlaze::RESR;
- case 0x0007 : return MBlaze::RFSR;
- case 0x000B : return MBlaze::RBTR;
- case 0x000D : return MBlaze::REDR;
- case 0x1000 : return MBlaze::RPID;
- case 0x1001 : return MBlaze::RZPR;
- case 0x1002 : return MBlaze::RTLBX;
- case 0x1003 : return MBlaze::RTLBLO;
- case 0x1004 : return MBlaze::RTLBHI;
- case 0x2000 : return MBlaze::RPVR0;
- case 0x2001 : return MBlaze::RPVR1;
- case 0x2002 : return MBlaze::RPVR2;
- case 0x2003 : return MBlaze::RPVR3;
- case 0x2004 : return MBlaze::RPVR4;
- case 0x2005 : return MBlaze::RPVR5;
- case 0x2006 : return MBlaze::RPVR6;
- case 0x2007 : return MBlaze::RPVR7;
- case 0x2008 : return MBlaze::RPVR8;
- case 0x2009 : return MBlaze::RPVR9;
- case 0x200A : return MBlaze::RPVR10;
- case 0x200B : return MBlaze::RPVR11;
- default: llvm_unreachable("Unknown register number!");
- }
- return 0; // Not reached
-}
-
-bool MBlazeRegisterInfo::isRegister(unsigned Reg) {
- return Reg <= 31;
-}
-
-bool MBlazeRegisterInfo::isSpecialRegister(unsigned Reg) {
- switch (Reg) {
- case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 :
- case 0x0007 : case 0x000B : case 0x000D : case 0x1000 :
- case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 :
- case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 :
- case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 :
- case 0x2008 : case 0x2009 : case 0x200A : case 0x200B :
- return true;
-
- default:
- return false;
- }
- return false; // Not reached
-}
+ : MBlazeGenRegisterInfo(MBlaze::R15), Subtarget(ST), TII(tii) {}
unsigned MBlazeRegisterInfo::getPICCallReg() {
return MBlaze::R20;
@@ -334,10 +176,6 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
MFI->setObjectOffset(MBlazeFI->getGPFI(), MBlazeFI->getGPStackOffset());
}
-unsigned MBlazeRegisterInfo::getRARegister() const {
- return MBlaze::R15;
-}
-
unsigned MBlazeRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -353,11 +191,3 @@ unsigned MBlazeRegisterInfo::getEHHandlerRegister() const {
llvm_unreachable("What is the exception handler register");
return 0;
}
-
-int MBlazeRegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
- return MBlazeGenRegisterInfo::getDwarfRegNumFull(RegNo,0);
-}
-
-int MBlazeRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
- return MBlazeGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
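
Net effect of this file's diff: the hand-written enum/number switches move to
MCTargetDesc/MBlazeBaseInfo.h as free functions, and the return-address
register is handed to the TableGen'd base (MBlazeGenRegisterInfo(MBlaze::R15)),
which then answers getRARegister and the DWARF mappings itself. A sketch of the
two simplest surviving predicates, reconstructed from the removed bodies:

// Free-function form of the old MBlazeRegisterInfo::isRegister: general
// registers are exactly the hardware numbers 0..31.
static inline bool isMBlazeRegister(unsigned Reg) {
  return Reg <= 31;
}

// Free-function form of the old isSpecialRegister: a whitelist of the
// special-purpose register numbers (RPC, RMSR, ..., RPVR11).
static inline bool isSpecialMBlazeRegister(unsigned Reg) {
  switch (Reg) {
  case 0x0000: case 0x0001: case 0x0003: case 0x0005:
  case 0x0007: case 0x000B: case 0x000D: case 0x1000:
  case 0x1001: case 0x1002: case 0x1003: case 0x1004:
  case 0x2000: case 0x2001: case 0x2002: case 0x2003:
  case 0x2004: case 0x2005: case 0x2006: case 0x2007:
  case 0x2008: case 0x2009: case 0x200A: case 0x200B:
    return true;
  default:
    return false;
  }
}
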
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h
index 7ebce21..7e4b269 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.h
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.h
@@ -42,14 +42,6 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
MBlazeRegisterInfo(const MBlazeSubtarget &Subtarget,
const TargetInstrInfo &tii);
- /// getRegisterNumbering - Given the enum value for some register, e.g.
- /// MBlaze::RA, return the number that it corresponds to (e.g. 31).
- static unsigned getRegisterNumbering(unsigned RegEnum);
- static unsigned getRegisterFromNumbering(unsigned RegEnum);
- static unsigned getSpecialRegisterFromNumbering(unsigned RegEnum);
- static bool isRegister(unsigned RegEnum);
- static bool isSpecialRegister(unsigned RegEnum);
-
/// Get PIC indirect call register
static unsigned getPICCallReg();
@@ -69,15 +61,11 @@ struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
/// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
/// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
-
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
};
} // end namespace llvm
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp
index eda141d..7e5667f 100644
--- a/lib/Target/MBlaze/MBlazeSubtarget.cpp
+++ b/lib/Target/MBlaze/MBlazeSubtarget.cpp
@@ -15,7 +15,7 @@
#include "MBlaze.h"
#include "MBlazeRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
index 7208874..7bff53e 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
@@ -16,48 +16,13 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
using namespace llvm;
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
- MCContext &Ctx, TargetAsmBackend &TAB,
- raw_ostream &_OS,
- MCCodeEmitter *_Emitter,
- bool RelaxAll,
- bool NoExecStack) {
- Triple TheTriple(TT);
-
- if (TheTriple.isOSDarwin()) {
- llvm_unreachable("MBlaze does not support Darwin MACH-O format");
- return NULL;
- }
-
- if (TheTriple.isOSWindows()) {
- llvm_unreachable("MBlaze does not support Windows COFF format");
- return NULL;
- }
-
- return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
-}
-
-
extern "C" void LLVMInitializeMBlazeTarget() {
// Register the target.
RegisterTargetMachine<MBlazeTargetMachine> X(TheMBlazeTarget);
-
- // Register the MC code emitter
- TargetRegistry::RegisterCodeEmitter(TheMBlazeTarget,
- llvm::createMBlazeMCCodeEmitter);
-
- // Register the asm backend
- TargetRegistry::RegisterAsmBackend(TheMBlazeTarget,
- createMBlazeAsmBackend);
-
- // Register the object streamer
- TargetRegistry::RegisterObjectStreamer(TheMBlazeTarget,
- createMCStreamer);
-
}
// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
@@ -67,21 +32,16 @@ extern "C" void LLVMInitializeMBlazeTarget() {
// offset from the stack/frame pointer, using StackGrowsUp enables
// an easier handling.
MBlazeTargetMachine::
-MBlazeTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS):
- LLVMTargetMachine(T, TT, CPU, FS),
+MBlazeTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM):
+ LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS),
DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"),
InstrInfo(*this),
FrameLowering(Subtarget),
TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this),
InstrItins(Subtarget.getInstrItineraryData()) {
- if (getRelocationModel() == Reloc::Default) {
- setRelocationModel(Reloc::Static);
- }
-
- if (getCodeModel() == CodeModel::Default)
- setCodeModel(CodeModel::Small);
}
// Install an instruction selector pass using
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h
index cd6caaf..c1bc08a 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.h
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.h
@@ -41,8 +41,9 @@ namespace llvm {
InstrItineraryData InstrItins;
public:
- MBlazeTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ MBlazeTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const MBlazeInstrInfo *getInstrInfo() const
{ return &InstrInfo; }
diff --git a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
index abd1b0b..f66ea30 100644
--- a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
@@ -69,7 +69,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
if (Kind.isMergeable1ByteCString())
return false;
- const Type *Ty = GV->getType()->getElementType();
+ Type *Ty = GV->getType()->getElementType();
return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
}
diff --git a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt b/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
index 3d15708..37871b6 100644
--- a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,15 @@
add_llvm_library(LLVMMBlazeDesc
- MBlazeMCTargetDesc.cpp
+ MBlazeAsmBackend.cpp
MBlazeMCAsmInfo.cpp
+ MBlazeMCCodeEmitter.cpp
+ MBlazeMCTargetDesc.cpp
+ )
+
+add_llvm_library_dependencies(LLVMMBlazeDesc
+ LLVMMBlazeAsmPrinter
+ LLVMMBlazeInfo
+ LLVMMC
+ LLVMSupport
)
+
+add_dependencies(LLVMMBlazeDesc MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
index 08f14c3..08f7d46a 100644
--- a/lib/Target/MBlaze/MBlazeAsmBackend.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
@@ -7,10 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmBackend.h"
-#include "MBlaze.h"
-#include "MBlazeELFWriterInfo.h"
-#include "llvm/ADT/Twine.h"
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -20,11 +18,11 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmBackend.h"
using namespace llvm;
static unsigned getFixupKindSize(unsigned Kind) {
@@ -48,10 +46,10 @@ public:
/*HasRelocationAddend*/ true) {}
};
-class MBlazeAsmBackend : public TargetAsmBackend {
+class MBlazeAsmBackend : public MCAsmBackend {
public:
MBlazeAsmBackend(const Target &T)
- : TargetAsmBackend() {
+ : MCAsmBackend() {
}
unsigned getNumFixupKinds() const {
@@ -148,8 +146,7 @@ void ELFMBlazeAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data,
}
} // end anonymous namespace
-TargetAsmBackend *llvm::createMBlazeAsmBackend(const Target &T,
- const std::string &TT) {
+MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin())
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
new file mode 100644
index 0000000..776dbc4
--- /dev/null
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
@@ -0,0 +1,240 @@
+//===-- MBlazeBaseInfo.h - Top level definitions for MBlaze ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the MBlaze target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MBlazeBASEINFO_H
+#define MBlazeBASEINFO_H
+
+#include "MBlazeMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// MBlazeII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MBlazeII {
+ enum {
+ // FPseudo - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but it is tolerated for intermediate implementation stages.
+ FPseudo = 0,
+ FRRR,
+ FRRI,
+ FCRR,
+ FCRI,
+ FRCR,
+ FRCI,
+ FCCR,
+ FCCI,
+ FRRCI,
+ FRRC,
+ FRCX,
+ FRCS,
+ FCRCS,
+ FCRCX,
+ FCX,
+ FCR,
+ FRIR,
+ FRRRR,
+ FRI,
+ FC,
+ FormMask = 63
+
+ //===------------------------------------------------------------------===//
+ // MBlaze Specific MachineOperand flags.
+ // MO_NO_FLAG,
+
+ /// MO_GOT - Represents the offset into the global offset table at which
+ /// the address of the relocation entry symbol resides during execution.
+ // MO_GOT,
+
+ /// MO_GOT_CALL - Represents the offset into the global offset table at
+ /// which the address of a call site relocation entry symbol resides
+ /// during execution. This is different from the above since this flag
+ /// can only be present in call instructions.
+ // MO_GOT_CALL,
+
+ /// MO_GPREL - Represents the offset from the current gp value to be used
+ /// for the relocatable object file being produced.
+ // MO_GPREL,
+
+ /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
+ /// address.
+ // MO_ABS_HILO
+
+ };
+}
+
+static inline bool isMBlazeRegister(unsigned Reg) {
+ return Reg <= 31;
+}
+
+static inline bool isSpecialMBlazeRegister(unsigned Reg) {
+ switch (Reg) {
+ case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 :
+ case 0x0007 : case 0x000B : case 0x000D : case 0x1000 :
+ case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 :
+ case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 :
+ case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 :
+ case 0x2008 : case 0x2009 : case 0x200A : case 0x200B :
+ return true;
+
+ default:
+ return false;
+ }
+ return false; // Not reached
+}
+
+/// getMBlazeRegisterNumbering - Given the enum value for some register, e.g.
+/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
+static inline unsigned getMBlazeRegisterNumbering(unsigned RegEnum) {
+ switch (RegEnum) {
+ case MBlaze::R0 : return 0;
+ case MBlaze::R1 : return 1;
+ case MBlaze::R2 : return 2;
+ case MBlaze::R3 : return 3;
+ case MBlaze::R4 : return 4;
+ case MBlaze::R5 : return 5;
+ case MBlaze::R6 : return 6;
+ case MBlaze::R7 : return 7;
+ case MBlaze::R8 : return 8;
+ case MBlaze::R9 : return 9;
+ case MBlaze::R10 : return 10;
+ case MBlaze::R11 : return 11;
+ case MBlaze::R12 : return 12;
+ case MBlaze::R13 : return 13;
+ case MBlaze::R14 : return 14;
+ case MBlaze::R15 : return 15;
+ case MBlaze::R16 : return 16;
+ case MBlaze::R17 : return 17;
+ case MBlaze::R18 : return 18;
+ case MBlaze::R19 : return 19;
+ case MBlaze::R20 : return 20;
+ case MBlaze::R21 : return 21;
+ case MBlaze::R22 : return 22;
+ case MBlaze::R23 : return 23;
+ case MBlaze::R24 : return 24;
+ case MBlaze::R25 : return 25;
+ case MBlaze::R26 : return 26;
+ case MBlaze::R27 : return 27;
+ case MBlaze::R28 : return 28;
+ case MBlaze::R29 : return 29;
+ case MBlaze::R30 : return 30;
+ case MBlaze::R31 : return 31;
+ case MBlaze::RPC : return 0x0000;
+ case MBlaze::RMSR : return 0x0001;
+ case MBlaze::REAR : return 0x0003;
+ case MBlaze::RESR : return 0x0005;
+ case MBlaze::RFSR : return 0x0007;
+ case MBlaze::RBTR : return 0x000B;
+ case MBlaze::REDR : return 0x000D;
+ case MBlaze::RPID : return 0x1000;
+ case MBlaze::RZPR : return 0x1001;
+ case MBlaze::RTLBX : return 0x1002;
+ case MBlaze::RTLBLO : return 0x1003;
+ case MBlaze::RTLBHI : return 0x1004;
+ case MBlaze::RPVR0 : return 0x2000;
+ case MBlaze::RPVR1 : return 0x2001;
+ case MBlaze::RPVR2 : return 0x2002;
+ case MBlaze::RPVR3 : return 0x2003;
+ case MBlaze::RPVR4 : return 0x2004;
+ case MBlaze::RPVR5 : return 0x2005;
+ case MBlaze::RPVR6 : return 0x2006;
+ case MBlaze::RPVR7 : return 0x2007;
+ case MBlaze::RPVR8 : return 0x2008;
+ case MBlaze::RPVR9 : return 0x2009;
+ case MBlaze::RPVR10 : return 0x200A;
+ case MBlaze::RPVR11 : return 0x200B;
+ default: llvm_unreachable("Unknown register number!");
+ }
+ return 0; // Not reached
+}
+
+/// getMBlazeRegisterFromNumbering - Given a register number, e.g. 0, return
+/// the register enum value that it corresponds to (e.g. MBlaze::R0).
+static inline unsigned getMBlazeRegisterFromNumbering(unsigned Reg) {
+ switch (Reg) {
+ case 0 : return MBlaze::R0;
+ case 1 : return MBlaze::R1;
+ case 2 : return MBlaze::R2;
+ case 3 : return MBlaze::R3;
+ case 4 : return MBlaze::R4;
+ case 5 : return MBlaze::R5;
+ case 6 : return MBlaze::R6;
+ case 7 : return MBlaze::R7;
+ case 8 : return MBlaze::R8;
+ case 9 : return MBlaze::R9;
+ case 10 : return MBlaze::R10;
+ case 11 : return MBlaze::R11;
+ case 12 : return MBlaze::R12;
+ case 13 : return MBlaze::R13;
+ case 14 : return MBlaze::R14;
+ case 15 : return MBlaze::R15;
+ case 16 : return MBlaze::R16;
+ case 17 : return MBlaze::R17;
+ case 18 : return MBlaze::R18;
+ case 19 : return MBlaze::R19;
+ case 20 : return MBlaze::R20;
+ case 21 : return MBlaze::R21;
+ case 22 : return MBlaze::R22;
+ case 23 : return MBlaze::R23;
+ case 24 : return MBlaze::R24;
+ case 25 : return MBlaze::R25;
+ case 26 : return MBlaze::R26;
+ case 27 : return MBlaze::R27;
+ case 28 : return MBlaze::R28;
+ case 29 : return MBlaze::R29;
+ case 30 : return MBlaze::R30;
+ case 31 : return MBlaze::R31;
+ default: llvm_unreachable("Unknown register number!");
+ }
+ return 0; // Not reached
+}
+
+static inline unsigned getSpecialMBlazeRegisterFromNumbering(unsigned Reg) {
+ switch (Reg) {
+ case 0x0000 : return MBlaze::RPC;
+ case 0x0001 : return MBlaze::RMSR;
+ case 0x0003 : return MBlaze::REAR;
+ case 0x0005 : return MBlaze::RESR;
+ case 0x0007 : return MBlaze::RFSR;
+ case 0x000B : return MBlaze::RBTR;
+ case 0x000D : return MBlaze::REDR;
+ case 0x1000 : return MBlaze::RPID;
+ case 0x1001 : return MBlaze::RZPR;
+ case 0x1002 : return MBlaze::RTLBX;
+ case 0x1003 : return MBlaze::RTLBLO;
+ case 0x1004 : return MBlaze::RTLBHI;
+ case 0x2000 : return MBlaze::RPVR0;
+ case 0x2001 : return MBlaze::RPVR1;
+ case 0x2002 : return MBlaze::RPVR2;
+ case 0x2003 : return MBlaze::RPVR3;
+ case 0x2004 : return MBlaze::RPVR4;
+ case 0x2005 : return MBlaze::RPVR5;
+ case 0x2006 : return MBlaze::RPVR6;
+ case 0x2007 : return MBlaze::RPVR7;
+ case 0x2008 : return MBlaze::RPVR8;
+ case 0x2009 : return MBlaze::RPVR9;
+ case 0x200A : return MBlaze::RPVR10;
+ case 0x200B : return MBlaze::RPVR11;
+ default: llvm_unreachable("Unknown register number!");
+ }
+ return 0; // Not reached
+}
+
+} // end namespace llvm
+
+#endif
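The helpers above are pure value maps, and for general-purpose registers the
two directions invert each other; note, however, that the special-register
encodings overlap the GPR range (0x0000 is both R0's number and RPC's), so a
caller must know which map applies. A minimal round-trip sketch, assuming
MBlazeBaseInfo.h is on the include path (the function name is illustrative):

    #include "MCTargetDesc/MBlazeBaseInfo.h"
    #include <cassert>
    using namespace llvm;

    // Encode a GPR operand into its 5-bit field and check the round trip.
    static unsigned encodeGPR(unsigned RegEnum) {
      unsigned N = getMBlazeRegisterNumbering(RegEnum); // MBlaze::R20 -> 20
      assert(isMBlazeRegister(N) && "GPR numbers fit in 5 bits");
      assert(getMBlazeRegisterFromNumbering(N) == RegEnum && "round trip");
      return N;
    }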
diff --git a/lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
index ddc636d..1514557 100644
--- a/lib/Target/MBlaze/MBlazeMCCodeEmitter.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
@@ -12,11 +12,13 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mccodeemitter"
-#include "MBlaze.h"
-#include "MBlazeInstrInfo.h"
+#include "MCTargetDesc/MBlazeBaseInfo.h"
+#include "MCTargetDesc/MBlazeMCTargetDesc.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/ADT/Statistic.h"
@@ -106,7 +108,7 @@ MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
unsigned MBlazeMCCodeEmitter::getMachineOpValue(const MCInst &MI,
const MCOperand &MO) const {
if (MO.isReg())
- return MBlazeRegisterInfo::getRegisterNumbering(MO.getReg());
+ return getMBlazeRegisterNumbering(MO.getReg());
else if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
else if (MO.isExpr())
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
index 20d6c0b..43ae281 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
@@ -13,10 +13,14 @@
#include "MBlazeMCTargetDesc.h"
#include "MBlazeMCAsmInfo.h"
+#include "InstPrinter/MBlazeInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "MBlazeGenInstrInfo.inc"
@@ -36,8 +40,10 @@ static MCInstrInfo *createMBlazeMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeMBlazeMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheMBlazeTarget, createMBlazeMCInstrInfo);
+static MCRegisterInfo *createMBlazeMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitMBlazeMCRegisterInfo(X, MBlaze::R15);
+ return X;
}
static MCSubtargetInfo *createMBlazeMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -47,11 +53,6 @@ static MCSubtargetInfo *createMBlazeMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializeMBlazeMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheMBlazeTarget,
- createMBlazeMCSubtargetInfo);
-}
-
static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
Triple TheTriple(TT);
switch (TheTriple.getOS()) {
@@ -60,6 +61,80 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
}
}
-extern "C" void LLVMInitializeMBlazeMCAsmInfo() {
+static MCCodeGenInfo *createMBlazeMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ if (RM == Reloc::Default)
+ RM = Reloc::Static;
+ if (CM == CodeModel::Default)
+ CM = CodeModel::Small;
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &MAB,
+ raw_ostream &_OS,
+ MCCodeEmitter *_Emitter,
+ bool RelaxAll,
+ bool NoExecStack) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin()) {
+ llvm_unreachable("MBlaze does not support Darwin MACH-O format");
+ return NULL;
+ }
+
+ if (TheTriple.isOSWindows()) {
+ llvm_unreachable("MBlaze does not support Windows COFF format");
+ return NULL;
+ }
+
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) {
+ if (SyntaxVariant == 0)
+ return new MBlazeInstPrinter(MAI);
+ return 0;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMBlazeTargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfoFn X(TheMBlazeTarget, createMCAsmInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheMBlazeTarget,
+ createMBlazeMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheMBlazeTarget, createMBlazeMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheMBlazeTarget,
+ createMBlazeMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheMBlazeTarget,
+ createMBlazeMCSubtargetInfo);
+
+ // Register the MC code emitter
+ TargetRegistry::RegisterMCCodeEmitter(TheMBlazeTarget,
+ llvm::createMBlazeMCCodeEmitter);
+
+ // Register the asm backend
+ TargetRegistry::RegisterMCAsmBackend(TheMBlazeTarget,
+ createMBlazeAsmBackend);
+
+ // Register the object streamer
+ TargetRegistry::RegisterMCObjectStreamer(TheMBlazeTarget,
+ createMCStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheMBlazeTarget,
+ createMBlazeMCInstPrinter);
}
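With every MC component registered by the single LLVMInitializeMBlazeTargetMC()
entry point, clients no longer call per-component initializers. A minimal
sketch of reaching the registered components, assuming the LLVM 3.0-era
TargetRegistry API (the triple string and cleanup are illustrative):

    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/MC/MCInstrInfo.h"
    #include "llvm/MC/MCRegisterInfo.h"
    #include <string>
    extern "C" void LLVMInitializeMBlazeTargetMC();

    void useMBlazeMC() {
      LLVMInitializeMBlazeTargetMC();
      std::string Err;
      const llvm::Target *T = llvm::TargetRegistry::lookupTarget("mblaze", Err);
      if (!T) return; // target not linked in
      llvm::MCInstrInfo *MII = T->createMCInstrInfo();
      llvm::MCRegisterInfo *MRI = T->createMCRegInfo("mblaze");
      // The asm backend, code emitter, and streamer are reached the same way.
      delete MRI;
      delete MII;
    }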
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
index b14772e..deff5cb 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
@@ -15,12 +15,23 @@
#define MBLAZEMCTARGETDESC_H
namespace llvm {
+class MCAsmBackend;
+class MCContext;
+class MCCodeEmitter;
+class MCInstrInfo;
class MCSubtargetInfo;
class Target;
class StringRef;
+class formatted_raw_ostream;
extern Target TheMBlazeTarget;
+MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+
+MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT);
+
} // End llvm namespace
// Defines symbolic names for MBlaze registers. This defines a mapping from
diff --git a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
index 40696f6..93fce58 100644
--- a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
+++ b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
@@ -5,4 +5,10 @@ add_llvm_library(LLVMMBlazeInfo
MBlazeTargetInfo.cpp
)
-add_dependencies(LLVMMBlazeInfo MBlazeCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMBlazeInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMMBlazeInfo MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp b/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
index 16e01db..71210d8 100644
--- a/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
+++ b/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "MBlaze.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheMBlazeTarget;
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index 33f3d44..0952b76 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -1,11 +1,12 @@
set(LLVM_TARGET_DEFINITIONS MSP430.td)
-tablegen(MSP430GenRegisterInfo.inc -gen-register-info)
-tablegen(MSP430GenInstrInfo.inc -gen-instr-info)
-tablegen(MSP430GenAsmWriter.inc -gen-asm-writer)
-tablegen(MSP430GenDAGISel.inc -gen-dag-isel)
-tablegen(MSP430GenCallingConv.inc -gen-callingconv)
-tablegen(MSP430GenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(MSP430GenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(MSP430GenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(MSP430GenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(MSP430GenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(MSP430GenCallingConv.inc -gen-callingconv)
+llvm_tablegen(MSP430GenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(MSP430CommonTableGen)
add_llvm_target(MSP430CodeGen
MSP430BranchSelector.cpp
@@ -21,6 +22,19 @@ add_llvm_target(MSP430CodeGen
MSP430MCInstLower.cpp
)
+add_llvm_library_dependencies(LLVMMSP430CodeGen
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMMSP430AsmPrinter
+ LLVMMSP430Desc
+ LLVMMSP430Info
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/MSP430/InstPrinter/CMakeLists.txt b/lib/Target/MSP430/InstPrinter/CMakeLists.txt
index f5458d5..ce39d95 100644
--- a/lib/Target/MSP430/InstPrinter/CMakeLists.txt
+++ b/lib/Target/MSP430/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
add_llvm_library(LLVMMSP430AsmPrinter
MSP430InstPrinter.cpp
)
-add_dependencies(LLVMMSP430AsmPrinter MSP430CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMMSP430AsmPrinter
+ LLVMMC
+ LLVMSupport
+ )
+
+add_dependencies(LLVMMSP430AsmPrinter MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
index e10d4fe..5d6c6ad 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -25,8 +25,10 @@ using namespace llvm;
// Include the auto-generated portion of the assembly writer.
#include "MSP430GenAsmWriter.inc"
-void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void MSP430InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
printInstruction(MI, O);
+ printAnnotation(O, Annot);
}
void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
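The extra StringRef parameter threads a streamer-supplied annotation through
printInst(), and printAnnotation() appends it as an end-of-line asm comment.
A minimal usage sketch (the annotation text is illustrative; the real caller
is the MCStreamer asm-printing path):

    #include "MSP430InstPrinter.h"
    #include "llvm/MC/MCInst.h"
    #include "llvm/Support/raw_ostream.h"

    void demo(llvm::MSP430InstPrinter &P, const llvm::MCInst &MI) {
      // Prints the instruction, then the annotation as a trailing comment.
      P.printInst(&MI, llvm::outs(), "imm folded");
    }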
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
index 50d98b7..a1984a8 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -22,9 +22,9 @@ namespace llvm {
class MSP430InstPrinter : public MCInstPrinter {
public:
MSP430InstPrinter(const MCAsmInfo &MAI)
- : MCInstPrinter(MAI) {}
+ : MCInstPrinter(MAI) {}
- virtual void printInst(const MCInst *MI, raw_ostream &O);
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
index 0f3ebd3..04bd03e 100644
--- a/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/MSP430/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,11 @@ add_llvm_library(LLVMMSP430Desc
MSP430MCTargetDesc.cpp
MSP430MCAsmInfo.cpp
)
+
+add_llvm_library_dependencies(LLVMMSP430Desc
+ LLVMMC
+ LLVMMSP430AsmPrinter
+ LLVMMSP430Info
+ )
+
+add_dependencies(LLVMMSP430Desc MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 43a704d..fda70b8 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -13,10 +13,12 @@
#include "MSP430MCTargetDesc.h"
#include "MSP430MCAsmInfo.h"
+#include "InstPrinter/MSP430InstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "MSP430GenInstrInfo.inc"
@@ -29,18 +31,18 @@
using namespace llvm;
-
static MCInstrInfo *createMSP430MCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitMSP430MCInstrInfo(X);
return X;
}
-extern "C" void LLVMInitializeMSP430MCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheMSP430Target, createMSP430MCInstrInfo);
+static MCRegisterInfo *createMSP430MCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitMSP430MCRegisterInfo(X, MSP430::PCW);
+ return X;
}
-
static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS) {
MCSubtargetInfo *X = new MCSubtargetInfo();
@@ -48,11 +50,42 @@ static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializeMSP430MCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheMSP430Target,
- createMSP430MCSubtargetInfo);
+static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
+}
+
+static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) {
+ if (SyntaxVariant == 0)
+ return new MSP430InstPrinter(MAI);
+ return 0;
}
-extern "C" void LLVMInitializeMSP430MCAsmInfo() {
+extern "C" void LLVMInitializeMSP430TargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfo<MSP430MCAsmInfo> X(TheMSP430Target);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheMSP430Target,
+ createMSP430MCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheMSP430Target, createMSP430MCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheMSP430Target,
+ createMSP430MCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheMSP430Target,
+ createMSP430MCSubtargetInfo);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheMSP430Target,
+ createMSP430MCInstPrinter);
}
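Unlike MBlaze above, MSP430 forwards RM and CM to InitMCCodeGenInfo unchanged:
the defaulting that used to sit in each TargetMachine constructor now belongs
in this per-target MCCodeGenInfo hook, and MSP430 simply has no defaults to
impose. A minimal sketch of reaching the hook through the registry, assuming
the LLVM 3.0-era Target API (the function name is illustrative):

    #include "llvm/MC/MCCodeGenInfo.h"
    #include "llvm/Support/TargetRegistry.h"
    extern "C" void LLVMInitializeMSP430TargetMC();

    llvm::MCCodeGenInfo *makeMSP430CGI(const llvm::Target &T) {
      LLVMInitializeMSP430TargetMC();
      // The models passed here are exactly what the MC layer reports back.
      return T.createMCCodeGenInfo("msp430", llvm::Reloc::Static,
                                   llvm::CodeModel::Small);
    }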
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
index 0d8a6bd..35f2590 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef ALPHAMCTARGETDESC_H
-#define ALPHAMCTARGETDESC_H
+#ifndef MSP430MCTARGETDESC_H
+#define MSP430MCTARGETDESC_H
namespace llvm {
class MCSubtargetInfo;
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 2042056..8836549 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -32,9 +32,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -163,17 +161,7 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutStreamer.EmitInstruction(TmpInst);
}
-static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
- unsigned SyntaxVariant,
- const MCAsmInfo &MAI) {
- if (SyntaxVariant == 0)
- return new MSP430InstPrinter(MAI);
- return 0;
-}
-
// Force static initialization.
extern "C" void LLVMInitializeMSP430AsmPrinter() {
RegisterAsmPrinter<MSP430AsmPrinter> X(TheMSP430Target);
- TargetRegistry::RegisterMCInstPrinter(TheMSP430Target,
- createMSP430MCInstPrinter);
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 0a3eab1..dc37431 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -79,6 +79,7 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
setStackPointerRegisterToSaveRestore(MSP430::SPW);
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
setSchedulingPreference(Sched::Latency);
// We have post-incremented loads / stores.
@@ -987,8 +988,8 @@ const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-bool MSP430TargetLowering::isTruncateFree(const Type *Ty1,
- const Type *Ty2) const {
+bool MSP430TargetLowering::isTruncateFree(Type *Ty1,
+ Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
@@ -1002,7 +1003,7 @@ bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
return (VT1.getSizeInBits() > VT2.getSizeInBits());
}
-bool MSP430TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
return 0 && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index bd660a0..237f604 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -102,7 +102,7 @@ namespace llvm {
/// isTruncateFree - Return true if it's free to truncate a value of type
/// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
/// register R15W to i8 by referencing its sub-register R15B.
- virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
/// isZExtFree - Return true if any actual instruction that defines a value
@@ -113,7 +113,7 @@ namespace llvm {
/// necessarily apply to truncate instructions. e.g. on msp430, all
/// instructions that define 8-bit values implicit zero-extend the result
/// out to 16 bits.
- virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 846d093..ffd4318 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -20,8 +20,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_CTOR
#include "MSP430GenInstrInfo.inc"
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 1cc60bb..9049c4b 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
// FIXME: Provide proper call frame setup / destroy opcodes.
MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
const TargetInstrInfo &tii)
- : MSP430GenRegisterInfo(), TM(tm), TII(tii) {
+ : MSP430GenRegisterInfo(MSP430::PCW), TM(tm), TII(tii) {
StackAlign = TM.getFrameLowering()->getStackAlignment();
}
@@ -233,22 +233,8 @@ MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
}
}
-unsigned MSP430RegisterInfo::getRARegister() const {
- return MSP430::PCW;
-}
-
unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
return TFI->hasFP(MF) ? MSP430::FPW : MSP430::SPW;
}
-
-int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- llvm_unreachable("Not implemented yet!");
- return 0;
-}
-
-int MSP430RegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
- llvm_unreachable("Not implemented yet!");
- return 0;
-}
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index fb70594..10a3d53 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -58,12 +58,7 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
-
- //! Get DWARF debugging register number
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
};
} // end namespace llvm
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index b58c50a..3ee14d9 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -13,7 +13,7 @@
#include "MSP430Subtarget.h"
#include "MSP430.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 971f512..4dd8933 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -16,7 +16,7 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
extern "C" void LLVMInitializeMSP430Target() {
@@ -25,10 +25,11 @@ extern "C" void LLVMInitializeMSP430Target() {
}
MSP430TargetMachine::MSP430TargetMachine(const Target &T,
- const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : LLVMTargetMachine(T, TT, CPU, FS),
+ StringRef TT,
+ StringRef CPU,
+ StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS),
// FIXME: Check TargetData string.
DataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index 2a9eea0..eb483dc 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -38,8 +38,9 @@ class MSP430TargetMachine : public LLVMTargetMachine {
MSP430FrameLowering FrameLowering;
public:
- MSP430TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ MSP430TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const TargetFrameLowering *getFrameLowering() const {
return &FrameLowering;
diff --git a/lib/Target/MSP430/TargetInfo/CMakeLists.txt b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
index 2d1aa9d..1526946 100644
--- a/lib/Target/MSP430/TargetInfo/CMakeLists.txt
+++ b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMMSP430Info
MSP430TargetInfo.cpp
)
-add_dependencies(LLVMMSP430Info MSP430CodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMSP430Info
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMMSP430Info MSP430CommonTableGen)
diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
index f9ca5c4..8b3e01e 100644
--- a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
+++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -9,7 +9,7 @@
#include "MSP430.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheMSP430Target;
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index 46c687b..53ad155f 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -159,7 +159,7 @@ static void AddFastCallStdCallSuffix(SmallVectorImpl<char> &OutName,
unsigned ArgWords = 0;
for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
AI != AE; ++AI) {
- const Type *Ty = AI->getType();
+ Type *Ty = AI->getType();
// 'Dereference' type in case of byval parameter attribute
if (AI->hasByValAttr())
Ty = cast<PointerType>(Ty)->getElementType();
@@ -214,7 +214,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
// fastcall and stdcall functions usually need @42 at the end to specify
// the argument info.
- const FunctionType *FT = F->getFunctionType();
+ FunctionType *FT = F->getFunctionType();
if ((CC == CallingConv::X86_FastCall || CC == CallingConv::X86_StdCall) &&
// "Pure" variadic functions do not receive @0 suffix.
(!FT->isVarArg() || FT->getNumParams() == 0 ||
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 36ab1a9..1b4329b 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -1,17 +1,20 @@
set(LLVM_TARGET_DEFINITIONS Mips.td)
-tablegen(MipsGenRegisterInfo.inc -gen-register-info)
-tablegen(MipsGenInstrInfo.inc -gen-instr-info)
-tablegen(MipsGenAsmWriter.inc -gen-asm-writer)
-tablegen(MipsGenDAGISel.inc -gen-dag-isel)
-tablegen(MipsGenCallingConv.inc -gen-callingconv)
-tablegen(MipsGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(MipsGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(MipsGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(MipsGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(MipsGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(MipsGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(MipsGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(MipsCommonTableGen)
add_llvm_target(MipsCodeGen
MipsAsmPrinter.cpp
+ MipsCodeEmitter.cpp
MipsDelaySlotFiller.cpp
MipsEmitGPRestore.cpp
MipsExpandPseudo.cpp
+ MipsJITInfo.cpp
MipsInstrInfo.cpp
MipsISelDAGToDAG.cpp
MipsISelLowering.cpp
@@ -25,6 +28,19 @@ add_llvm_target(MipsCodeGen
MipsSelectionDAGInfo.cpp
)
+add_llvm_library_dependencies(LLVMMipsCodeGen
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMMipsAsmPrinter
+ LLVMMipsDesc
+ LLVMMipsInfo
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Mips/InstPrinter/CMakeLists.txt b/lib/Target/Mips/InstPrinter/CMakeLists.txt
index 8852fd4..c45b35d 100644
--- a/lib/Target/Mips/InstPrinter/CMakeLists.txt
+++ b/lib/Target/Mips/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
add_llvm_library(LLVMMipsAsmPrinter
MipsInstPrinter.cpp
)
-add_dependencies(LLVMMipsAsmPrinter MipsCodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMMipsAsmPrinter
+ LLVMMC
+ LLVMSupport
+ )
+
+add_dependencies(LLVMMipsAsmPrinter MipsCommonTableGen)
diff --git a/lib/Target/Mips/InstPrinter/Makefile b/lib/Target/Mips/InstPrinter/Makefile
index 63e38ef..74872a4 100644
--- a/lib/Target/Mips/InstPrinter/Makefile
+++ b/lib/Target/Mips/InstPrinter/Makefile
@@ -10,7 +10,7 @@
LEVEL = ../../../..
LIBRARYNAME = LLVMMipsAsmPrinter
-# Hack: we need to include 'main' arm target directory to grab private headers
+# Hack: we need to include 'main' mips target directory to grab private headers
CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index 41c1dd3..3dafc61 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax --------===//
+//===-- MipsInstPrinter.cpp - Convert Mips MCInst to assembly syntax ------===//
//
// The LLVM Compiler Infrastructure
//
@@ -69,8 +69,10 @@ void MipsInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
OS << '$' << LowercaseString(getRegisterName(RegNo));
}
-void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
printInstruction(MI, O);
+ printAnnotation(O, Annot);
}
void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 680208e..5c11165 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -1,4 +1,4 @@
-//===-- MipsInstPrinter.h - Convert Mips MCInst to assembly syntax ----------===//
+//===-- MipsInstPrinter.h - Convert Mips MCInst to assembly syntax --------===//
//
// The LLVM Compiler Infrastructure
//
@@ -86,7 +86,7 @@ public:
virtual StringRef getOpcodeName(unsigned Opcode) const;
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
- virtual void printInst(const MCInst *MI, raw_ostream &O);
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
private:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 97de75d..2ceb5c9 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,15 @@
add_llvm_library(LLVMMipsDesc
- MipsMCTargetDesc.cpp
+ MipsAsmBackend.cpp
MipsMCAsmInfo.cpp
+ MipsMCCodeEmitter.cpp
+ MipsMCTargetDesc.cpp
+ )
+
+add_llvm_library_dependencies(LLVMMipsDesc
+ LLVMMC
+ LLVMMipsAsmPrinter
+ LLVMMipsInfo
+ LLVMSupport
)
+
+add_dependencies(LLVMMipsDesc MipsCommonTableGen)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
new file mode 100644
index 0000000..f190ec4
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -0,0 +1,117 @@
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Object/MachOFormat.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+class MipsELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ MipsELFObjectWriter(bool is64Bit, Triple::OSType OSType, uint16_t EMachine,
+ bool HasRelocationAddend)
+ : MCELFObjectTargetWriter(is64Bit, OSType, EMachine,
+ HasRelocationAddend) {}
+};
+
+class MipsAsmBackend : public MCAsmBackend {
+public:
+ MipsAsmBackend(const Target &T)
+ : MCAsmBackend() {}
+
+ unsigned getNumFixupKinds() const {
+ return 1; // TODO: report the real number of Mips fixup kinds.
+ }
+
+ /// ApplyFixup - Apply the \arg Value for the given \arg Fixup into the
+ /// provided data fragment, at the offset specified by the fixup and
+ /// following the fixup kind as appropriate.
+ void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value) const {
+ }
+
+ /// @name Target Relaxation Interfaces
+ /// @{
+
+ /// MayNeedRelaxation - Check whether the given instruction may need
+ /// relaxation.
+ ///
+ /// \param Inst - The instruction to test.
+ bool MayNeedRelaxation(const MCInst &Inst) const {
+ return false;
+ }
+
+ /// RelaxInstruction - Relax the instruction in the given fragment to the next
+ /// wider instruction.
+ ///
+ /// \param Inst - The instruction to relax, which may be the same as the
+ /// output.
+ /// \param Res [output] - On return, the relaxed instruction.
+ void RelaxInstruction(const MCInst &Inst, MCInst &Res) const {
+ }
+
+ /// @}
+
+ /// WriteNopData - Write an (optimal) nop sequence of Count bytes to the given
+ /// output. If the target cannot generate such a sequence, it should return an
+ /// error.
+ ///
+ /// \return - True on success.
+ bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const {
+ return false; // TODO: emit MIPS nops; see the sketch after this file.
+ }
+};
+
+class MipsEB_AsmBackend : public MipsAsmBackend {
+public:
+ Triple::OSType OSType;
+
+ MipsEB_AsmBackend(const Target &T, Triple::OSType _OSType)
+ : MipsAsmBackend(T), OSType(_OSType) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createELFObjectWriter(createELFObjectTargetWriter(),
+ OS, /*IsLittleEndian*/ false);
+ }
+
+ MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
+ return new MipsELFObjectWriter(false, OSType, ELF::EM_MIPS, false);
+ }
+};
+
+class MipsEL_AsmBackend : public MipsAsmBackend {
+public:
+ Triple::OSType OSType;
+
+ MipsEL_AsmBackend(const Target &T, Triple::OSType _OSType)
+ : MipsAsmBackend(T), OSType(_OSType) {}
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ return createELFObjectWriter(createELFObjectTargetWriter(),
+ OS, /*IsLittleEndian*/ true);
+ }
+
+ MCELFObjectTargetWriter *createELFObjectTargetWriter() const {
+ return new MipsELFObjectWriter(false, OSType, ELF::EM_MIPS, false);
+ }
+};
+}
+
+MCAsmBackend *llvm::createMipsAsmBackend(const Target &T, StringRef TT) {
+ Triple TheTriple(TT);
+
+ // FIXME: select MipsEB_AsmBackend for big-endian triples; assume little
+ // endian for now.
+ return new MipsEL_AsmBackend(T, TheTriple.getOS());
+}
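The WriteNopData stub above returns false unconditionally, so any padding
request currently fails. A minimal sketch of one way to satisfy the contract,
assuming the LLVM 3.0-era MCObjectWriter API (not part of this patch): a MIPS
nop is the all-zero word, so any multiple of four bytes can be zero-filled.

    // In MipsAsmBackend (sketch):
    bool WriteNopData(uint64_t Count, llvm::MCObjectWriter *OW) const {
      if (Count % 4 != 0)
        return false;        // can only pad on instruction boundaries
      for (uint64_t i = 0; i < Count; i += 4)
        OW->Write32(0);      // nop == 0x00000000
      return true;
    }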
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
new file mode 100644
index 0000000..f7a6fa9
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -0,0 +1,113 @@
+//===-- MipsBaseInfo.h - Top level definitions for Mips --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the Mips target useful for the compiler back-end and the MC libraries.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MIPSBASEINFO_H
+#define MIPSBASEINFO_H
+
+#include "MipsMCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+/// getMipsRegisterNumbering - Given the enum value for some register,
+/// return the number that it corresponds to.
+inline static unsigned getMipsRegisterNumbering(unsigned RegEnum)
+{
+ switch (RegEnum) {
+ case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64:
+ case Mips::D0:
+ return 0;
+ case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64:
+ return 1;
+ case Mips::V0: case Mips::V0_64: case Mips::F2: case Mips::D2_64:
+ case Mips::D1:
+ return 2;
+ case Mips::V1: case Mips::V1_64: case Mips::F3: case Mips::D3_64:
+ return 3;
+ case Mips::A0: case Mips::A0_64: case Mips::F4: case Mips::D4_64:
+ case Mips::D2:
+ return 4;
+ case Mips::A1: case Mips::A1_64: case Mips::F5: case Mips::D5_64:
+ return 5;
+ case Mips::A2: case Mips::A2_64: case Mips::F6: case Mips::D6_64:
+ case Mips::D3:
+ return 6;
+ case Mips::A3: case Mips::A3_64: case Mips::F7: case Mips::D7_64:
+ return 7;
+ case Mips::T0: case Mips::T0_64: case Mips::F8: case Mips::D8_64:
+ case Mips::D4:
+ return 8;
+ case Mips::T1: case Mips::T1_64: case Mips::F9: case Mips::D9_64:
+ return 9;
+ case Mips::T2: case Mips::T2_64: case Mips::F10: case Mips::D10_64:
+ case Mips::D5:
+ return 10;
+ case Mips::T3: case Mips::T3_64: case Mips::F11: case Mips::D11_64:
+ return 11;
+ case Mips::T4: case Mips::T4_64: case Mips::F12: case Mips::D12_64:
+ case Mips::D6:
+ return 12;
+ case Mips::T5: case Mips::T5_64: case Mips::F13: case Mips::D13_64:
+ return 13;
+ case Mips::T6: case Mips::T6_64: case Mips::F14: case Mips::D14_64:
+ case Mips::D7:
+ return 14;
+ case Mips::T7: case Mips::T7_64: case Mips::F15: case Mips::D15_64:
+ return 15;
+ case Mips::S0: case Mips::S0_64: case Mips::F16: case Mips::D16_64:
+ case Mips::D8:
+ return 16;
+ case Mips::S1: case Mips::S1_64: case Mips::F17: case Mips::D17_64:
+ return 17;
+ case Mips::S2: case Mips::S2_64: case Mips::F18: case Mips::D18_64:
+ case Mips::D9:
+ return 18;
+ case Mips::S3: case Mips::S3_64: case Mips::F19: case Mips::D19_64:
+ return 19;
+ case Mips::S4: case Mips::S4_64: case Mips::F20: case Mips::D20_64:
+ case Mips::D10:
+ return 20;
+ case Mips::S5: case Mips::S5_64: case Mips::F21: case Mips::D21_64:
+ return 21;
+ case Mips::S6: case Mips::S6_64: case Mips::F22: case Mips::D22_64:
+ case Mips::D11:
+ return 22;
+ case Mips::S7: case Mips::S7_64: case Mips::F23: case Mips::D23_64:
+ return 23;
+ case Mips::T8: case Mips::T8_64: case Mips::F24: case Mips::D24_64:
+ case Mips::D12:
+ return 24;
+ case Mips::T9: case Mips::T9_64: case Mips::F25: case Mips::D25_64:
+ return 25;
+ case Mips::K0: case Mips::K0_64: case Mips::F26: case Mips::D26_64:
+ case Mips::D13:
+ return 26;
+ case Mips::K1: case Mips::K1_64: case Mips::F27: case Mips::D27_64:
+ return 27;
+ case Mips::GP: case Mips::GP_64: case Mips::F28: case Mips::D28_64:
+ case Mips::D14:
+ return 28;
+ case Mips::SP: case Mips::SP_64: case Mips::F29: case Mips::D29_64:
+ return 29;
+ case Mips::FP: case Mips::FP_64: case Mips::F30: case Mips::D30_64:
+ case Mips::D15:
+ return 30;
+ case Mips::RA: case Mips::RA_64: case Mips::F31: case Mips::D31_64:
+ return 31;
+ default: llvm_unreachable("Unknown register number!");
+ }
+ return 0; // Not reached
+}
+} // end namespace llvm
+
+#endif
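Unlike the MBlaze table, this mapping folds the 32-bit GPR, 64-bit GPR, FPU
single, and FPU double register files onto one 5-bit slot, so every alias of
a hardware register encodes identically. A small sketch, assuming
MipsBaseInfo.h is on the include path:

    #include "MCTargetDesc/MipsBaseInfo.h"
    #include <cassert>
    using namespace llvm;

    static void checkAliases() {
      assert(getMipsRegisterNumbering(Mips::SP) == 29);
      assert(getMipsRegisterNumbering(Mips::SP_64) == 29); // same slot as SP
      assert(getMipsRegisterNumbering(Mips::F29) == 29);   // FPU file reuses 0-31
    }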
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
new file mode 100644
index 0000000..8b099ea
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -0,0 +1,90 @@
+//===-- Mips/MipsFixupKinds.h - Mips Specific Fixup Entries ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_Mips_MipsFIXUPKINDS_H
+#define LLVM_Mips_MipsFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace Mips {
+ enum Fixups {
+ // fixup_Mips_NONE - R_MIPS_NONE.
+ fixup_Mips_NONE = FirstTargetFixupKind,
+
+ // fixup_Mips_16 - R_MIPS_16.
+ fixup_Mips_16,
+
+ // fixup_Mips_32 - R_MIPS_32.
+ fixup_Mips_32,
+
+ // fixup_Mips_REL32 - R_MIPS_REL32.
+ fixup_Mips_REL32,
+
+ // fixup_Mips_26 - R_MIPS_26.
+ fixup_Mips_26,
+
+ // fixup_Mips_HI16 - R_MIPS_HI16.
+ fixup_Mips_HI16,
+
+ // fixup_Mips_LO16 - R_MIPS_LO16.
+ fixup_Mips_LO16,
+
+ // fixup_Mips_GPREL16 - R_MIPS_GPREL16.
+ fixup_Mips_GPREL16,
+
+ // fixup_Mips_LITERAL - R_MIPS_LITERAL.
+ fixup_Mips_LITERAL,
+
+ // fixup_Mips_GOT16 - R_MIPS_GOT16.
+ fixup_Mips_GOT16,
+
+ // fixup_Mips_PC16 - R_MIPS_PC16.
+ fixup_Mips_PC16,
+
+ // fixup_Mips_CALL16 - R_MIPS_CALL16.
+ fixup_Mips_CALL16,
+
+ // fixup_Mips_GPREL32 - R_MIPS_GPREL32.
+ fixup_Mips_GPREL32,
+
+ // fixup_Mips_SHIFT5 - R_MIPS_SHIFT5.
+ fixup_Mips_SHIFT5,
+
+ // fixup_Mips_SHIFT6 - R_MIPS_SHIFT6.
+ fixup_Mips_SHIFT6,
+
+ // fixup_Mips_64 - R_MIPS_64.
+ fixup_Mips_64,
+
+ // fixup_Mips_TLSGD - R_MIPS_TLS_GD.
+ fixup_Mips_TLSGD,
+
+ // fixup_Mips_GOTTPREL - R_MIPS_TLS_GOTTPREL.
+ fixup_Mips_GOTTPREL,
+
+ // fixup_Mips_TPREL_HI - R_MIPS_TLS_TPREL_HI16.
+ fixup_Mips_TPREL_HI,
+
+ // fixup_Mips_TPREL_LO - R_MIPS_TLS_TPREL_LO16.
+ fixup_Mips_TPREL_LO,
+
+ // fixup_Mips_Branch_PCRel - PC-relative branch fixup; should eventually
+ // become R_MIPS_PC16.
+ fixup_Mips_Branch_PCRel,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+ };
+} // namespace Mips
+} // namespace llvm
+
+
+#endif /* LLVM_Mips_MipsFIXUPKINDS_H */
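These fixup kinds are not yet consumed anywhere in this patch; the
MipsAsmBackend above still answers getNumFixupKinds() with a placeholder. A
minimal sketch of the intended hookup (illustrative, not in the patch):

    #include "MCTargetDesc/MipsFixupKinds.h"

    // What MipsAsmBackend::getNumFixupKinds() should eventually report
    // instead of the "return 1" stub.
    static unsigned numMipsFixupKinds() {
      return llvm::Mips::NumTargetFixupKinds;
    }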
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 5d92425..71ae804 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -18,7 +18,8 @@ using namespace llvm;
MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
Triple TheTriple(TT);
- if (TheTriple.getArch() == Triple::mips)
+ if ((TheTriple.getArch() == Triple::mips) ||
+ (TheTriple.getArch() == Triple::mips64))
IsLittleEndian = false;
AlignmentIsInBytes = false;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
new file mode 100644
index 0000000..d66de23
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -0,0 +1,52 @@
+//===-- MipsMCCodeEmitter.cpp - Convert Mips code to machine code ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MipsMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+//
+#define DEBUG_TYPE "mccodeemitter"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+
+using namespace llvm;
+
+namespace {
+class MipsMCCodeEmitter : public MCCodeEmitter {
+ MipsMCCodeEmitter(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
+ void operator=(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
+ const MCInstrInfo &MCII;
+ const MCSubtargetInfo &STI;
+
+public:
+ MipsMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+ MCContext &ctx)
+ : MCII(mcii), STI(sti) {}
+
+ ~MipsMCCodeEmitter() {}
+
+ void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ }
+}; // class MipsMCCodeEmitter
+} // namespace
+
+MCCodeEmitter *llvm::createMipsMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new MipsMCCodeEmitter(MCII, STI, Ctx);
+}
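EncodeInstruction above is an empty stub, so the emitter produces no bytes
yet. A sketch of the eventual shape, assuming a TableGen'd
getBinaryCodeForInstr() that this patch does not generate; the big-endian
byte order matches the mips/mips64 triples handled in MipsMCAsmInfo above,
and little-endian triples would reverse it:

    // In MipsMCCodeEmitter (sketch):
    void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                           SmallVectorImpl<MCFixup> &Fixups) const {
      uint32_t Bits = 0; // = getBinaryCodeForInstr(MI, Fixups), once generated
      for (unsigned i = 0; i != 4; ++i)
        OS << char((Bits >> (8 * (3 - i))) & 0xff); // MSB first (big-endian)
    }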
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index 06f0d0b..1f9e3dd 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -13,10 +13,14 @@
#include "MipsMCTargetDesc.h"
#include "MipsMCAsmInfo.h"
+#include "InstPrinter/MipsInstPrinter.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "MipsGenInstrInfo.inc"
@@ -35,11 +39,12 @@ static MCInstrInfo *createMipsMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeMipsMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheMipsTarget, createMipsMCInstrInfo);
+static MCRegisterInfo *createMipsMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitMipsMCRegisterInfo(X, Mips::RA);
+ return X;
}
-
static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS) {
MCSubtargetInfo *X = new MCSubtargetInfo();
@@ -47,12 +52,111 @@ static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializeMipsMCSubtargetInfo() {
+static MCAsmInfo *createMipsMCAsmInfo(const Target &T, StringRef TT) {
+ MCAsmInfo *MAI = new MipsMCAsmInfo(T, TT);
+
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(Mips::SP, 0);
+ MAI->addInitialFrameState(0, Dst, Src);
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ if (RM == Reloc::Default)
+ RM = Reloc::PIC_;
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
+}
+
+static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) {
+ return new MipsInstPrinter(MAI);
+}
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &MAB,
+ raw_ostream &_OS,
+ MCCodeEmitter *_Emitter,
+ bool RelaxAll,
+ bool NoExecStack) {
+ Triple TheTriple(TT);
+
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+extern "C" void LLVMInitializeMipsTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(TheMipsTarget, createMipsMCAsmInfo);
+ RegisterMCAsmInfoFn Y(TheMipselTarget, createMipsMCAsmInfo);
+ RegisterMCAsmInfoFn A(TheMips64Target, createMipsMCAsmInfo);
+ RegisterMCAsmInfoFn B(TheMips64elTarget, createMipsMCAsmInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheMipsTarget,
+ createMipsMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheMipselTarget,
+ createMipsMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheMips64Target,
+ createMipsMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheMips64elTarget,
+ createMipsMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheMipsTarget, createMipsMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheMipselTarget, createMipsMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheMips64Target, createMipsMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheMips64elTarget, createMipsMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheMipsTarget, createMipsMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheMipselTarget, createMipsMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheMips64Target, createMipsMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheMips64elTarget,
+ createMipsMCRegisterInfo);
+
+ // Register the MC Code Emitter
+ TargetRegistry::RegisterMCCodeEmitter(TheMipsTarget, createMipsMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheMipselTarget,
+ createMipsMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheMips64Target,
+ createMipsMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheMips64elTarget,
+ createMipsMCCodeEmitter);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterMCObjectStreamer(TheMipsTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheMipselTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheMips64Target, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget, createMCStreamer);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(TheMipsTarget, createMipsAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheMipselTarget, createMipsAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheMips64Target, createMipsAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheMips64elTarget, createMipsAsmBackend);
+
+ // Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(TheMipsTarget,
createMipsMCSubtargetInfo);
-}
+ TargetRegistry::RegisterMCSubtargetInfo(TheMipselTarget,
+ createMipsMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheMips64Target,
+ createMipsMCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheMips64elTarget,
+ createMipsMCSubtargetInfo);
-extern "C" void LLVMInitializeMipsMCAsmInfo() {
- RegisterMCAsmInfo<MipsMCAsmInfo> X(TheMipsTarget);
- RegisterMCAsmInfo<MipsMCAsmInfo> Y(TheMipselTarget);
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheMipsTarget,
+ createMipsMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheMipselTarget,
+ createMipsMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheMips64Target,
+ createMipsMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheMips64elTarget,
+ createMipsMCInstPrinter);
}
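Note the switch from the RegisterMCAsmInfo<MipsMCAsmInfo> template (removed above) to the function-based RegisterMCAsmInfoFn: the template can only construct the MCAsmInfo, while createMipsMCAsmInfo also attaches the initial SP-based frame state that previously lived in MipsFrameLowering::getInitialFrameState (removed further down in this patch).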
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 3d18f11..7a0042a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -1,4 +1,4 @@
-//===-- AlphaMCTargetDesc.h - Alpha Target Descriptions ---------*- C++ -*-===//
+//===-- MipsMCTargetDesc.h - Mips Target Descriptions -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,21 +7,32 @@
//
//===----------------------------------------------------------------------===//
//
-// This file provides Alpha specific target descriptions.
+// This file provides Mips specific target descriptions.
//
//===----------------------------------------------------------------------===//
-#ifndef ALPHAMCTARGETDESC_H
-#define ALPHAMCTARGETDESC_H
+#ifndef MIPSMCTARGETDESC_H
+#define MIPSMCTARGETDESC_H
namespace llvm {
+class MCAsmBackend;
+class MCInstrInfo;
+class MCCodeEmitter;
+class MCContext;
class MCSubtargetInfo;
-class Target;
class StringRef;
+class Target;
extern Target TheMipsTarget;
extern Target TheMipselTarget;
+extern Target TheMips64Target;
+extern Target TheMips64elTarget;
+
+MCCodeEmitter *createMipsMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+MCAsmBackend *createMipsAsmBackend(const Target &T, StringRef TT);
} // End llvm namespace
// Defines symbolic names for Mips registers. This defines a mapping from
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 984b5ad..bacecf2 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -29,6 +29,9 @@ namespace llvm {
FunctionPass *createMipsExpandPseudoPass(MipsTargetMachine &TM);
FunctionPass *createMipsEmitGPRestorePass(MipsTargetMachine &TM);
+ FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+ JITCodeEmitter &JCE);
+
} // end namespace llvm;
#endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 433cd57..39c2c16 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -38,6 +38,10 @@ def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
"true", "Only supports single precision float">;
def FeatureO32 : SubtargetFeature<"o32", "MipsABI", "O32",
"Enable o32 ABI">;
+def FeatureN32 : SubtargetFeature<"n32", "MipsABI", "N32",
+ "Enable n32 ABI">;
+def FeatureN64 : SubtargetFeature<"n64", "MipsABI", "N64",
+ "Enable n64 ABI">;
def FeatureEABI : SubtargetFeature<"eabi", "MipsABI", "EABI",
"Enable eabi ABI">;
def FeatureVFPU : SubtargetFeature<"vfpu", "HasVFPU",
@@ -54,16 +58,19 @@ def FeatureSwap : SubtargetFeature<"swap", "HasSwap", "true",
"Enable 'byte/half swap' instructions.">;
def FeatureBitCount : SubtargetFeature<"bitcount", "HasBitCount", "true",
"Enable 'count leading bits' instructions.">;
-def FeatureMips1 : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
- "Mips1 ISA Support">;
-def FeatureMips2 : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
- "Mips2 ISA Support">;
def FeatureMips32 : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32",
"Mips32 ISA Support",
[FeatureCondMov, FeatureBitCount]>;
def FeatureMips32r2 : SubtargetFeature<"mips32r2", "MipsArchVersion",
"Mips32r2", "Mips32r2 ISA Support",
[FeatureMips32, FeatureSEInReg]>;
+def FeatureMips64 : SubtargetFeature<"mips64", "MipsArchVersion",
+ "Mips64", "Mips64 ISA Support",
+ [FeatureGP64Bit, FeatureFP64Bit,
+ FeatureMips32]>;
+def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion",
+ "Mips64r2", "Mips64r2 ISA Support",
+ [FeatureMips64, FeatureMips32r2]>;
//===----------------------------------------------------------------------===//
// Mips processors supported.
@@ -72,21 +79,10 @@ def FeatureMips32r2 : SubtargetFeature<"mips32r2", "MipsArchVersion",
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, MipsGenericItineraries, Features>;
-def : Proc<"mips1", [FeatureMips1]>;
-def : Proc<"r2000", [FeatureMips1]>;
-def : Proc<"r3000", [FeatureMips1]>;
-
-def : Proc<"mips2", [FeatureMips2]>;
-def : Proc<"r6000", [FeatureMips2]>;
-
+def : Proc<"mips32r1", [FeatureMips32]>;
def : Proc<"4ke", [FeatureMips32r2]>;
-
-// Allegrex is a 32bit subset of r4000, both for integer and fp registers,
-// but much more similar to Mips2 than Mips3. It also contains some of
-// Mips32/Mips32r2 instructions and a custom vector fpu processor.
-def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI,
- FeatureVFPU, FeatureSEInReg, FeatureCondMov, FeatureMulDivAdd,
- FeatureMinMax, FeatureSwap, FeatureBitCount]>;
+def : Proc<"mips64r1", [FeatureMips64]>;
+def : Proc<"mips64r2", [FeatureMips64r2]>;
def MipsAsmWriter : AsmWriter {
string AsmWriterClassName = "InstPrinter";
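The processor and feature names defined above become -mcpu/-mattr spellings on the command line; an illustrative invocation exercising the new 64-bit definitions (file names hypothetical):

    llc -march=mips64 -mcpu=mips64r2 -mattr=+n64 -o out.s in.ll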
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
new file mode 100644
index 0000000..49b0223
--- /dev/null
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -0,0 +1,214 @@
+//===- Mips64InstrInfo.td - Mips64 Instruction Information -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Instruction operand types
+def shamt_64 : Operand<i64>;
+
+// Unsigned Operand
+def uimm16_64 : Operand<i64> {
+ let PrintMethod = "printUnsignedImm";
+}
+
+// Transformation Function - get Imm - 32.
+def Subtract32 : SDNodeXForm<imm, [{
+ return getI32Imm((unsigned)N->getZExtValue() - 32);
+}]>;
+
+// imm32_63 predicate - True if imm is in range [32, 63].
+def imm32_63 : ImmLeaf<i64,
+ [{return (int32_t)Imm >= 32 && (int32_t)Imm < 64;}],
+ Subtract32>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+// Shifts
+class LogicR_shift_rotate_imm64<bits<6> func, bits<5> _rs, string instr_asm,
+ SDNode OpNode, PatFrag PF>:
+ FR<0x00, func, (outs CPU64Regs:$dst), (ins CPU64Regs:$b, shamt_64:$c),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPU64Regs:$dst, (OpNode CPU64Regs:$b, (i64 PF:$c)))],
+ IIAlu> {
+ let rs = _rs;
+}
+
+class LogicR_shift_rotate_reg64<bits<6> func, bits<5> _shamt, string instr_asm,
+ SDNode OpNode>:
+ FR<0x00, func, (outs CPU64Regs:$dst), (ins CPU64Regs:$c, CPU64Regs:$b),
+ !strconcat(instr_asm, "\t$dst, $b, $c"),
+ [(set CPU64Regs:$dst, (OpNode CPU64Regs:$b, CPU64Regs:$c))], IIAlu> {
+ let shamt = _shamt;
+}
+
+// Mul, Div
+let Defs = [HI64, LO64] in {
+ let isCommutable = 1 in
+ class Mul64<bits<6> func, string instr_asm, InstrItinClass itin>:
+ FR<0x00, func, (outs), (ins CPU64Regs:$a, CPU64Regs:$b),
+ !strconcat(instr_asm, "\t$a, $b"), [], itin>;
+
+ class Div64<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>:
+ FR<0x00, func, (outs), (ins CPU64Regs:$a, CPU64Regs:$b),
+ !strconcat(instr_asm, "\t$$zero, $a, $b"),
+ [(op CPU64Regs:$a, CPU64Regs:$b)], itin>;
+}
+
+// Move from Hi/Lo
+let shamt = 0 in {
+let rs = 0, rt = 0 in
+class MoveFromLOHI64<bits<6> func, string instr_asm>:
+ FR<0x00, func, (outs CPU64Regs:$dst), (ins),
+ !strconcat(instr_asm, "\t$dst"), [], IIHiLo>;
+
+let rt = 0, rd = 0 in
+class MoveToLOHI64<bits<6> func, string instr_asm>:
+ FR<0x00, func, (outs), (ins CPU64Regs:$src),
+ !strconcat(instr_asm, "\t$src"), [], IIHiLo>;
+}
+
+// Count Leading Ones/Zeros in Word
+class CountLeading64<bits<6> func, string instr_asm, list<dag> pattern>:
+ FR<0x1c, func, (outs CPU64Regs:$dst), (ins CPU64Regs:$src),
+ !strconcat(instr_asm, "\t$dst, $src"), pattern, IIAlu>,
+ Requires<[HasBitCount]> {
+ let shamt = 0;
+ let rt = rd;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def DADDiu : ArithLogicI<0x19, "daddiu", add, simm16_64, immSExt16,
+ CPU64Regs>;
+def DANDi : ArithLogicI<0x0c, "andi", and, uimm16_64, immZExt16, CPU64Regs>;
+def SLTi64 : SetCC_I<0x0a, "slti", setlt, simm16_64, immSExt16, CPU64Regs>;
+def SLTiu64 : SetCC_I<0x0b, "sltiu", setult, simm16_64, immSExt16, CPU64Regs>;
+def ORi64 : ArithLogicI<0x0d, "ori", or, uimm16_64, immZExt16, CPU64Regs>;
+def XORi64 : ArithLogicI<0x0e, "xori", xor, uimm16_64, immZExt16, CPU64Regs>;
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def DADDu : ArithLogicR<0x00, 0x2d, "daddu", add, IIAlu, CPU64Regs, 1>;
+def DSUBu : ArithLogicR<0x00, 0x2f, "dsubu", sub, IIAlu, CPU64Regs>;
+def SLT64 : SetCC_R<0x00, 0x2a, "slt", setlt, CPU64Regs>;
+def SLTu64 : SetCC_R<0x00, 0x2b, "sltu", setult, CPU64Regs>;
+def AND64 : ArithLogicR<0x00, 0x24, "and", and, IIAlu, CPU64Regs, 1>;
+def OR64 : ArithLogicR<0x00, 0x25, "or", or, IIAlu, CPU64Regs, 1>;
+def XOR64 : ArithLogicR<0x00, 0x26, "xor", xor, IIAlu, CPU64Regs, 1>;
+def NOR64 : LogicNOR<0x00, 0x27, "nor", CPU64Regs>;
+
+/// Shift Instructions
+def DSLL : LogicR_shift_rotate_imm64<0x38, 0x00, "dsll", shl, immZExt5>;
+def DSRL : LogicR_shift_rotate_imm64<0x3a, 0x00, "dsrl", srl, immZExt5>;
+def DSRA : LogicR_shift_rotate_imm64<0x3b, 0x00, "dsra", sra, immZExt5>;
+def DSLL32 : LogicR_shift_rotate_imm64<0x3c, 0x00, "dsll32", shl, imm32_63>;
+def DSRL32 : LogicR_shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl, imm32_63>;
+def DSRA32 : LogicR_shift_rotate_imm64<0x3f, 0x00, "dsra32", sra, imm32_63>;
+def DSLLV : LogicR_shift_rotate_reg64<0x24, 0x00, "dsllv", shl>;
+def DSRLV : LogicR_shift_rotate_reg64<0x26, 0x00, "dsrlv", srl>;
+def DSRAV : LogicR_shift_rotate_reg64<0x27, 0x00, "dsrav", sra>;
+
+// Rotate Instructions
+let Predicates = [HasMips64r2] in {
+ def DROTR : LogicR_shift_rotate_imm64<0x3a, 0x01, "drotr", rotr, immZExt5>;
+ def DROTR32 : LogicR_shift_rotate_imm64<0x3e, 0x01, "drotr32", rotr,
+ imm32_63>;
+ def DROTRV : LogicR_shift_rotate_reg64<0x16, 0x01, "drotrv", rotr>;
+}
+
+/// Load and Store Instructions
+/// aligned
+defm LB64 : LoadM64<0x20, "lb", sextloadi8>;
+defm LBu64 : LoadM64<0x24, "lbu", zextloadi8>;
+defm LH64 : LoadM64<0x21, "lh", sextloadi16_a>;
+defm LHu64 : LoadM64<0x25, "lhu", zextloadi16_a>;
+defm LW64 : LoadM64<0x23, "lw", sextloadi32_a>;
+defm LWu64 : LoadM64<0x27, "lwu", zextloadi32_a>;
+defm SB64 : StoreM64<0x28, "sb", truncstorei8>;
+defm SH64 : StoreM64<0x29, "sh", truncstorei16_a>;
+defm SW64 : StoreM64<0x2b, "sw", truncstorei32_a>;
+defm LD : LoadM64<0x37, "ld", load_a>;
+defm SD : StoreM64<0x3f, "sd", store_a>;
+
+/// unaligned
+defm ULH64 : LoadM64<0x21, "ulh", sextloadi16_u, 1>;
+defm ULHu64 : LoadM64<0x25, "ulhu", zextloadi16_u, 1>;
+defm ULW64 : LoadM64<0x23, "ulw", sextloadi32_u, 1>;
+defm USH64 : StoreM64<0x29, "ush", truncstorei16_u, 1>;
+defm USW64 : StoreM64<0x2b, "usw", truncstorei32_u, 1>;
+defm ULD : LoadM64<0x37, "uld", load_u, 1>;
+defm USD : StoreM64<0x3f, "usd", store_u, 1>;
+
+/// Jump and Branch Instructions
+def BEQ64 : CBranch<0x04, "beq", seteq, CPU64Regs>;
+def BNE64 : CBranch<0x05, "bne", setne, CPU64Regs>;
+def BGEZ64 : CBranchZero<0x01, 1, "bgez", setge, CPU64Regs>;
+def BGTZ64 : CBranchZero<0x07, 0, "bgtz", setgt, CPU64Regs>;
+def BLEZ64 : CBranchZero<0x07, 0, "blez", setle, CPU64Regs>;
+def BLTZ64 : CBranchZero<0x01, 0, "bltz", setlt, CPU64Regs>;
+
+/// Multiply and Divide Instructions.
+def DMULT : Mul64<0x1c, "dmult", IIImul>;
+def DMULTu : Mul64<0x1d, "dmultu", IIImul>;
+def DSDIV : Div64<MipsDivRem, 0x1e, "ddiv", IIIdiv>;
+def DUDIV : Div64<MipsDivRemU, 0x1f, "ddivu", IIIdiv>;
+
+let Defs = [HI64] in
+ def MTHI64 : MoveToLOHI64<0x11, "mthi">;
+let Defs = [LO64] in
+ def MTLO64 : MoveToLOHI64<0x13, "mtlo">;
+
+let Uses = [HI64] in
+ def MFHI64 : MoveFromLOHI64<0x10, "mfhi">;
+let Uses = [LO64] in
+ def MFLO64 : MoveFromLOHI64<0x12, "mflo">;
+
+/// Count Leading
+def DCLZ : CountLeading64<0x24, "dclz",
+ [(set CPU64Regs:$dst, (ctlz CPU64Regs:$src))]>;
+def DCLO : CountLeading64<0x25, "dclo",
+ [(set CPU64Regs:$dst, (ctlz (not CPU64Regs:$src)))]>;
+
+//===----------------------------------------------------------------------===//
+// Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Small immediates
+def : Pat<(i64 immSExt16:$in),
+ (DADDiu ZERO_64, imm:$in)>;
+def : Pat<(i64 immZExt16:$in),
+ (ORi64 ZERO_64, imm:$in)>;
+
+// zextloadi32_u
+def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>,
+ Requires<[IsN64]>;
+def : Pat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>,
+ Requires<[NotN64]>;
+
+// hi/lo relocs
+def : Pat<(i64 (MipsLo tglobaladdr:$in)), (DADDiu ZERO_64, tglobaladdr:$in)>;
+
+defm : BrcondPats<CPU64Regs, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
+ ZERO_64>;
+
+// setcc patterns
+defm : SeteqPats<CPU64Regs, SLTiu64, XOR64, SLTu64, ZERO_64>;
+defm : SetlePats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgtPats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgePats<CPU64Regs, SLT64, SLTu64>;
+defm : SetgeImmPats<CPU64Regs, SLTi64, SLTiu64>;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 69e03bd..0e82681 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "MipsInstrInfo.h"
#include "MipsMachineFunction.h"
#include "MipsMCInstLower.h"
+#include "MipsMCSymbolRefExpr.h"
#include "InstPrinter/MipsInstPrinter.h"
#include "llvm/BasicBlock.h"
#include "llvm/Instructions.h"
@@ -25,6 +26,7 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
@@ -33,15 +35,22 @@
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Analysis/DebugInfo.h"
using namespace llvm;
+static bool isUnalignedLoadStore(unsigned Opc) {
+ return Opc == Mips::ULW || Opc == Mips::ULH || Opc == Mips::ULHu ||
+ Opc == Mips::USW || Opc == Mips::USH ||
+ Opc == Mips::ULW_P8 || Opc == Mips::ULH_P8 || Opc == Mips::ULHu_P8 ||
+ Opc == Mips::USW_P8 || Opc == Mips::USH_P8;
+}
+
void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallString<128> Str;
raw_svector_ostream OS(Str);
@@ -52,8 +61,21 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
MipsMCInstLower MCInstLowering(Mang, *MF, *this);
+ unsigned Opc = MI->getOpcode();
MCInst TmpInst0;
MCInstLowering.Lower(MI, TmpInst0);
+
+ // Enclose unaligned load or store with .macro & .nomacro directives.
+ if (isUnalignedLoadStore(Opc)) {
+ MCInst Directive;
+ Directive.setOpcode(Mips::MACRO);
+ OutStreamer.EmitInstruction(Directive);
+ OutStreamer.EmitInstruction(TmpInst0);
+ Directive.setOpcode(Mips::NOMACRO);
+ OutStreamer.EmitInstruction(Directive);
+ return;
+ }
+
OutStreamer.EmitInstruction(TmpInst0);
}
@@ -180,7 +202,6 @@ void MipsAsmPrinter::emitFrameDirective() {
const char *MipsAsmPrinter::getCurrentABIString() const {
switch (Subtarget->getTargetABI()) {
case MipsSubtarget::O32: return "abi32";
- case MipsSubtarget::O64: return "abiO64";
case MipsSubtarget::N32: return "abiN32";
case MipsSubtarget::N64: return "abi64";
case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
@@ -304,6 +325,11 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
case MipsII::MO_GOTTPREL: O << "%gottprel("; break;
case MipsII::MO_TPREL_HI: O << "%tprel_hi("; break;
case MipsII::MO_TPREL_LO: O << "%tprel_lo("; break;
+ case MipsII::MO_GPOFF_HI: O << "%hi(%neg(%gp_rel("; break;
+ case MipsII::MO_GPOFF_LO: O << "%lo(%neg(%gp_rel("; break;
+ case MipsII::MO_GOT_DISP: O << "%got_disp("; break;
+ case MipsII::MO_GOT_PAGE: O << "%got_page("; break;
+ case MipsII::MO_GOT_OFST: O << "%got_ofst("; break;
}
switch (MO.getType()) {
@@ -424,17 +450,9 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
}
// Force static initialization.
-static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
- unsigned SyntaxVariant,
- const MCAsmInfo &MAI) {
- return new MipsInstPrinter(MAI);
-}
-
extern "C" void LLVMInitializeMipsAsmPrinter() {
RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget);
-
- TargetRegistry::RegisterMCInstPrinter(TheMipsTarget, createMipsMCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheMipselTarget,
- createMipsMCInstPrinter);
+ RegisterAsmPrinter<MipsAsmPrinter> A(TheMips64Target);
+ RegisterAsmPrinter<MipsAsmPrinter> B(TheMips64elTarget);
}
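The MACRO/NOMACRO pseudo-instructions bracket the unaligned access so the assembler is permitted to expand it; assuming they print as the usual .set directives, the emitted assembly looks roughly like:

    .set    macro
    ulw     $2, 0($4)       # the assembler may expand this into several instructions
    .set    nomacro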
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 876f0fc..0ae4ef6 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -31,6 +31,55 @@ def RetCC_MipsO32 : CallingConv<[
]>;
//===----------------------------------------------------------------------===//
+// Mips N32/64 Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsN : CallingConv<[
+ // FIXME: Handle byval, complex and float double parameters.
+
+ // Promote i8/i16/i32 arguments to i64.
+ CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+ // Integer arguments are passed in integer registers.
+ CCIfType<[i64], CCAssignToRegWithShadow<[A0_64, A1_64, A2_64, A3_64,
+ T0_64, T1_64, T2_64, T3_64],
+ [D12_64, D13_64, D14_64, D15_64,
+ D16_64, D17_64, D18_64, D19_64]>>,
+
+ // f32 arguments are passed in single precision FP registers.
+ CCIfType<[f32], CCAssignToRegWithShadow<[F12, F13, F14, F15,
+ F16, F17, F18, F19],
+ [A0_64, A1_64, A2_64, A3_64,
+ T0_64, T1_64, T2_64, T3_64]>>,
+
+ // f64 arguments are passed in double precision FP registers.
+ CCIfType<[f64], CCAssignToRegWithShadow<[D12_64, D13_64, D14_64, D15_64,
+ D16_64, D17_64, D18_64, D19_64],
+ [A0_64, A1_64, A2_64, A3_64,
+ T0_64, T1_64, T2_64, T3_64]>>,
+
+ // All stack parameter slots become 64-bit doublewords and are 8-byte aligned.
+ CCIfType<[i64, f64], CCAssignToStack<8, 8>>,
+ CCIfType<[f32], CCAssignToStack<4, 8>>
+]>;
+
+def RetCC_MipsN : CallingConv<[
+ // FIXME: Handle complex and float double return values.
+
+ // i32 are returned in registers V0, V1
+ CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+ // i64 are returned in registers V0_64, V1_64
+ CCIfType<[i64], CCAssignToReg<[V0_64, V1_64]>>,
+
+ // f32 are returned in registers F0, F2
+ CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+ // f64 are returned in registers D0, D2
+ CCIfType<[f64], CCAssignToReg<[D0_64, D2_64]>>
+]>;
+
+//===----------------------------------------------------------------------===//
// Mips EABI Calling Convention
//===----------------------------------------------------------------------===//
@@ -77,10 +126,14 @@ def RetCC_MipsEABI : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_Mips : CallingConv<[
- CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>
+ CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>,
+ CCIfSubtarget<"isABI_N32()", CCDelegateTo<CC_MipsN>>,
+ CCIfSubtarget<"isABI_N64()", CCDelegateTo<CC_MipsN>>
]>;
def RetCC_Mips : CallingConv<[
CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
+ CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
+ CCIfSubtarget<"isABI_N64()", CCDelegateTo<RetCC_MipsN>>,
CCDelegateTo<RetCC_MipsO32>
]>;
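A hand-walked example of CC_MipsN as written (not generated output): for f(i32 %a, i64 %b, double %c), %a is promoted to i64 and takes A0_64 (shadowing D12_64), %b takes A1_64 (shadowing D13_64), and %c takes D14_64, the first double register whose slot is not yet shadowed; byval and split-f64 arguments remain under the FIXME above.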
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
new file mode 100644
index 0000000..9220d9c
--- /dev/null
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -0,0 +1,245 @@
+//===-- Mips/MipsCodeEmitter.cpp - Convert Mips code to machine code -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Mips machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+
+#include "llvm/CodeGen/MachineOperand.h"
+
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class MipsCodeEmitter : public MachineFunctionPass {
+ MipsJITInfo *JTI;
+ const MipsInstrInfo *II;
+ const TargetData *TD;
+ const MipsSubtarget *Subtarget;
+ TargetMachine &TM;
+ JITCodeEmitter &MCE;
+ const std::vector<MachineConstantPoolEntry> *MCPEs;
+ const std::vector<MachineJumpTableEntry> *MJTEs;
+ bool IsPIC;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineModuleInfo> ();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ static char ID;
+
+ public:
+ MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) :
+ MachineFunctionPass(ID), JTI(0),
+ II((const MipsInstrInfo *) tm.getInstrInfo()),
+ TD(tm.getTargetData()), TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_) {
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "Mips Machine Code Emitter";
+ }
+
+ /// getBinaryCodeForInstr - This function, generated by the
+ /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+ /// machine instructions.
+ unsigned getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+ void emitInstruction(const MachineInstr &MI);
+
+ private:
+
+ void emitWordLE(unsigned Word);
+
+ /// Routines that handle operands which add machine relocations which are
+ /// fixed up by the relocation stage.
+ void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+ bool MayNeedFarStub) const;
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+ void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const;
+ void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO) const;
+
+ unsigned getRelocation(const MachineInstr &MI,
+ const MachineOperand &MO) const;
+
+ };
+}
+
+char MipsCodeEmitter::ID = 0;
+
+bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+ JTI = ((MipsTargetMachine&) MF.getTarget()).getJITInfo();
+ II = ((const MipsTargetMachine&) MF.getTarget()).getInstrInfo();
+ TD = ((const MipsTargetMachine&) MF.getTarget()).getTargetData();
+ Subtarget = &TM.getSubtarget<MipsSubtarget> ();
+ MCPEs = &MF.getConstantPool()->getConstants();
+ MJTEs = 0;
+ if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+ JTI->Initialize(MF, IsPIC);
+ MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+ do {
+ DEBUG(errs() << "JITTing function '"
+ << MF.getFunction()->getName() << "'\n");
+ MCE.startFunction(MF);
+
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB){
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I)
+ emitInstruction(*I);
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+unsigned MipsCodeEmitter::getRelocation(const MachineInstr &MI,
+ const MachineOperand &MO) const {
+  // NOTE: These relocations are for the static relocation model.
+ uint64_t TSFlags = MI.getDesc().TSFlags;
+ uint64_t Form = TSFlags & MipsII::FormMask;
+ if (Form == MipsII::FrmJ)
+ return Mips::reloc_mips_26;
+ if ((Form == MipsII::FrmI || Form == MipsII::FrmFI)
+ && MI.getDesc().isBranch())
+ return Mips::reloc_mips_branch;
+ if (Form == MipsII::FrmI && MI.getOpcode() == Mips::LUi)
+ return Mips::reloc_mips_hi;
+ return Mips::reloc_mips_lo;
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+ const MachineOperand &MO) const {
+ if (MO.isReg())
+ return MipsRegisterInfo::getRegisterNumbering(MO.getReg());
+ else if (MO.isImm())
+ return static_cast<unsigned>(MO.getImm());
+ else if (MO.isGlobal())
+ emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+ else if (MO.isSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+ else if (MO.isCPI())
+ emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+ else if (MO.isJTI())
+ emitJumpTableAddress(MO.getIndex(), getRelocation(MI, MO));
+ else if (MO.isMBB())
+ emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+ else
+ llvm_unreachable("Unable to encode MachineOperand!");
+ return 0;
+}
+
+void MipsCodeEmitter::emitGlobalAddress(const GlobalValue *GV, unsigned Reloc,
+ bool MayNeedFarStub) const {
+ MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ const_cast<GlobalValue *>(GV), 0, MayNeedFarStub));
+}
+
+void MipsCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES, 0, 0, false));
+}
+
+void MipsCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, 0, false));
+}
+
+void MipsCodeEmitter::
+emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTIndex, 0, false));
+}
+
+void MipsCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+ unsigned Reloc) const {
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ Reloc, BB));
+}
+
+void MipsCodeEmitter::emitInstruction(const MachineInstr &MI) {
+ DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
+
+ MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+ // Skip pseudo instructions.
+ if ((MI.getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo)
+ return;
+
+ ++NumEmitted; // Keep track of the # of mi's emitted
+
+ switch (MI.getOpcode()) {
+ default:
+ emitWordLE(getBinaryCodeForInstr(MI));
+ break;
+ }
+
+ MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
+
+void MipsCodeEmitter::emitWordLE(unsigned Word) {
+ DEBUG(errs() << " 0x";
+ errs().write_hex(Word) << "\n");
+ MCE.emitWordLE(Word);
+}
+
+/// createMipsJITCodeEmitterPass - Return a pass that emits the collected Mips
+/// code to the specified MCE object.
+FunctionPass *llvm::createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
+ JITCodeEmitter &JCE) {
+ return new MipsCodeEmitter(TM, JCE);
+}
+
+unsigned MipsCodeEmitter::getBinaryCodeForInstr(const MachineInstr &MI) const {
+  // This function will be automatically generated by the CodeEmitterGenerator
+  // using TableGen.
+ return 0;
+}
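createMipsJITCodeEmitterPass is meant to be installed from the target machine; a hedged sketch of that hook, with the signature assumed from the LLVM 3.0 LLVMTargetMachine interface (the actual override is outside this hunk):

    bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM,
                                           CodeGenOpt::Level OptLevel,
                                           JITCodeEmitter &JCE) {
      PM.add(createMipsJITCodeEmitterPass(*this, JCE));
      return false; // success
    }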
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index c3a6211..be3b7a0 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// Simple pass to fills delay slots with NOPs.
+// Simple pass to fill delay slots with useful instructions.
//
//===----------------------------------------------------------------------===//
@@ -17,18 +17,31 @@
#include "MipsTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
STATISTIC(FilledSlots, "Number of delay slots filled");
+STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
+          " are not NOPs.");
+
+static cl::opt<bool> EnableDelaySlotFiller(
+ "enable-mips-delay-filler",
+ cl::init(false),
+ cl::desc("Fill the Mips delay slots useful instructions."),
+ cl::Hidden);
namespace {
struct Filler : public MachineFunctionPass {
TargetMachine &TM;
const TargetInstrInfo *TII;
+ MachineBasicBlock::iterator LastFiller;
static char ID;
Filler(TargetMachine &tm)
@@ -47,31 +60,61 @@ namespace {
return Changed;
}
+ bool isDelayFiller(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator candidate);
+
+ void insertCallUses(MachineBasicBlock::iterator MI,
+ SmallSet<unsigned, 32>& RegDefs,
+ SmallSet<unsigned, 32>& RegUses);
+
+ void insertDefsUses(MachineBasicBlock::iterator MI,
+ SmallSet<unsigned, 32>& RegDefs,
+ SmallSet<unsigned, 32>& RegUses);
+
+ bool IsRegInSet(SmallSet<unsigned, 32>& RegSet,
+ unsigned Reg);
+
+ bool delayHasHazard(MachineBasicBlock::iterator candidate,
+ bool &sawLoad, bool &sawStore,
+ SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses);
+
+ bool
+ findDelayInstr(MachineBasicBlock &MBB, MachineBasicBlock::iterator slot,
+ MachineBasicBlock::iterator &Filler);
+
+
};
char Filler::ID = 0;
} // end of anonymous namespace
/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
-/// Currently, we fill delay slots with NOPs. We assume there is only one
-/// delay slot per delayed instruction.
+/// We assume there is only one delay slot per delayed instruction.
bool Filler::
-runOnMachineBasicBlock(MachineBasicBlock &MBB)
-{
+runOnMachineBasicBlock(MachineBasicBlock &MBB) {
bool Changed = false;
- for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
- const MCInstrDesc& MCid = I->getDesc();
- if (MCid.hasDelaySlot() &&
- (TM.getSubtarget<MipsSubtarget>().isMips1() ||
- MCid.isCall() || MCid.isBranch() || MCid.isReturn())) {
- MachineBasicBlock::iterator J = I;
- ++J;
- BuildMI(MBB, J, I->getDebugLoc(), TII->get(Mips::NOP));
+ LastFiller = MBB.end();
+
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+ if (I->getDesc().hasDelaySlot()) {
++FilledSlots;
Changed = true;
- }
- }
+ MachineBasicBlock::iterator D;
+
+ if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) {
+ MBB.splice(llvm::next(I), &MBB, D);
+ ++UsefulSlots;
+ }
+ else
+ BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
+
+ // Record the filler instruction that filled the delay slot.
+ // The instruction after it will be visited in the next iteration.
+ LastFiller = ++I;
+ }
return Changed;
+
}
/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
@@ -80,3 +123,134 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
return new Filler(tm);
}
+bool Filler::findDelayInstr(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator slot,
+ MachineBasicBlock::iterator &Filler) {
+ SmallSet<unsigned, 32> RegDefs;
+ SmallSet<unsigned, 32> RegUses;
+
+ insertDefsUses(slot, RegDefs, RegUses);
+
+ bool sawLoad = false;
+ bool sawStore = false;
+
+ for (MachineBasicBlock::reverse_iterator I(slot); I != MBB.rend(); ++I) {
+    // Skip debug values.
+ if (I->isDebugValue())
+ continue;
+
+ // Convert to forward iterator.
+ MachineBasicBlock::iterator FI(llvm::next(I).base());
+
+ if (I->hasUnmodeledSideEffects()
+ || I->isInlineAsm()
+ || I->isLabel()
+ || FI == LastFiller
+ || I->getDesc().isPseudo()
+ //
+        // TODO: also disallow ERET, DERET, WAIT and PAUSE once they are
+        // added to the instruction list.
+        )
+ break;
+
+ if (delayHasHazard(FI, sawLoad, sawStore, RegDefs, RegUses)) {
+ insertDefsUses(FI, RegDefs, RegUses);
+ continue;
+ }
+
+ Filler = FI;
+ return true;
+ }
+
+ return false;
+}
+
+bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
+ bool &sawLoad,
+ bool &sawStore,
+ SmallSet<unsigned, 32> &RegDefs,
+ SmallSet<unsigned, 32> &RegUses) {
+ if (candidate->isImplicitDef() || candidate->isKill())
+ return true;
+
+ MCInstrDesc MCID = candidate->getDesc();
+ // Loads or stores cannot be moved past a store to the delay slot
+ // and stores cannot be moved past a load.
+ if (MCID.mayLoad()) {
+ if (sawStore)
+ return true;
+ sawLoad = true;
+ }
+
+ if (MCID.mayStore()) {
+ if (sawStore)
+ return true;
+ sawStore = true;
+ if (sawLoad)
+ return true;
+ }
+
+ assert((!MCID.isCall() && !MCID.isReturn()) &&
+ "Cannot put calls or returns in delay slot.");
+
+ for (unsigned i = 0, e = candidate->getNumOperands(); i!= e; ++i) {
+ const MachineOperand &MO = candidate->getOperand(i);
+ unsigned Reg;
+
+ if (!MO.isReg() || !(Reg = MO.getReg()))
+ continue; // skip
+
+ if (MO.isDef()) {
+ // check whether Reg is defined or used before delay slot.
+ if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
+ return true;
+ }
+ if (MO.isUse()) {
+ // check whether Reg is defined before delay slot.
+ if (IsRegInSet(RegDefs, Reg))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
+ SmallSet<unsigned, 32>& RegDefs,
+ SmallSet<unsigned, 32>& RegUses) {
+ // If MI is a call or return, just examine the explicit non-variadic operands.
+ MCInstrDesc MCID = MI->getDesc();
+ unsigned e = MCID.isCall() || MCID.isReturn() ? MCID.getNumOperands() :
+ MI->getNumOperands();
+
+ // Add RA to RegDefs to prevent users of RA from going into delay slot.
+ if (MCID.isCall())
+ RegDefs.insert(Mips::RA);
+
+ for (unsigned i = 0; i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ unsigned Reg;
+
+ if (!MO.isReg() || !(Reg = MO.getReg()))
+ continue;
+
+ if (MO.isDef())
+ RegDefs.insert(Reg);
+ else if (MO.isUse())
+ RegUses.insert(Reg);
+ }
+}
+
+// Returns true if Reg or one of its aliases is in RegSet.
+bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg) {
+ if (RegSet.count(Reg))
+ return true;
+  // Check aliased registers.
+ for (const unsigned *Alias = TM.getRegisterInfo()->getAliasSet(Reg);
+ *Alias; ++Alias)
+ if (RegSet.count(*Alias))
+ return true;
+
+ return false;
+}
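Because EnableDelaySlotFiller defaults to false, every delay slot is still filled with a NOP unless the hidden flag is passed. An illustrative invocation that also prints the two STATISTIC counters defined above (file names hypothetical):

    llc -march=mips -enable-mips-delay-filler -stats -o out.s in.ll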
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index a0f90a0..22d1e47 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -254,9 +254,15 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const {
}
// Restore GP from the saved stack location
- if (MipsFI->needGPSaveRestore())
- BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE))
- .addImm(MFI->getObjectOffset(MipsFI->getGPFI()));
+ if (MipsFI->needGPSaveRestore()) {
+ unsigned Offset = MFI->getObjectOffset(MipsFI->getGPFI());
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE)).addImm(Offset);
+
+ if (Offset >= 0x8000) {
+ BuildMI(MBB, llvm::prior(MBBI), dl, TII.get(Mips::MACRO));
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
+ }
+ }
}
void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -300,13 +306,6 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-void
-MipsFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(Mips::SP, 0);
- Moves.push_back(MachineMove(0, Dst, Src));
-}
-
void MipsFrameLowering::
processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS) const {
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 78c78ee..c249756 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -27,7 +27,8 @@ protected:
public:
explicit MipsFrameLowering(const MipsSubtarget &sti)
- : TargetFrameLowering(StackGrowsDown, 8, 0), STI(sti) {
+ : TargetFrameLowering(StackGrowsDown, sti.hasMips64() ? 16 : 8, 0),
+ STI(sti) {
}
bool targetHandlesStackFrameRounding() const;
@@ -39,8 +40,6 @@ public:
bool hasFP(const MachineFunction &MF) const;
- void getInitialFrameState(std::vector<MachineMove> &Moves) const;
-
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS) const;
};
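For reference, the TargetFrameLowering constructor arguments are (stack direction, stack alignment, local area offset), so the change above gives 64-bit subtargets a 16-byte-aligned stack:

    // Illustrative expansion of the initializer:
    //   hasMips64(): TargetFrameLowering(StackGrowsDown, 16, 0)
    //   otherwise:   TargetFrameLowering(StackGrowsDown,  8, 0)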
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 90aaeb6..9c831ed 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -86,9 +86,6 @@ private:
// Complex Pattern.
bool SelectAddr(SDValue N, SDValue &Base, SDValue &Offset);
- SDNode *SelectLoadFp64(SDNode *N);
- SDNode *SelectStoreFp64(SDNode *N);
-
// getI32Imm - Return a target constant with the specified
// value, of type i32.
inline SDValue getI32Imm(unsigned Imm) {
@@ -114,17 +111,20 @@ SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
/// Used on Mips Load/Store instructions
bool MipsDAGToDAGISel::
SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
+ EVT ValTy = Addr.getValueType();
+ unsigned GPReg = ValTy == MVT::i32 ? Mips::GP : Mips::GP_64;
+
// if Address is FI, get the TargetFrameIndex.
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
- Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ Offset = CurDAG->getTargetConstant(0, ValTy);
return true;
}
// on PIC code Load GA
if (TM.getRelocationModel() == Reloc::PIC_) {
if (Addr.getOpcode() == MipsISD::WrapperPIC) {
- Base = CurDAG->getRegister(Mips::GP, MVT::i32);
+ Base = CurDAG->getRegister(GPReg, ValTy);
Offset = Addr.getOperand(0);
return true;
}
@@ -133,7 +133,7 @@ SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
Addr.getOpcode() == ISD::TargetGlobalAddress))
return false;
else if (Addr.getOpcode() == ISD::TargetGlobalTLSAddress) {
- Base = CurDAG->getRegister(Mips::GP, MVT::i32);
+ Base = CurDAG->getRegister(GPReg, ValTy);
Offset = Addr;
return true;
}
@@ -147,11 +147,11 @@ SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
// If the first operand is a FI, get the TargetFI Node
if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
(Addr.getOperand(0)))
- Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
else
Base = Addr.getOperand(0);
- Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
return true;
}
}
@@ -180,134 +180,10 @@ SelectAddr(SDValue Addr, SDValue &Base, SDValue &Offset) {
}
Base = Addr;
- Offset = CurDAG->getTargetConstant(0, MVT::i32);
+ Offset = CurDAG->getTargetConstant(0, ValTy);
return true;
}
-SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) {
- MVT::SimpleValueType NVT =
- N->getValueType(0).getSimpleVT().SimpleTy;
-
- if (!Subtarget.isMips1() || NVT != MVT::f64)
- return NULL;
-
- LoadSDNode *LN = cast<LoadSDNode>(N);
- if (LN->getExtensionType() != ISD::NON_EXTLOAD ||
- LN->getAddressingMode() != ISD::UNINDEXED)
- return NULL;
-
- SDValue Chain = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDValue Offset0, Offset1, Base;
-
- if (!SelectAddr(N1, Base, Offset0) ||
- N1.getValueType() != MVT::i32)
- return NULL;
-
- MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
- DebugLoc dl = N->getDebugLoc();
-
- // The second load should start after for 4 bytes.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0))
- Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32);
- else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Offset0))
- Offset1 = CurDAG->getTargetConstantPool(CP->getConstVal(),
- MVT::i32,
- CP->getAlignment(),
- CP->getOffset()+4,
- CP->getTargetFlags());
- else
- return NULL;
-
- // Choose the offsets depending on the endianess
- if (TM.getTargetData()->isBigEndian())
- std::swap(Offset0, Offset1);
-
- // Instead of:
- // ldc $f0, X($3)
- // Generate:
- // lwc $f0, X($3)
- // lwc $f1, X+4($3)
- SDNode *LD0 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32,
- MVT::Other, Base, Offset0, Chain);
- SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
- dl, NVT), 0);
- SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl,
- MVT::f64, Undef, SDValue(LD0, 0));
-
- SDNode *LD1 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32,
- MVT::Other, Base, Offset1, SDValue(LD0, 1));
- SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl,
- MVT::f64, I0, SDValue(LD1, 0));
-
- ReplaceUses(SDValue(N, 0), I1);
- ReplaceUses(SDValue(N, 1), Chain);
- cast<MachineSDNode>(LD0)->setMemRefs(MemRefs0, MemRefs0 + 1);
- cast<MachineSDNode>(LD1)->setMemRefs(MemRefs0, MemRefs0 + 1);
- return I1.getNode();
-}
-
-SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) {
-
- if (!Subtarget.isMips1() ||
- N->getOperand(1).getValueType() != MVT::f64)
- return NULL;
-
- SDValue Chain = N->getOperand(0);
-
- StoreSDNode *SN = cast<StoreSDNode>(N);
- if (SN->isTruncatingStore() || SN->getAddressingMode() != ISD::UNINDEXED)
- return NULL;
-
- SDValue N1 = N->getOperand(1);
- SDValue N2 = N->getOperand(2);
- SDValue Offset0, Offset1, Base;
-
- if (!SelectAddr(N2, Base, Offset0) ||
- N1.getValueType() != MVT::f64 ||
- N2.getValueType() != MVT::i32)
- return NULL;
-
- MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
- MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
- DebugLoc dl = N->getDebugLoc();
-
- // Get the even and odd part from the f64 register
- SDValue FPOdd = CurDAG->getTargetExtractSubreg(Mips::sub_fpodd,
- dl, MVT::f32, N1);
- SDValue FPEven = CurDAG->getTargetExtractSubreg(Mips::sub_fpeven,
- dl, MVT::f32, N1);
-
- // The second store should start after for 4 bytes.
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0))
- Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32);
- else
- return NULL;
-
- // Choose the offsets depending on the endianess
- if (TM.getTargetData()->isBigEndian())
- std::swap(Offset0, Offset1);
-
- // Instead of:
- // sdc $f0, X($3)
- // Generate:
- // swc $f0, X($3)
- // swc $f1, X+4($3)
- SDValue Ops0[] = { FPEven, Base, Offset0, Chain };
- Chain = SDValue(CurDAG->getMachineNode(Mips::SWC1, dl,
- MVT::Other, Ops0, 4), 0);
- cast<MachineSDNode>(Chain.getNode())->setMemRefs(MemRefs0, MemRefs0 + 1);
-
- SDValue Ops1[] = { FPOdd, Base, Offset1, Chain };
- Chain = SDValue(CurDAG->getMachineNode(Mips::SWC1, dl,
- MVT::Other, Ops1, 4), 0);
- cast<MachineSDNode>(Chain.getNode())->setMemRefs(MemRefs0, MemRefs0 + 1);
-
- ReplaceUses(SDValue(N, 0), Chain);
- return Chain.getNode();
-}
-
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
@@ -364,6 +240,8 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
/// Mul with two results
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
+ assert(Node->getValueType(0) != MVT::i64 &&
+ "64-bit multiplication with two results not handled.");
SDValue Op1 = Node->getOperand(0);
SDValue Op2 = Node->getOperand(1);
@@ -389,21 +267,29 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
/// Special Muls
case ISD::MUL:
- if (Subtarget.isMips32())
+    // Mips32 has a 32-bit three-operand mul instruction.
+ if (Subtarget.hasMips32() && Node->getValueType(0) == MVT::i32)
break;
case ISD::MULHS:
case ISD::MULHU: {
+ assert((Opcode == ISD::MUL || Node->getValueType(0) != MVT::i64) &&
+ "64-bit MULH* not handled.");
+ EVT Ty = Node->getValueType(0);
SDValue MulOp1 = Node->getOperand(0);
SDValue MulOp2 = Node->getOperand(1);
- unsigned MulOp = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+ unsigned MulOp = (Opcode == ISD::MULHU ?
+ Mips::MULTu :
+ (Ty == MVT::i32 ? Mips::MULT : Mips::DMULT));
SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl,
MVT::Glue, MulOp1, MulOp2);
SDValue InFlag = SDValue(MulNode, 0);
- if (Opcode == ISD::MUL)
- return CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, InFlag);
+ if (Opcode == ISD::MUL) {
+ unsigned Opc = (Ty == MVT::i32 ? Mips::MFLO : Mips::MFLO64);
+ return CurDAG->getMachineNode(Opc, dl, Ty, InFlag);
+ }
else
return CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);
}
@@ -417,31 +303,12 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) {
SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
Mips::ZERO, MVT::i32);
- SDValue Undef = SDValue(
- CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::f64), 0);
- SDNode *MTC = CurDAG->getMachineNode(Mips::MTC1, dl, MVT::f32, Zero);
- SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::sub_fpeven, dl,
- MVT::f64, Undef, SDValue(MTC, 0));
- SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::sub_fpodd, dl,
- MVT::f64, I0, SDValue(MTC, 0));
- ReplaceUses(SDValue(Node, 0), I1);
- return I1.getNode();
+ return CurDAG->getMachineNode(Mips::BuildPairF64, dl, MVT::f64, Zero,
+ Zero);
}
break;
}
- case ISD::LOAD:
- if (SDNode *ResNode = SelectLoadFp64(Node))
- return ResNode;
- // Other cases are autogenerated.
- break;
-
- case ISD::STORE:
- if (SDNode *ResNode = SelectStoreFp64(Node))
- return ResNode;
- // Other cases are autogenerated.
- break;
-
case MipsISD::ThreadPointer: {
unsigned SrcReg = Mips::HWR29;
unsigned DestReg = Mips::V1;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index b4f4b1b..1932e74 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -35,6 +35,18 @@
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
+// If I is a shifted mask, set the size (Size) and the first bit of the
+// mask (Pos), and return true.
+// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
+static bool IsShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
+ if (!isUInt<32>(I) || !isShiftedMask_32(I))
+ return false;
+
+ Size = CountPopulation_32(I);
+ Pos = CountTrailingZeros_32(I);
+ return true;
+}
+
const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
case MipsISD::JmpLink: return "MipsISD::JmpLink";
@@ -61,27 +73,38 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::ExtractElementF64: return "MipsISD::ExtractElementF64";
case MipsISD::WrapperPIC: return "MipsISD::WrapperPIC";
case MipsISD::DynAlloc: return "MipsISD::DynAlloc";
+ case MipsISD::Sync: return "MipsISD::Sync";
+ case MipsISD::Ext: return "MipsISD::Ext";
+ case MipsISD::Ins: return "MipsISD::Ins";
default: return NULL;
}
}
MipsTargetLowering::
MipsTargetLowering(MipsTargetMachine &TM)
- : TargetLowering(TM, new MipsTargetObjectFile()) {
- Subtarget = &TM.getSubtarget<MipsSubtarget>();
+ : TargetLowering(TM, new MipsTargetObjectFile()),
+ Subtarget(&TM.getSubtarget<MipsSubtarget>()),
+ HasMips64(Subtarget->hasMips64()), IsN64(Subtarget->isABI_N64()) {
// Mips does not have i1 type, so use i32 for
// setcc operations results (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
// Set up the register classes
addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass);
addRegisterClass(MVT::f32, Mips::FGR32RegisterClass);
+ if (HasMips64)
+ addRegisterClass(MVT::i64, Mips::CPU64RegsRegisterClass);
+
// When dealing with single precision only, use libcalls
- if (!Subtarget->isSingleFloat())
- if (!Subtarget->isFP64bit())
+ if (!Subtarget->isSingleFloat()) {
+ if (HasMips64)
+ addRegisterClass(MVT::f64, Mips::FGR64RegisterClass);
+ else
addRegisterClass(MVT::f64, Mips::AFGR64RegisterClass);
+ }
// Load extented operations for i1 types must be promoted
setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
@@ -100,6 +123,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
// Mips Custom Operations
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
setOperationAction(ISD::JumpTable, MVT::i32, Custom);
@@ -115,6 +139,10 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
+ setOperationAction(ISD::SDIV, MVT::i64, Expand);
+ setOperationAction(ISD::SREM, MVT::i64, Expand);
+ setOperationAction(ISD::UDIV, MVT::i64, Expand);
+ setOperationAction(ISD::UREM, MVT::i64, Expand);
// Operations not directly supported by Mips.
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
@@ -126,10 +154,14 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
setOperationAction(ISD::CTTZ, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i32, Expand);
+ setOperationAction(ISD::ROTL, MVT::i64, Expand);
- if (!Subtarget->isMips32r2())
+ if (!Subtarget->hasMips32r2())
setOperationAction(ISD::ROTR, MVT::i32, Expand);
+ if (!Subtarget->hasMips64r2())
+ setOperationAction(ISD::ROTR, MVT::i64, Expand);
+
setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
@@ -159,7 +191,14 @@ MipsTargetLowering(MipsTargetMachine &TM)
// Use the default for now
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
+ setInsertFencesForAtomic(true);
if (Subtarget->isSingleFloat())
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
@@ -180,6 +219,8 @@ MipsTargetLowering(MipsTargetMachine &TM)
setTargetDAGCombine(ISD::SDIVREM);
setTargetDAGCombine(ISD::UDIVREM);
setTargetDAGCombine(ISD::SETCC);
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
setMinFunctionAlignment(2);
@@ -190,7 +231,12 @@ MipsTargetLowering(MipsTargetMachine &TM)
setExceptionSelectorRegister(Mips::A1);
}
-MVT::SimpleValueType MipsTargetLowering::getSetCCResultType(EVT VT) const {
+bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+ MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
+ return SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16;
+}
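+
+// Illustrative use (not part of the patch): given the hook above,
+//   allowsUnalignedMemoryAccesses(MVT::i32)  // true
+//   allowsUnalignedMemoryAccesses(MVT::i8)   // false
+// so SelectionDAG may emit unaligned i16/i32/i64 loads and stores directly,
+// assuming the backend handles the misalignment (e.g. lwl/lwr sequences).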
+
+EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i32;
}
@@ -348,7 +394,7 @@ static SDValue PerformADDECombine(SDNode *N, SelectionDAG& DAG,
if (DCI.isBeforeLegalize())
return SDValue();
- if (Subtarget->isMips32() && SelectMadd(N, &DAG))
+ if (Subtarget->hasMips32() && SelectMadd(N, &DAG))
return SDValue(N, 0);
return SDValue();
@@ -360,7 +406,7 @@ static SDValue PerformSUBECombine(SDNode *N, SelectionDAG& DAG,
if (DCI.isBeforeLegalize())
return SDValue();
- if (Subtarget->isMips32() && SelectMsub(N, &DAG))
+ if (Subtarget->hasMips32() && SelectMsub(N, &DAG))
return SDValue(N, 0);
return SDValue();
@@ -372,6 +418,9 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ EVT Ty = N->getValueType(0);
+ unsigned LO = (Ty == MVT::i32) ? Mips::LO : Mips::LO64;
+ unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64;
unsigned opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem :
MipsISD::DivRemU;
DebugLoc dl = N->getDebugLoc();
@@ -383,7 +432,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
// insert MFLO
if (N->hasAnyUseOfValue(0)) {
- SDValue CopyFromLo = DAG.getCopyFromReg(InChain, dl, Mips::LO, MVT::i32,
+ SDValue CopyFromLo = DAG.getCopyFromReg(InChain, dl, LO, Ty,
InGlue);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyFromLo);
InChain = CopyFromLo.getValue(1);
@@ -393,7 +442,7 @@ static SDValue PerformDivRemCombine(SDNode *N, SelectionDAG& DAG,
// insert MFHI
if (N->hasAnyUseOfValue(1)) {
SDValue CopyFromHi = DAG.getCopyFromReg(InChain, dl,
- Mips::HI, MVT::i32, InGlue);
+ HI, Ty, InGlue);
DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), CopyFromHi);
}
@@ -490,6 +539,101 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG& DAG,
return CreateCMovFP(DAG, Cond, True, False, N->getDebugLoc());
}
+static SDValue PerformANDCombine(SDNode *N, SelectionDAG& DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget* Subtarget) {
+ // Pattern match EXT.
+ // $dst = and ((sra or srl) $src, pos), (2**size - 1)
+ // => ext $dst, $src, size, pos
+ if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+ return SDValue();
+
+ SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
+
+ // Op's first operand must be a shift right.
+ if (ShiftRight.getOpcode() != ISD::SRA && ShiftRight.getOpcode() != ISD::SRL)
+ return SDValue();
+
+ // The second operand of the shift must be an immediate.
+ uint64_t Pos;
+ ConstantSDNode *CN;
+ if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
+ return SDValue();
+
+ Pos = CN->getZExtValue();
+
+ uint64_t SMPos, SMSize;
+ // Op's second operand must be a shifted mask.
+ if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
+ !IsShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+ return SDValue();
+
+ // Return if the shifted mask does not start at bit 0 or the sum of its size
+ // and Pos exceeds the word's size.
+ if (SMPos != 0 || Pos + SMSize > 32)
+ return SDValue();
+
+ return DAG.getNode(MipsISD::Ext, N->getDebugLoc(), MVT::i32,
+ ShiftRight.getOperand(0),
+ DAG.getConstant(Pos, MVT::i32),
+ DAG.getConstant(SMSize, MVT::i32));
+}
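+
+// Source-level illustration (not part of the patch): on a MIPS32r2 target
+// the DAG for the function below matches this combine with Pos = 3,
+// SMPos = 0, SMSize = 5 and collapses to a single MipsISD::Ext node:
+//   unsigned ExtractBits(unsigned X) { return (X >> 3) & 0x1F; }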
+
+static SDValue PerformORCombine(SDNode *N, SelectionDAG& DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const MipsSubtarget* Subtarget) {
+ // Pattern match INS.
+ // $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
+ // where mask1 = (2**size - 1) << pos, mask0 = ~mask1
+ // => ins $dst, $src, size, pos, $src1
+ if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+ return SDValue();
+
+ SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
+ uint64_t SMPos0, SMSize0, SMPos1, SMSize1;
+ ConstantSDNode *CN;
+
+ // See if Op's first operand matches (and $src1, mask0).
+ if (And0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (!(CN = dyn_cast<ConstantSDNode>(And0.getOperand(1))) ||
+ !IsShiftedMask(~CN->getSExtValue(), SMPos0, SMSize0))
+ return SDValue();
+
+ // See if Op's second operand matches (and (shl $src, pos), mask1).
+ if (And1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (!(CN = dyn_cast<ConstantSDNode>(And1.getOperand(1))) ||
+ !IsShiftedMask(CN->getZExtValue(), SMPos1, SMSize1))
+ return SDValue();
+
+ // The shift masks must have the same position and size.
+ if (SMPos0 != SMPos1 || SMSize0 != SMSize1)
+ return SDValue();
+
+ SDValue Shl = And1.getOperand(0);
+ if (Shl.getOpcode() != ISD::SHL)
+ return SDValue();
+
+ if (!(CN = dyn_cast<ConstantSDNode>(Shl.getOperand(1))))
+ return SDValue();
+
+ unsigned Shamt = CN->getZExtValue();
+
+ // Return if the shift amount and the first bit position of mask are not the
+ // same.
+ if (Shamt != SMPos0)
+ return SDValue();
+
+ return DAG.getNode(MipsISD::Ins, N->getDebugLoc(), MVT::i32,
+ Shl.getOperand(0),
+ DAG.getConstant(SMPos0, MVT::i32),
+ DAG.getConstant(SMSize0, MVT::i32),
+ And0.getOperand(0));
+}
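+
+// Source-level illustration (not part of the patch): the function below
+// matches this combine with SMPos0 = SMPos1 = Shamt = 3, SMSize0 = 5 and
+// collapses to a single MipsISD::Ins node on MIPS32r2:
+//   unsigned InsertBits(unsigned D, unsigned S) {
+//     return (D & ~(0x1Fu << 3)) | ((S << 3) & (0x1Fu << 3));
+//   }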
+
SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
SelectionDAG &DAG = DCI.DAG;
@@ -506,6 +650,10 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return PerformDivRemCombine(N, DAG, DCI, Subtarget);
case ISD::SETCC:
return PerformSETCCCombine(N, DAG, DCI, Subtarget);
+ case ISD::AND:
+ return PerformANDCombine(N, DAG, DCI, Subtarget);
+ case ISD::OR:
+ return PerformORCombine(N, DAG, DCI, Subtarget);
}
return SDValue();
@@ -527,6 +675,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG);
}
return SDValue();
}
@@ -733,13 +883,13 @@ MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
DebugLoc dl = MI->getDebugLoc();
- unsigned Dest = MI->getOperand(0).getReg();
+ unsigned OldVal = MI->getOperand(0).getReg();
unsigned Ptr = MI->getOperand(1).getReg();
unsigned Incr = MI->getOperand(2).getReg();
- unsigned Oldval = RegInfo.createVirtualRegister(RC);
- unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+ unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+ unsigned AndRes = RegInfo.createVirtualRegister(RC);
+ unsigned Success = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -758,61 +908,38 @@ MipsTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
// thisMBB:
// ...
- // sw incr, fi(sp) // store incr to stack (when BinOpcode == 0)
// fallthrough --> loopMBB
-
- // Note: for atomic.swap (when BinOpcode == 0), storing incr to stack before
- // the loop and then loading it from stack in block loopMBB is necessary to
- // prevent MachineLICM pass to hoist "or" instruction out of the block
- // loopMBB.
-
- int fi = 0;
- if (BinOpcode == 0 && !Nand) {
- // Get or create a temporary stack location.
- MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
- fi = MipsFI->getAtomicFrameIndex();
- if (fi == -1) {
- fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
- MipsFI->setAtomicFrameIndex(fi);
- }
-
- BuildMI(BB, dl, TII->get(Mips::SW))
- .addReg(Incr).addFrameIndex(fi).addImm(0);
- }
BB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(exitMBB);
// loopMBB:
// ll oldval, 0(ptr)
- // or dest, $0, oldval
- // <binop> tmp1, oldval, incr
- // sc tmp1, 0(ptr)
- // beq tmp1, $0, loopMBB
+ // <binop> storeval, oldval, incr
+ // sc success, storeval, 0(ptr)
+ // beq success, $0, loopMBB
BB = loopMBB;
- BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addReg(Ptr).addImm(0);
- BuildMI(BB, dl, TII->get(Mips::OR), Dest).addReg(Mips::ZERO).addReg(Oldval);
+ BuildMI(BB, dl, TII->get(Mips::LL), OldVal).addReg(Ptr).addImm(0);
if (Nand) {
- // and tmp2, oldval, incr
- // nor tmp1, $0, tmp2
- BuildMI(BB, dl, TII->get(Mips::AND), Tmp2).addReg(Oldval).addReg(Incr);
- BuildMI(BB, dl, TII->get(Mips::NOR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+ // and andres, oldval, incr
+ // nor storeval, $0, andres
+ BuildMI(BB, dl, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr);
+ BuildMI(BB, dl, TII->get(Mips::NOR), StoreVal)
+ .addReg(Mips::ZERO).addReg(AndRes);
} else if (BinOpcode) {
- // <binop> tmp1, oldval, incr
- BuildMI(BB, dl, TII->get(BinOpcode), Tmp1).addReg(Oldval).addReg(Incr);
+ // <binop> storeval, oldval, incr
+ BuildMI(BB, dl, TII->get(BinOpcode), StoreVal).addReg(OldVal).addReg(Incr);
} else {
- // lw tmp2, fi(sp) // load incr from stack
- // or tmp1, $zero, tmp2
- BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addFrameIndex(fi).addImm(0);
- BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
+ StoreVal = Incr;
}
- BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addReg(Ptr).addImm(0);
+ BuildMI(BB, dl, TII->get(Mips::SC), Success)
+ .addReg(StoreVal).addReg(Ptr).addImm(0);
BuildMI(BB, dl, TII->get(Mips::BEQ))
- .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loopMBB);
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
+ .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
MI->eraseFromParent(); // The instruction is gone now.
- return BB;
+ return exitMBB;
}
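// Concrete instance (illustrative, not in the patch): for a 32-bit
// "atomicrmw add", BinOpcode is Mips::ADDu and the loop above becomes
//   $loop: ll    oldval, 0(ptr)
//          addu  storeval, oldval, incr
//          sc    success, storeval, 0(ptr)
//          beq   success, $0, $loop
// with oldval carrying the result of the atomicrmw out of the loop.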
MachineBasicBlock *
@@ -833,33 +960,34 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
unsigned Ptr = MI->getOperand(1).getReg();
unsigned Incr = MI->getOperand(2).getReg();
- unsigned Addr = RegInfo.createVirtualRegister(RC);
- unsigned Shift = RegInfo.createVirtualRegister(RC);
+ unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
- unsigned Newval = RegInfo.createVirtualRegister(RC);
- unsigned Oldval = RegInfo.createVirtualRegister(RC);
+ unsigned NewVal = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal = RegInfo.createVirtualRegister(RC);
unsigned Incr2 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp10 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp11 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp12 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+ unsigned AndRes = RegInfo.createVirtualRegister(RC);
+ unsigned BinOpRes = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+ unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+ unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+ unsigned SllRes = RegInfo.createVirtualRegister(RC);
+ unsigned Success = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = BB;
++It;
MF->insert(It, loopMBB);
+ MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -868,111 +996,104 @@ MipsTargetLowering::EmitAtomicBinaryPartword(MachineInstr *MI,
BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(loopMBB);
+ loopMBB->addSuccessor(sinkMBB);
+ sinkMBB->addSuccessor(exitMBB);
+
// thisMBB:
- // addiu tmp1,$0,-4 # 0xfffffffc
- // and addr,ptr,tmp1
- // andi tmp2,ptr,3
- // sll shift,tmp2,3
- // ori tmp3,$0,255 # 0xff
- // sll mask,tmp3,shift
+ // addiu masklsb2,$0,-4 # 0xfffffffc
+ // and alignedaddr,ptr,masklsb2
+ // andi ptrlsb2,ptr,3
+ // sll shiftamt,ptrlsb2,3
+ // ori maskupper,$0,255 # 0xff
+ // sll mask,maskupper,shiftamt
// nor mask2,$0,mask
- // andi tmp4,incr,255
- // sll incr2,tmp4,shift
- // sw incr2, fi(sp) // store incr2 to stack (when BinOpcode == 0)
-
- // Note: for atomic.swap (when BinOpcode == 0), storing incr2 to stack before
- // the loop and then loading it from stack in block loopMBB is necessary to
- // prevent MachineLICM pass to hoist "or" instruction out of the block
- // loopMBB.
+ // sll incr2,incr,shiftamt
int64_t MaskImm = (Size == 1) ? 255 : 65535;
- BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
- BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
- BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
- BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
- BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
- BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+ BuildMI(BB, dl, TII->get(Mips::ADDiu), MaskLSB2)
+ .addReg(Mips::ZERO).addImm(-4);
+ BuildMI(BB, dl, TII->get(Mips::AND), AlignedAddr)
+ .addReg(Ptr).addReg(MaskLSB2);
+ BuildMI(BB, dl, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+ BuildMI(BB, dl, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ BuildMI(BB, dl, TII->get(Mips::ORi), MaskUpper)
+ .addReg(Mips::ZERO).addImm(MaskImm);
+ BuildMI(BB, dl, TII->get(Mips::SLLV), Mask)
+ .addReg(ShiftAmt).addReg(MaskUpper);
BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
- if (BinOpcode != Mips::SUBu) {
- BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Incr).addImm(MaskImm);
- BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp4).addReg(Shift);
- } else {
- BuildMI(BB, dl, TII->get(Mips::SUBu), Tmp4).addReg(Mips::ZERO).addReg(Incr);
- BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Tmp4).addImm(MaskImm);
- BuildMI(BB, dl, TII->get(Mips::SLL), Incr2).addReg(Tmp5).addReg(Shift);
- }
+ BuildMI(BB, dl, TII->get(Mips::SLLV), Incr2).addReg(ShiftAmt).addReg(Incr);
- int fi = 0;
- if (BinOpcode == 0 && !Nand) {
- // Get or create a temporary stack location.
- MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
- fi = MipsFI->getAtomicFrameIndex();
- if (fi == -1) {
- fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
- MipsFI->setAtomicFrameIndex(fi);
- }
-
- BuildMI(BB, dl, TII->get(Mips::SW))
- .addReg(Incr2).addFrameIndex(fi).addImm(0);
- }
- BB->addSuccessor(loopMBB);
+ // atomic.load.binop
+ // loopMBB:
+ // ll oldval,0(alignedaddr)
+ // binop binopres,oldval,incr2
+ // and newval,binopres,mask
+ // and maskedoldval0,oldval,mask2
+ // or storeval,maskedoldval0,newval
+ // sc success,storeval,0(alignedaddr)
+ // beq success,$0,loopMBB
+
+ // atomic.swap
// loopMBB:
- // ll oldval,0(addr)
- // binop tmp7,oldval,incr2
- // and newval,tmp7,mask
- // and tmp8,oldval,mask2
- // or tmp9,tmp8,newval
- // sc tmp9,0(addr)
- // beq tmp9,$0,loopMBB
+ // ll oldval,0(alignedaddr)
+ // and newval,incr2,mask
+ // and maskedoldval0,oldval,mask2
+ // or storeval,maskedoldval0,newval
+ // sc success,storeval,0(alignedaddr)
+ // beq success,$0,loopMBB
+
BB = loopMBB;
- BuildMI(BB, dl, TII->get(Mips::LL), Oldval).addReg(Addr).addImm(0);
+ BuildMI(BB, dl, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
if (Nand) {
- // and tmp6, oldval, incr2
- // nor tmp7, $0, tmp6
- BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval).addReg(Incr2);
- BuildMI(BB, dl, TII->get(Mips::NOR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
- } else if (BinOpcode == Mips::SUBu) {
- // addu tmp7, oldval, incr2
- BuildMI(BB, dl, TII->get(Mips::ADDu), Tmp7).addReg(Oldval).addReg(Incr2);
+ // and andres, oldval, incr2
+ // nor binopres, $0, andres
+ // and newval, binopres, mask
+ BuildMI(BB, dl, TII->get(Mips::AND), AndRes).addReg(OldVal).addReg(Incr2);
+ BuildMI(BB, dl, TII->get(Mips::NOR), BinOpRes)
+ .addReg(Mips::ZERO).addReg(AndRes);
+ BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
} else if (BinOpcode) {
- // <binop> tmp7, oldval, incr2
- BuildMI(BB, dl, TII->get(BinOpcode), Tmp7).addReg(Oldval).addReg(Incr2);
- } else {
- // lw tmp6, fi(sp) // load incr2 from stack
- // or tmp7, $zero, tmp6
- BuildMI(BB, dl, TII->get(Mips::LW), Tmp6).addFrameIndex(fi).addImm(0);
- BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Mips::ZERO).addReg(Tmp6);
+ // <binop> binopres, oldval, incr2
+ // and newval, binopres, mask
+ BuildMI(BB, dl, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
+ BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
+ } else {// atomic.swap
+ // and newval, incr2, mask
+ BuildMI(BB, dl, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
}
- BuildMI(BB, dl, TII->get(Mips::AND), Newval).addReg(Tmp7).addReg(Mask);
- BuildMI(BB, dl, TII->get(Mips::AND), Tmp8).addReg(Oldval).addReg(Mask2);
- BuildMI(BB, dl, TII->get(Mips::OR), Tmp9).addReg(Tmp8).addReg(Newval);
- BuildMI(BB, dl, TII->get(Mips::SC), Tmp9).addReg(Tmp9).addReg(Addr).addImm(0);
+
+ BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal0)
+ .addReg(OldVal).addReg(Mask2);
+ BuildMI(BB, dl, TII->get(Mips::OR), StoreVal)
+ .addReg(MaskedOldVal0).addReg(NewVal);
+ BuildMI(BB, dl, TII->get(Mips::SC), Success)
+ .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, dl, TII->get(Mips::BEQ))
- .addReg(Tmp9).addReg(Mips::ZERO).addMBB(loopMBB);
- BB->addSuccessor(loopMBB);
- BB->addSuccessor(exitMBB);
-
- // exitMBB:
- // and tmp10,oldval,mask
- // srl tmp11,tmp10,shift
- // sll tmp12,tmp11,24
- // sra dest,tmp12,24
- BB = exitMBB;
+ .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
+
+ // sinkMBB:
+ // and maskedoldval1,oldval,mask
+ // srl srlres,maskedoldval1,shiftamt
+ // sll sllres,srlres,24
+ // sra dest,sllres,24
+ BB = sinkMBB;
int64_t ShiftImm = (Size == 1) ? 24 : 16;
- // reverse order
- BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
- .addReg(Tmp12).addImm(ShiftImm);
- BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp12)
- .addReg(Tmp11).addImm(ShiftImm);
- BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp11)
- .addReg(Tmp10).addReg(Shift);
- BuildMI(*BB, BB->begin(), dl, TII->get(Mips::AND), Tmp10)
- .addReg(Oldval).addReg(Mask);
+
+ BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal1)
+ .addReg(OldVal).addReg(Mask);
+ BuildMI(BB, dl, TII->get(Mips::SRLV), SrlRes)
+ .addReg(ShiftAmt).addReg(MaskedOldVal1);
+ BuildMI(BB, dl, TII->get(Mips::SLL), SllRes)
+ .addReg(SrlRes).addImm(ShiftImm);
+ BuildMI(BB, dl, TII->get(Mips::SRA), Dest)
+ .addReg(SllRes).addImm(ShiftImm);
MI->eraseFromParent(); // The instruction is gone now.
- return BB;
+ return exitMBB;
}
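// Sanity sketch (illustrative, not in the patch): the trailing sll/sra pair
// sign-extends the recovered subword; for Size == 1 (ShiftImm == 24) it
// computes the classic byte sign-extension idiom
//   int32_t SignExtendByte(uint32_t W) {
//     return (int32_t)(W << 24) >> 24;   // e.g. 0x000000FF -> -1
//   }
// and ShiftImm == 16 does the same for halfwords.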
MachineBasicBlock *
@@ -989,11 +1110,10 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
unsigned Dest = MI->getOperand(0).getReg();
unsigned Ptr = MI->getOperand(1).getReg();
- unsigned Oldval = MI->getOperand(2).getReg();
- unsigned Newval = MI->getOperand(3).getReg();
+ unsigned OldVal = MI->getOperand(2).getReg();
+ unsigned NewVal = MI->getOperand(3).getReg();
- unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
+ unsigned Success = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -1012,26 +1132,14 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
- // Get or create a temporary stack location.
- MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
- int fi = MipsFI->getAtomicFrameIndex();
- if (fi == -1) {
- fi = MF->getFrameInfo()->CreateStackObject(Size, Size, false);
- MipsFI->setAtomicFrameIndex(fi);
- }
-
// thisMBB:
// ...
- // sw newval, fi(sp) // store newval to stack
// fallthrough --> loop1MBB
-
- // Note: storing newval to stack before the loop and then loading it from
- // stack in block loop2MBB is necessary to prevent MachineLICM pass to
- // hoist "or" instruction out of the block loop2MBB.
-
- BuildMI(BB, dl, TII->get(Mips::SW))
- .addReg(Newval).addFrameIndex(fi).addImm(0);
BB->addSuccessor(loop1MBB);
+ loop1MBB->addSuccessor(exitMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(exitMBB);
// loop1MBB:
// ll dest, 0(ptr)
@@ -1039,27 +1147,20 @@ MipsTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
BB = loop1MBB;
BuildMI(BB, dl, TII->get(Mips::LL), Dest).addReg(Ptr).addImm(0);
BuildMI(BB, dl, TII->get(Mips::BNE))
- .addReg(Dest).addReg(Oldval).addMBB(exitMBB);
- BB->addSuccessor(exitMBB);
- BB->addSuccessor(loop2MBB);
+ .addReg(Dest).addReg(OldVal).addMBB(exitMBB);
// loop2MBB:
- // lw tmp2, fi(sp) // load newval from stack
- // or tmp1, $0, tmp2
- // sc tmp1, 0(ptr)
- // beq tmp1, $0, loop1MBB
+ // sc success, newval, 0(ptr)
+ // beq success, $0, loop1MBB
BB = loop2MBB;
- BuildMI(BB, dl, TII->get(Mips::LW), Tmp2).addFrameIndex(fi).addImm(0);
- BuildMI(BB, dl, TII->get(Mips::OR), Tmp1).addReg(Mips::ZERO).addReg(Tmp2);
- BuildMI(BB, dl, TII->get(Mips::SC), Tmp1).addReg(Tmp1).addReg(Ptr).addImm(0);
+ BuildMI(BB, dl, TII->get(Mips::SC), Success)
+ .addReg(NewVal).addReg(Ptr).addImm(0);
BuildMI(BB, dl, TII->get(Mips::BEQ))
- .addReg(Tmp1).addReg(Mips::ZERO).addMBB(loop1MBB);
- BB->addSuccessor(loop1MBB);
- BB->addSuccessor(exitMBB);
+ .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
MI->eraseFromParent(); // The instruction is gone now.
- return BB;
+ return exitMBB;
}
MachineBasicBlock *
@@ -1077,36 +1178,39 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
unsigned Dest = MI->getOperand(0).getReg();
unsigned Ptr = MI->getOperand(1).getReg();
- unsigned Oldval = MI->getOperand(2).getReg();
- unsigned Newval = MI->getOperand(3).getReg();
+ unsigned CmpVal = MI->getOperand(2).getReg();
+ unsigned NewVal = MI->getOperand(3).getReg();
- unsigned Addr = RegInfo.createVirtualRegister(RC);
- unsigned Shift = RegInfo.createVirtualRegister(RC);
+ unsigned AlignedAddr = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftAmt = RegInfo.createVirtualRegister(RC);
unsigned Mask = RegInfo.createVirtualRegister(RC);
unsigned Mask2 = RegInfo.createVirtualRegister(RC);
- unsigned Oldval2 = RegInfo.createVirtualRegister(RC);
- unsigned Oldval3 = RegInfo.createVirtualRegister(RC);
- unsigned Oldval4 = RegInfo.createVirtualRegister(RC);
- unsigned Newval2 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp1 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp2 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp3 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp4 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp5 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp6 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp7 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp8 = RegInfo.createVirtualRegister(RC);
- unsigned Tmp9 = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftedCmpVal = RegInfo.createVirtualRegister(RC);
+ unsigned OldVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal0 = RegInfo.createVirtualRegister(RC);
+ unsigned ShiftedNewVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned PtrLSB2 = RegInfo.createVirtualRegister(RC);
+ unsigned MaskUpper = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedCmpVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedNewVal = RegInfo.createVirtualRegister(RC);
+ unsigned MaskedOldVal1 = RegInfo.createVirtualRegister(RC);
+ unsigned StoreVal = RegInfo.createVirtualRegister(RC);
+ unsigned SrlRes = RegInfo.createVirtualRegister(RC);
+ unsigned SllRes = RegInfo.createVirtualRegister(RC);
+ unsigned Success = RegInfo.createVirtualRegister(RC);
// insert new blocks after the current block
const BasicBlock *LLVM_BB = BB->getBasicBlock();
MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineFunction::iterator It = BB;
++It;
MF->insert(It, loop1MBB);
MF->insert(It, loop2MBB);
+ MF->insert(It, sinkMBB);
MF->insert(It, exitMBB);
// Transfer the remainder of BB and its successor edges to exitMBB.
@@ -1115,76 +1219,90 @@ MipsTargetLowering::EmitAtomicCmpSwapPartword(MachineInstr *MI,
BB->end());
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+ BB->addSuccessor(loop1MBB);
+ loop1MBB->addSuccessor(sinkMBB);
+ loop1MBB->addSuccessor(loop2MBB);
+ loop2MBB->addSuccessor(loop1MBB);
+ loop2MBB->addSuccessor(sinkMBB);
+ sinkMBB->addSuccessor(exitMBB);
+
+ // FIXME: computation of newval2 can be moved to loop2MBB.
// thisMBB:
- // addiu tmp1,$0,-4 # 0xfffffffc
- // and addr,ptr,tmp1
- // andi tmp2,ptr,3
- // sll shift,tmp2,3
- // ori tmp3,$0,255 # 0xff
- // sll mask,tmp3,shift
+ // addiu masklsb2,$0,-4 # 0xfffffffc
+ // and alignedaddr,ptr,masklsb2
+ // andi ptrlsb2,ptr,3
+ // sll shiftamt,ptrlsb2,3
+ // ori maskupper,$0,255 # 0xff
+ // sll mask,maskupper,shiftamt
// nor mask2,$0,mask
- // andi tmp4,oldval,255
- // sll oldval2,tmp4,shift
- // andi tmp5,newval,255
- // sll newval2,tmp5,shift
+ // andi maskedcmpval,cmpval,255
+ // sll shiftedcmpval,maskedcmpval,shiftamt
+ // andi maskednewval,newval,255
+ // sll shiftednewval,maskednewval,shiftamt
int64_t MaskImm = (Size == 1) ? 255 : 65535;
- BuildMI(BB, dl, TII->get(Mips::ADDiu), Tmp1).addReg(Mips::ZERO).addImm(-4);
- BuildMI(BB, dl, TII->get(Mips::AND), Addr).addReg(Ptr).addReg(Tmp1);
- BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp2).addReg(Ptr).addImm(3);
- BuildMI(BB, dl, TII->get(Mips::SLL), Shift).addReg(Tmp2).addImm(3);
- BuildMI(BB, dl, TII->get(Mips::ORi), Tmp3).addReg(Mips::ZERO).addImm(MaskImm);
- BuildMI(BB, dl, TII->get(Mips::SLL), Mask).addReg(Tmp3).addReg(Shift);
+ BuildMI(BB, dl, TII->get(Mips::ADDiu), MaskLSB2)
+ .addReg(Mips::ZERO).addImm(-4);
+ BuildMI(BB, dl, TII->get(Mips::AND), AlignedAddr)
+ .addReg(Ptr).addReg(MaskLSB2);
+ BuildMI(BB, dl, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
+ BuildMI(BB, dl, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+ BuildMI(BB, dl, TII->get(Mips::ORi), MaskUpper)
+ .addReg(Mips::ZERO).addImm(MaskImm);
+ BuildMI(BB, dl, TII->get(Mips::SLLV), Mask)
+ .addReg(ShiftAmt).addReg(MaskUpper);
BuildMI(BB, dl, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
- BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp4).addReg(Oldval).addImm(MaskImm);
- BuildMI(BB, dl, TII->get(Mips::SLL), Oldval2).addReg(Tmp4).addReg(Shift);
- BuildMI(BB, dl, TII->get(Mips::ANDi), Tmp5).addReg(Newval).addImm(MaskImm);
- BuildMI(BB, dl, TII->get(Mips::SLL), Newval2).addReg(Tmp5).addReg(Shift);
- BB->addSuccessor(loop1MBB);
+ BuildMI(BB, dl, TII->get(Mips::ANDi), MaskedCmpVal)
+ .addReg(CmpVal).addImm(MaskImm);
+ BuildMI(BB, dl, TII->get(Mips::SLLV), ShiftedCmpVal)
+ .addReg(ShiftAmt).addReg(MaskedCmpVal);
+ BuildMI(BB, dl, TII->get(Mips::ANDi), MaskedNewVal)
+ .addReg(NewVal).addImm(MaskImm);
+ BuildMI(BB, dl, TII->get(Mips::SLLV), ShiftedNewVal)
+ .addReg(ShiftAmt).addReg(MaskedNewVal);
// loop1MBB:
- // ll oldval3,0(addr)
- // and oldval4,oldval3,mask
- // bne oldval4,oldval2,exitMBB
+ // ll oldval,0(alignedaddr)
+ // and maskedoldval0,oldval,mask
+ // bne maskedoldval0,shiftedcmpval,sinkMBB
BB = loop1MBB;
- BuildMI(BB, dl, TII->get(Mips::LL), Oldval3).addReg(Addr).addImm(0);
- BuildMI(BB, dl, TII->get(Mips::AND), Oldval4).addReg(Oldval3).addReg(Mask);
+ BuildMI(BB, dl, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
+ BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal0)
+ .addReg(OldVal).addReg(Mask);
BuildMI(BB, dl, TII->get(Mips::BNE))
- .addReg(Oldval4).addReg(Oldval2).addMBB(exitMBB);
- BB->addSuccessor(exitMBB);
- BB->addSuccessor(loop2MBB);
+ .addReg(MaskedOldVal0).addReg(ShiftedCmpVal).addMBB(sinkMBB);
// loop2MBB:
- // and tmp6,oldval3,mask2
- // or tmp7,tmp6,newval2
- // sc tmp7,0(addr)
- // beq tmp7,$0,loop1MBB
+ // and maskedoldval1,oldval,mask2
+ // or storeval,maskedoldval1,shiftednewval
+ // sc success,storeval,0(alignedaddr)
+ // beq success,$0,loop1MBB
BB = loop2MBB;
- BuildMI(BB, dl, TII->get(Mips::AND), Tmp6).addReg(Oldval3).addReg(Mask2);
- BuildMI(BB, dl, TII->get(Mips::OR), Tmp7).addReg(Tmp6).addReg(Newval2);
- BuildMI(BB, dl, TII->get(Mips::SC), Tmp7)
- .addReg(Tmp7).addReg(Addr).addImm(0);
+ BuildMI(BB, dl, TII->get(Mips::AND), MaskedOldVal1)
+ .addReg(OldVal).addReg(Mask2);
+ BuildMI(BB, dl, TII->get(Mips::OR), StoreVal)
+ .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
+ BuildMI(BB, dl, TII->get(Mips::SC), Success)
+ .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
BuildMI(BB, dl, TII->get(Mips::BEQ))
- .addReg(Tmp7).addReg(Mips::ZERO).addMBB(loop1MBB);
- BB->addSuccessor(loop1MBB);
- BB->addSuccessor(exitMBB);
+ .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
- // exitMBB:
- // srl tmp8,oldval4,shift
- // sll tmp9,tmp8,24
- // sra dest,tmp9,24
- BB = exitMBB;
+ // sinkMBB:
+ // srl srlres,maskedoldval0,shiftamt
+ // sll sllres,srlres,24
+ // sra dest,sllres,24
+ BB = sinkMBB;
int64_t ShiftImm = (Size == 1) ? 24 : 16;
- // reverse order
- BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRA), Dest)
- .addReg(Tmp9).addImm(ShiftImm);
- BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SLL), Tmp9)
- .addReg(Tmp8).addImm(ShiftImm);
- BuildMI(*BB, BB->begin(), dl, TII->get(Mips::SRL), Tmp8)
- .addReg(Oldval4).addReg(Shift);
+
+ BuildMI(BB, dl, TII->get(Mips::SRLV), SrlRes)
+ .addReg(ShiftAmt).addReg(MaskedOldVal0);
+ BuildMI(BB, dl, TII->get(Mips::SLL), SllRes)
+ .addReg(SrlRes).addImm(ShiftImm);
+ BuildMI(BB, dl, TII->get(Mips::SRA), Dest)
+ .addReg(SllRes).addImm(ShiftImm);
MI->eraseFromParent(); // The instruction is gone now.
- return BB;
+ return exitMBB;
}
//===----------------------------------------------------------------------===//
@@ -1267,9 +1385,9 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
// FIXME there isn't actually debug info here
DebugLoc dl = Op.getDebugLoc();
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
- if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
+ if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
SDVTList VTs = DAG.getVTList(MVT::i32);
MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering();
@@ -1292,21 +1410,26 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
}
- SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
- MipsII::MO_GOT);
- GA = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, GA);
- SDValue ResNode = DAG.getLoad(MVT::i32, dl,
+ EVT ValTy = Op.getValueType();
+ bool HasGotOfst = (GV->hasInternalLinkage() ||
+ (GV->hasLocalLinkage() && !isa<Function>(GV)));
+ unsigned GotFlag = IsN64 ?
+ (HasGotOfst ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT_DISP) :
+ MipsII::MO_GOT;
+ SDValue GA = DAG.getTargetGlobalAddress(GV, dl, ValTy, 0, GotFlag);
+ GA = DAG.getNode(MipsISD::WrapperPIC, dl, ValTy, GA);
+ SDValue ResNode = DAG.getLoad(ValTy, dl,
DAG.getEntryNode(), GA, MachinePointerInfo(),
false, false, 0);
// For functions and global targets that are not internally linked, a
// single load from the GOT/GP is all that PIC requires.
- if (!GV->hasInternalLinkage() &&
- (!GV->hasLocalLinkage() || isa<Function>(GV)))
+ if (!HasGotOfst)
return ResNode;
- SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
- MipsII::MO_ABS_LO);
- SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, GALo);
- return DAG.getNode(ISD::ADD, dl, MVT::i32, ResNode, Lo);
+ SDValue GALo = DAG.getTargetGlobalAddress(GV, dl, ValTy, 0,
+ IsN64 ? MipsII::MO_GOT_OFST :
+ MipsII::MO_ABS_LO);
+ SDValue Lo = DAG.getNode(MipsISD::Lo, dl, ValTy, GALo);
+ return DAG.getNode(ISD::ADD, dl, ValTy, ResNode, Lo);
}
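// Summary (illustrative): under N64 PIC, symbols with a GOT offset
// (internal linkage, or local non-function globals) are addressed as a
// got_page load plus a got_ofst add, while all other symbols take a single
// got_disp load; O32 PIC keeps the plain MO_GOT load, adding an MO_ABS_LO
// term for the local-symbol case.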
SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
@@ -1361,11 +1484,11 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Argument;
- Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
+ Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
Args.push_back(Entry);
std::pair<SDValue, SDValue> CallResult =
LowerCallTo(DAG.getEntryNode(),
- (const Type *) Type::getInt32Ty(*DAG.getContext()),
+ (Type *) Type::getInt32Ty(*DAG.getContext()),
false, false, false, false, 0, CallingConv::C, false, true,
DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG,
dl);
@@ -1557,6 +1680,25 @@ LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
return FrameAddr;
}
+// TODO: set SType according to the desired memory barrier behavior.
+SDValue MipsTargetLowering::LowerMEMBARRIER(SDValue Op,
+ SelectionDAG& DAG) const {
+ unsigned SType = 0;
+ DebugLoc dl = Op.getDebugLoc();
+ return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0),
+ DAG.getConstant(SType, MVT::i32));
+}
+
+SDValue MipsTargetLowering::LowerATOMIC_FENCE(SDValue Op,
+ SelectionDAG& DAG) const {
+ // FIXME: Need pseudo-fence for 'singlethread' fences
+ // FIXME: Set SType for weaker fences where supported/appropriate.
+ unsigned SType = 0;
+ DebugLoc dl = Op.getDebugLoc();
+ return DAG.getNode(MipsISD::Sync, dl, MVT::Other, Op.getOperand(0),
+ DAG.getConstant(SType, MVT::i32));
+}
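+
+// Note (an assumption, not stated in the patch): SType 0 selects the
+// strongest variant of the MIPS "sync" instruction, so both hooks above
+// currently emit a full barrier; e.g. an IR "fence seq_cst" ends up as
+// "sync 0".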
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -1679,55 +1821,109 @@ static const unsigned O32IntRegs[] = {
Mips::A0, Mips::A1, Mips::A2, Mips::A3
};
+// Return next O32 integer argument register.
+static unsigned getNextIntArgReg(unsigned Reg) {
+ assert((Reg == Mips::A0) || (Reg == Mips::A2));
+ return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
+}
+
// Write ByVal Arg to arg registers and stack.
static void
-WriteByValArg(SDValue& Chain, DebugLoc dl,
+WriteByValArg(SDValue& ByValChain, SDValue Chain, DebugLoc dl,
SmallVector<std::pair<unsigned, SDValue>, 16>& RegsToPass,
SmallVector<SDValue, 8>& MemOpChains, int& LastFI,
MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
const CCValAssign &VA, const ISD::ArgFlagsTy& Flags,
- MVT PtrType) {
- unsigned FirstWord = VA.getLocMemOffset() / 4;
- unsigned NumWords = (Flags.getByValSize() + 3) / 4;
- unsigned LastWord = FirstWord + NumWords;
- unsigned CurWord;
-
- // copy the first 4 words of byval arg to registers A0 - A3
- for (CurWord = FirstWord; CurWord < std::min(LastWord, O32IntRegsSize);
- ++CurWord) {
+ MVT PtrType, bool isLittle) {
+ unsigned LocMemOffset = VA.getLocMemOffset();
+ unsigned Offset = 0;
+ uint32_t RemainingSize = Flags.getByValSize();
+ unsigned ByValAlign = Flags.getByValAlign();
+
+ // Copy the first 4 words of byval arg to registers A0 - A3.
+ // FIXME: Use a stricter alignment if it enables better optimization in passes
+ // run later.
+ for (; RemainingSize >= 4 && LocMemOffset < 4 * 4;
+ Offset += 4, RemainingSize -= 4, LocMemOffset += 4) {
SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
- DAG.getConstant((CurWord - FirstWord) * 4,
- MVT::i32));
+ DAG.getConstant(Offset, MVT::i32));
SDValue LoadVal = DAG.getLoad(MVT::i32, dl, Chain, LoadPtr,
MachinePointerInfo(),
- false, false, 0);
+ false, false, std::min(ByValAlign,
+ (unsigned)4));
MemOpChains.push_back(LoadVal.getValue(1));
- unsigned DstReg = O32IntRegs[CurWord];
+ unsigned DstReg = O32IntRegs[LocMemOffset / 4];
RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
}
- // copy remaining part of byval arg to stack.
- if (CurWord < LastWord) {
- unsigned SizeInBytes = (LastWord - CurWord) * 4;
- SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
- DAG.getConstant((CurWord - FirstWord) * 4,
- MVT::i32));
- LastFI = MFI->CreateFixedObject(SizeInBytes, CurWord * 4, true);
- SDValue Dst = DAG.getFrameIndex(LastFI, PtrType);
- Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
- DAG.getConstant(SizeInBytes, MVT::i32),
- /*Align*/4,
- /*isVolatile=*/false, /*AlwaysInline=*/false,
- MachinePointerInfo(0), MachinePointerInfo(0));
- MemOpChains.push_back(Chain);
+ if (RemainingSize == 0)
+ return;
+
+ // If there still is a register available for argument passing, write the
+ // remaining part of the structure to it using subword loads and shifts.
+ if (LocMemOffset < 4 * 4) {
+ assert(RemainingSize <= 3 && RemainingSize >= 1 &&
+ "There must be one to three bytes remaining.");
+ unsigned LoadSize = (RemainingSize == 3 ? 2 : RemainingSize);
+ SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+ DAG.getConstant(Offset, MVT::i32));
+ unsigned Alignment = std::min(ByValAlign, (unsigned)4);
+ SDValue LoadVal = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
+ LoadPtr, MachinePointerInfo(),
+ MVT::getIntegerVT(LoadSize * 8), false,
+ false, Alignment);
+ MemOpChains.push_back(LoadVal.getValue(1));
+
+ // If target is big endian, shift it to the most significant half-word or
+ // byte.
+ if (!isLittle)
+ LoadVal = DAG.getNode(ISD::SHL, dl, MVT::i32, LoadVal,
+ DAG.getConstant(32 - LoadSize * 8, MVT::i32));
+
+ Offset += LoadSize;
+ RemainingSize -= LoadSize;
+
+ // Read second subword if necessary.
+ if (RemainingSize != 0) {
+ assert(RemainingSize == 1 && "There must be one byte remaining.");
+ LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+ DAG.getConstant(Offset, MVT::i32));
+ unsigned Alignment = std::min(ByValAlign, (unsigned)2);
+ SDValue Subword = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
+ LoadPtr, MachinePointerInfo(),
+ MVT::i8, false, false, Alignment);
+ MemOpChains.push_back(Subword.getValue(1));
+ // Insert the loaded byte into LoadVal.
+ // FIXME: Use INS if supported by target.
+ unsigned ShiftAmt = isLittle ? 16 : 8;
+ SDValue Shift = DAG.getNode(ISD::SHL, dl, MVT::i32, Subword,
+ DAG.getConstant(ShiftAmt, MVT::i32));
+ LoadVal = DAG.getNode(ISD::OR, dl, MVT::i32, LoadVal, Shift);
+ }
+
+ unsigned DstReg = O32IntRegs[LocMemOffset / 4];
+ RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
+ return;
}
+
+ // Create a fixed object on the stack at offset LocMemOffset and copy the
+ // remaining part of the byval arg to it using memcpy.
+ SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
+ DAG.getConstant(Offset, MVT::i32));
+ LastFI = MFI->CreateFixedObject(RemainingSize, LocMemOffset, true);
+ SDValue Dst = DAG.getFrameIndex(LastFI, PtrType);
+ ByValChain = DAG.getMemcpy(ByValChain, dl, Dst, Src,
+ DAG.getConstant(RemainingSize, MVT::i32),
+ std::min(ByValAlign, (unsigned)4),
+ /*isVolatile=*/false, /*AlwaysInline=*/false,
+ MachinePointerInfo(0), MachinePointerInfo(0));
}
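// Worked example (illustrative): a 7-byte byval arg at LocMemOffset 8 on a
// little-endian target: bytes 0-3 are passed in A2 via a word load, leaving
// RemainingSize == 3; the subword path then loads a zero-extended halfword
// (LoadSize == 2) and a byte, shifts the byte left by 16, ORs it in, and
// passes the combined word in A3. Nothing is left over for the memcpy path.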
/// LowerCall - function arguments are copied from virtual registers to
/// (physical regs)/(stack frame); CALLSEQ_START and CALLSEQ_END are emitted.
/// TODO: isTailCall.
SDValue
-MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+MipsTargetLowering::LowerCall(SDValue InChain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool &isTailCall,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -1757,8 +1953,13 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NextStackOffset = CCInfo.getNextStackOffset();
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NextStackOffset,
- true));
+ // Chain is the output chain of the last Load/Store or CopyToReg node.
+ // ByValChain is the output chain of the last Memcpy node created for copying
+ // byval arguments to the stack.
+ SDValue Chain, CallSeqStart, ByValChain;
+ SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
+ Chain = CallSeqStart = DAG.getCALLSEQ_START(InChain, NextStackOffsetVal);
+ ByValChain = InChain;
// If this is the first call, create a stack frame object that points to
// a location to which .cprestore saves $gp.
@@ -1818,8 +2019,10 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
Arg, DAG.getConstant(1, MVT::i32));
if (!Subtarget->isLittle())
std::swap(Lo, Hi);
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
- RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
+ unsigned LocRegLo = VA.getLocReg();
+ unsigned LocRegHigh = getNextIntArgReg(LocRegLo);
+ RegsToPass.push_back(std::make_pair(LocRegLo, Lo));
+ RegsToPass.push_back(std::make_pair(LocRegHigh, Hi));
continue;
}
}
@@ -1852,8 +2055,8 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
"No support for ByVal args by ABIs other than O32 yet.");
assert(Flags.getByValSize() &&
"ByVal args of size 0 should have been ignored by front-end.");
- WriteByValArg(Chain, dl, RegsToPass, MemOpChains, LastFI, MFI, DAG, Arg,
- VA, Flags, getPointerTy());
+ WriteByValArg(ByValChain, Chain, dl, RegsToPass, MemOpChains, LastFI, MFI,
+ DAG, Arg, VA, Flags, getPointerTy(), Subtarget->isLittle());
continue;
}
@@ -1875,6 +2078,12 @@ MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
if (LastFI)
MipsFI->extendOutArgFIRange(FirstFI, LastFI);
+ // If a memcpy has been created to copy a byval arg to the stack, replace the
+ // chain input of CallSeqStart with ByValChain.
+ if (InChain != ByValChain)
+ DAG.UpdateNodeOperands(CallSeqStart.getNode(), ByValChain,
+ NextStackOffsetVal);
+
// Transform all store nodes into one single node because all store
// nodes are independent of each other.
if (!MemOpChains.empty())
@@ -2071,12 +2280,13 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
if (RegVT == MVT::i32)
RC = Mips::CPURegsRegisterClass;
+ else if (RegVT == MVT::i64)
+ RC = Mips::CPU64RegsRegisterClass;
else if (RegVT == MVT::f32)
RC = Mips::FGR32RegisterClass;
- else if (RegVT == MVT::f64) {
- if (!Subtarget->isSingleFloat())
- RC = Mips::AFGR64RegisterClass;
- } else
+ else if (RegVT == MVT::f64)
+ RC = HasMips64 ? Mips::FGR64RegisterClass : Mips::AFGR64RegisterClass;
+ else
llvm_unreachable("RegVT not supported by FormalArguments Lowering");
// Transform the arguments stored on
@@ -2105,7 +2315,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
ArgValue = DAG.getNode(ISD::BITCAST, dl, MVT::f32, ArgValue);
if (RegVT == MVT::i32 && VA.getValVT() == MVT::f64) {
unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(),
- VA.getLocReg()+1, RC);
+ getNextIntArgReg(ArgReg), RC);
SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT);
if (!Subtarget->isLittle())
std::swap(ArgValue, ArgValue2);
@@ -2313,7 +2523,7 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
// but allow it at the lowest weight.
if (CallOperandVal == NULL)
return CW_Default;
- const Type *type = CallOperandVal->getType();
+ Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index bda26a2..4be3fed5 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -81,7 +81,12 @@ namespace llvm {
WrapperPIC,
- DynAlloc
+ DynAlloc,
+
+ Sync,
+
+ Ext,
+ Ins
};
}
@@ -93,6 +98,8 @@ namespace llvm {
public:
explicit MipsTargetLowering(MipsTargetMachine &TM);
+ virtual bool allowsUnalignedMemoryAccesses(EVT VT) const;
+
/// LowerOperation - Provide custom lowering hooks for some operations.
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
@@ -101,13 +108,14 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
/// getSetCCResultType - get the ISD::SETCC result ValueType
- MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ EVT getSetCCResultType(EVT VT) const;
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
private:
// Subtarget Info
const MipsSubtarget *Subtarget;
-
+
+ bool HasMips64, IsN64;
// Lower Operand helpers
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
@@ -128,6 +136,8 @@ namespace llvm {
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG& DAG) const;
+ SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 021c167..2fb9d18 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -27,7 +27,7 @@
def SDT_MipsFPBrcond : SDTypeProfile<0, 2, [SDTCisInt<0>,
SDTCisVT<1, OtherVT>]>;
def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
- SDTCisInt<2>]>;
+ SDTCisVT<2, i32>]>;
def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
SDTCisSameAs<1, 2>]>;
def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
@@ -35,12 +35,11 @@ def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
SDTCisSameAs<1, 2>]>;
def SDT_MipsExtractElementF64 : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVT<1, f64>,
- SDTCisVT<0, i32>]>;
+ SDTCisVT<2, i32>]>;
def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp, [SDNPOutGlue]>;
def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
-def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInGlue]>;
def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
[SDNPHasChain, SDNPOptInGlue]>;
def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
@@ -55,10 +54,10 @@ let PrintMethod = "printFCCOperand" in
// Feature predicates.
//===----------------------------------------------------------------------===//
-def In32BitMode : Predicate<"!Subtarget.isFP64bit()">;
+def IsFP64bit : Predicate<"Subtarget.isFP64bit()">;
+def NotFP64bit : Predicate<"!Subtarget.isFP64bit()">;
def IsSingleFloat : Predicate<"Subtarget.isSingleFloat()">;
def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">;
-def IsNotMipsI : Predicate<"!Subtarget.isMips1()">;
//===----------------------------------------------------------------------===//
// Instruction Class Templates
@@ -74,97 +73,87 @@ def IsNotMipsI : Predicate<"!Subtarget.isMips1()">;
// Only S32 and D32 are supported right now.
//===----------------------------------------------------------------------===//
-multiclass FFR1_1<bits<6> funct, string asmstr>
-{
- def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
- !strconcat(asmstr, ".s\t$fd, $fs"), []>;
-
- def _D32 : FFR<0x11, funct, 0x1, (outs FGR32:$fd), (ins AFGR64:$fs),
- !strconcat(asmstr, ".d\t$fd, $fs"), []>, Requires<[In32BitMode]>;
+// FP load.
+class FPLoad<bits<6> op, string opstr, PatFrag FOp, RegisterClass RC,
+ Operand MemOpnd>:
+ FFI<op, (outs RC:$ft), (ins MemOpnd:$base),
+ !strconcat(opstr, "\t$ft, $base"), [(set RC:$ft, (FOp addr:$base))]>;
+
+// FP store.
+class FPStore<bits<6> op, string opstr, PatFrag FOp, RegisterClass RC,
+ Operand MemOpnd>:
+ FFI<op, (outs), (ins RC:$ft, MemOpnd:$base),
+ !strconcat(opstr, "\t$ft, $base"), [(store RC:$ft, addr:$base)]>;
+
+// Instructions that convert an FP value to 32-bit fixed point.
+multiclass FFR1_W_M<bits<6> funct, string opstr> {
+ def _S : FFR1<funct, 16, opstr, "w.s", FGR32, FGR32>;
+ def _D32 : FFR1<funct, 17, opstr, "w.d", FGR32, AFGR64>,
+ Requires<[NotFP64bit]>;
+ def _D64 : FFR1<funct, 17, opstr, "w.d", FGR32, FGR64>,
+ Requires<[IsFP64bit]>;
}
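// Expansion sketch (illustrative): "defm TRUNC_W : FFR1_W_M<0xd, "trunc">;"
// below yields TRUNC_W_S plus TRUNC_W_D32/TRUNC_W_D64, the 32- and 64-bit
// FPU encodings of trunc.w.d, selected by the NotFP64bit/IsFP64bit
// predicates.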
-multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp>
-{
- def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
- !strconcat(asmstr, ".s\t$fd, $fs"),
- [(set FGR32:$fd, (FOp FGR32:$fs))]>;
-
- def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
- !strconcat(asmstr, ".d\t$fd, $fs"),
- [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>;
+// Instructions that convert an FP value to 64-bit fixed point.
+let Predicates = [IsFP64bit] in
+multiclass FFR1_L_M<bits<6> funct, string opstr> {
+ def _S : FFR1<funct, 16, opstr, "l.s", FGR64, FGR32>;
+ def _D64 : FFR1<funct, 17, opstr, "l.d", FGR64, FGR64>;
}
-class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc,
- RegisterClass RcDst, string asmstr>:
- FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs),
- !strconcat(asmstr, "\t$fd, $fs"), []>;
-
+// FP-to-FP conversion instructions.
+multiclass FFR1P_M<bits<6> funct, string opstr, SDNode OpNode> {
+ def _S : FFR1P<funct, 16, opstr, "s", FGR32, FGR32, OpNode>;
+ def _D32 : FFR1P<funct, 17, opstr, "d", AFGR64, AFGR64, OpNode>,
+ Requires<[NotFP64bit]>;
+ def _D64 : FFR1P<funct, 17, opstr, "d", FGR64, FGR64, OpNode>,
+ Requires<[IsFP64bit]>;
+}
-multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp, bit isComm = 0> {
+multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> {
let isCommutable = isComm in {
- def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd),
- (ins FGR32:$fs, FGR32:$ft),
- !strconcat(asmstr, ".s\t$fd, $fs, $ft"),
- [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>;
-
- def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd),
- (ins AFGR64:$fs, AFGR64:$ft),
- !strconcat(asmstr, ".d\t$fd, $fs, $ft"),
- [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>,
- Requires<[In32BitMode]>;
+ def _S : FFR2P<funct, 16, opstr, "s", FGR32, OpNode>;
+ def _D32 : FFR2P<funct, 17, opstr, "d", AFGR64, OpNode>,
+ Requires<[NotFP64bit]>;
+ def _D64 : FFR2P<funct, 17, opstr, "d", FGR64, OpNode>,
+ Requires<[IsFP64bit]>;
}
}
//===----------------------------------------------------------------------===//
// Floating Point Instructions
//===----------------------------------------------------------------------===//
+defm ROUND_W : FFR1_W_M<0xc, "round">;
+defm ROUND_L : FFR1_L_M<0x8, "round">;
+defm TRUNC_W : FFR1_W_M<0xd, "trunc">;
+defm TRUNC_L : FFR1_L_M<0x9, "trunc">;
+defm CEIL_W : FFR1_W_M<0xe, "ceil">;
+defm CEIL_L : FFR1_L_M<0xa, "ceil">;
+defm FLOOR_W : FFR1_W_M<0xf, "floor">;
+defm FLOOR_L : FFR1_L_M<0xb, "floor">;
+defm CVT_W : FFR1_W_M<0x24, "cvt">;
+defm CVT_L : FFR1_L_M<0x25, "cvt">;
+
+def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>;
+
+let Predicates = [NotFP64bit] in {
+ def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>;
+ def CVT_D32_W : FFR1<0x21, 20, "cvt", "d.w", AFGR64, FGR32>;
+ def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>;
+}
-let ft = 0 in {
- defm FLOOR_W : FFR1_1<0b001111, "floor.w">;
- defm CEIL_W : FFR1_1<0b001110, "ceil.w">;
- defm ROUND_W : FFR1_1<0b001100, "round.w">;
- defm TRUNC_W : FFR1_1<0b001101, "trunc.w">;
- defm CVTW : FFR1_1<0b100100, "cvt.w">;
-
- defm FABS : FFR1_2<0b000101, "abs", fabs>;
- defm FNEG : FFR1_2<0b000111, "neg", fneg>;
- defm FSQRT : FFR1_2<0b000100, "sqrt", fsqrt>;
-
- /// Convert to Single Precison
- def CVTS_W32 : FFR1_3<0b100000, 0x2, FGR32, FGR32, "cvt.s.w">;
-
- let Predicates = [IsNotSingleFloat] in {
- /// Ceil to long signed integer
- def CEIL_LS : FFR1_3<0b001010, 0x0, FGR32, FGR32, "ceil.l">;
- def CEIL_LD : FFR1_3<0b001010, 0x1, AFGR64, AFGR64, "ceil.l">;
-
- /// Round to long signed integer
- def ROUND_LS : FFR1_3<0b001000, 0x0, FGR32, FGR32, "round.l">;
- def ROUND_LD : FFR1_3<0b001000, 0x1, AFGR64, AFGR64, "round.l">;
-
- /// Floor to long signed integer
- def FLOOR_LS : FFR1_3<0b001011, 0x0, FGR32, FGR32, "floor.l">;
- def FLOOR_LD : FFR1_3<0b001011, 0x1, AFGR64, AFGR64, "floor.l">;
-
- /// Trunc to long signed integer
- def TRUNC_LS : FFR1_3<0b001001, 0x0, FGR32, FGR32, "trunc.l">;
- def TRUNC_LD : FFR1_3<0b001001, 0x1, AFGR64, AFGR64, "trunc.l">;
-
- /// Convert to long signed integer
- def CVTL_S : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">;
- def CVTL_D : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">;
-
- /// Convert to Double Precison
- def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">;
- def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">;
- def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">;
-
- /// Convert to Single Precison
- def CVTS_D32 : FFR1_3<0b100000, 0x1, FGR32, AFGR64, "cvt.s.d">;
- def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">;
- }
+let Predicates = [IsFP64bit] in {
+ def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>;
+ def CVT_S_L : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>;
+ def CVT_D64_W : FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>;
+ def CVT_D64_S : FFR1<0x21, 16, "cvt", "d.s", FGR64, FGR32>;
+ def CVT_D64_L : FFR1<0x21, 21, "cvt", "d.l", FGR64, FGR64>;
}
+defm FABS : FFR1P_M<0x5, "abs", fabs>;
+defm FNEG : FFR1P_M<0x7, "neg", fneg>;
+defm FSQRT : FFR1P_M<0x4, "sqrt", fsqrt>;
+
// The odd-numbered registers are only referenced when doing loads,
// stores, and moves between floating-point and integer registers.
// When defining instructions, we reference all 32-bit registers,
@@ -178,37 +167,46 @@ let fd = 0 in {
"ctc1\t$fs, $rt", []>;
def MFC1 : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs),
- "mfc1\t$rt, $fs", []>;
+ "mfc1\t$rt, $fs",
+ [(set CPURegs:$rt, (bitconvert FGR32:$fs))]>;
def MTC1 : FFR<0x11, 0x00, 0x04, (outs FGR32:$fs), (ins CPURegs:$rt),
- "mtc1\t$rt, $fs", []>;
+ "mtc1\t$rt, $fs",
+ [(set FGR32:$fs, (bitconvert CPURegs:$rt))]>;
}
-def FMOV_S32 : FFR<0x11, 0b000110, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
- "mov.s\t$fd, $fs", []>;
-def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
- "mov.d\t$fd, $fs", []>;
+def FMOV_S : FFR1<0x6, 16, "mov", "s", FGR32, FGR32>;
+def FMOV_D32 : FFR1<0x6, 17, "mov", "d", AFGR64, AFGR64>,
+ Requires<[NotFP64bit]>;
+def FMOV_D64 : FFR1<0x6, 17, "mov", "d", FGR64, FGR64>,
+ Requires<[IsFP64bit]>;
/// Floating Point Memory Instructions
-let Predicates = [IsNotSingleFloat, IsNotMipsI] in {
- def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr),
- "ldc1\t$ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
-
- def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr),
- "sdc1\t$ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
+let Predicates = [IsN64] in {
+ def LWC1_P8 : FPLoad<0x31, "lwc1", load, FGR32, mem64>;
+ def SWC1_P8 : FPStore<0x39, "swc1", store, FGR32, mem64>;
+ def LDC164_P8 : FPLoad<0x35, "ldc1", load, FGR64, mem64>;
+ def SDC164_P8 : FPStore<0x3d, "sdc1", store, FGR64, mem64>;
}
-// LWC1 and SWC1 can always be emitted with odd registers.
-def LWC1 : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1\t$ft, $addr",
- [(set FGR32:$ft, (load addr:$addr))]>;
-def SWC1 : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr),
- "swc1\t$ft, $addr", [(store FGR32:$ft, addr:$addr)]>;
+let Predicates = [NotN64] in {
+ def LWC1 : FPLoad<0x31, "lwc1", load, FGR32, mem>;
+ def SWC1 : FPStore<0x39, "swc1", store, FGR32, mem>;
+ let Predicates = [HasMips64] in {
+ def LDC164 : FPLoad<0x35, "ldc1", load, FGR64, mem>;
+ def SDC164 : FPStore<0x3d, "sdc1", store, FGR64, mem>;
+ }
+ let Predicates = [NotMips64] in {
+ def LDC1 : FPLoad<0x35, "ldc1", load, AFGR64, mem>;
+ def SDC1 : FPStore<0x3d, "sdc1", store, AFGR64, mem>;
+ }
+}
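+
+// Note (illustrative): the _P8 defs above are the N64-ABI variants, where
+// pointers are 64-bit and hence take a mem64 operand; the NotN64 defs keep
+// the 32-bit mem operand and use HasMips64/NotMips64 to choose between the
+// FGR64 and AFGR64 forms of ldc1/sdc1.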
/// Floating-point Arithmetic
-defm FADD : FFR1_4<0x10, "add", fadd, 1>;
-defm FDIV : FFR1_4<0x03, "div", fdiv>;
-defm FMUL : FFR1_4<0x02, "mul", fmul, 1>;
-defm FSUB : FFR1_4<0x01, "sub", fsub>;
+defm FADD : FFR2P_M<0x10, "add", fadd, 1>;
+defm FDIV : FFR2P_M<0x03, "div", fdiv>;
+defm FMUL : FFR2P_M<0x02, "mul", fmul, 1>;
+defm FSUB : FFR2P_M<0x01, "sub", fsub>;
//===----------------------------------------------------------------------===//
// Floating Point Branch Codes
@@ -217,8 +215,6 @@ defm FSUB : FFR1_4<0x01, "sub", fsub>;
// They must be kept in synch.
def MIPS_BRANCH_F : PatLeaf<(i32 0)>;
def MIPS_BRANCH_T : PatLeaf<(i32 1)>;
-def MIPS_BRANCH_FL : PatLeaf<(i32 2)>;
-def MIPS_BRANCH_TL : PatLeaf<(i32 3)>;
/// Floating Point Branch of False/True (Likely)
let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in
@@ -228,8 +224,6 @@ let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in
def BC1F : FBRANCH<MIPS_BRANCH_F, "bc1f">;
def BC1T : FBRANCH<MIPS_BRANCH_T, "bc1t">;
-def BC1FL : FBRANCH<MIPS_BRANCH_FL, "bc1fl">;
-def BC1TL : FBRANCH<MIPS_BRANCH_TL, "bc1tl">;
//===----------------------------------------------------------------------===//
// Floating Point Flag Conditions
@@ -254,7 +248,7 @@ def MIPS_FCOND_LE : PatLeaf<(i32 14)>;
def MIPS_FCOND_NGT : PatLeaf<(i32 15)>;
/// Floating Point Compare
-let hasDelaySlot = 1, Defs=[FCR31] in {
+let Defs=[FCR31] in {
def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc),
"c.$cc.s\t$fs, $ft",
[(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc)]>;
@@ -262,7 +256,7 @@ let hasDelaySlot = 1, Defs=[FCR31] in {
def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc),
"c.$cc.d\t$fs, $ft",
[(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc)]>,
- Requires<[In32BitMode]>;
+ Requires<[NotFP64bit]>;
}
@@ -280,7 +274,7 @@ class CondMovIntFP<RegisterClass RC, bits<5> fmt, bits<6> func,
def MOVZ_S : CondMovIntFP<FGR32, 16, 18, "movz.s">;
def MOVN_S : CondMovIntFP<FGR32, 16, 19, "movn.s">;
-let Predicates = [In32BitMode] in {
+let Predicates = [NotFP64bit] in {
def MOVZ_D : CondMovIntFP<AFGR64, 17, 18, "movz.d">;
def MOVN_D : CondMovIntFP<AFGR64, 17, 19, "movn.d">;
}
@@ -288,7 +282,7 @@ let Predicates = [In32BitMode] in {
defm : MovzPats<FGR32, MOVZ_S>;
defm : MovnPats<FGR32, MOVN_S>;
-let Predicates = [In32BitMode] in {
+let Predicates = [NotFP64bit] in {
defm : MovzPats<AFGR64, MOVZ_D>;
defm : MovnPats<AFGR64, MOVN_D>;
}
@@ -313,7 +307,7 @@ def MOVF : CondMovFPInt<MipsCMovFP_F, 0, "movf">;
def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">;
def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">;
-let Predicates = [In32BitMode] in {
+let Predicates = [NotFP64bit] in {
def MOVT_D : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">;
def MOVF_D : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">;
}
@@ -353,22 +347,16 @@ def fpimm0neg : PatLeaf<(fpimm), [{
}]>;
def : Pat<(f32 fpimm0), (MTC1 ZERO)>;
-def : Pat<(f32 fpimm0neg), (FNEG_S32 (MTC1 ZERO))>;
+def : Pat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
-def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVTS_W32 (MTC1 CPURegs:$src))>;
-def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVTD_W32 (MTC1 CPURegs:$src))>;
+def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>;
+def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVT_D32_W (MTC1 CPURegs:$src))>;
-def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>;
+def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>;
def : Pat<(i32 (fp_to_sint AFGR64:$src)), (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
-def : Pat<(i32 (bitconvert FGR32:$src)), (MFC1 FGR32:$src)>;
-def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>;
-
-let Predicates = [In32BitMode] in {
- def : Pat<(f32 (fround AFGR64:$src)), (CVTS_D32 AFGR64:$src)>;
- def : Pat<(f64 (fextend FGR32:$src)), (CVTD_S32 FGR32:$src)>;
+let Predicates = [NotFP64bit] in {
+ def : Pat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>;
+ def : Pat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>;
}
-// MipsFPRound is only emitted for MipsI targets.
-def : Pat<(f32 (MipsFPRound AFGR64:$src)), (CVTW_D32 AFGR64:$src)>;
-
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 9f55fb3..d246a26 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -44,7 +44,9 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
// Mips Pseudo Instructions Format
class MipsPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
- MipsInst<outs, ins, asmstr, pattern, IIPseudo>;
+ MipsInst<outs, ins, asmstr, pattern, IIPseudo> {
+ let isPseudo = 1;
+}
//===----------------------------------------------------------------------===//
// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
@@ -88,6 +90,21 @@ class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
let Inst{15-0} = imm16;
}
+class CBranchBase<bits<6> op, dag outs, dag ins, string asmstr,
+ list<dag> pattern, InstrItinClass itin>:
+ MipsInst<outs, ins, asmstr, pattern, itin>
+{
+ bits<5> rs;
+ bits<5> rt;
+ bits<16> imm16;
+
+ let opcode = op;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-0} = imm16;
+}
+
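CBranchBase pins down the standard MIPS I-format layout: opcode in bits 31-26, rs in 25-21, rt in 20-16, and the 16-bit immediate in 15-0, exactly as the Inst{...} assignments spell out. A standalone sketch of that field packing (plain C++, not LLVM's encoder):

    #include <cstdint>

    // Pack an I-format MIPS word: |opcode|rs|rt|imm16|.
    uint32_t encodeIFormat(uint32_t opcode, uint32_t rs, uint32_t rt,
                           uint32_t imm16) {
      return (opcode & 0x3f) << 26 | (rs & 0x1f) << 21 |
             (rt & 0x1f) << 16 | (imm16 & 0xffff);
    }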
//===----------------------------------------------------------------------===//
// Format J instruction class in Mips : <|opcode|address|>
//===----------------------------------------------------------------------===//
@@ -224,4 +241,27 @@ class FFCMOV<bits<5> _fmt, bits<1> _tf, dag outs, dag ins, string asmstr,
let Inst{15-11} = fs;
let Inst{10-6} = fd;
let Inst{5-0} = 17;
-}
\ No newline at end of file
+}
+
+// FP unary instructions without patterns.
+class FFR1<bits<6> funct, bits<5> fmt, string opstr, string fmtstr,
+ RegisterClass DstRC, RegisterClass SrcRC> :
+ FFR<0x11, funct, fmt, (outs DstRC:$fd), (ins SrcRC:$fs),
+ !strconcat(opstr, ".", fmtstr, "\t$fd, $fs"), []> {
+ let ft = 0;
+}
+
+// FP unary instructions with patterns.
+class FFR1P<bits<6> funct, bits<5> fmt, string opstr, string fmtstr,
+ RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode> :
+ FFR<0x11, funct, fmt, (outs DstRC:$fd), (ins SrcRC:$fs),
+ !strconcat(opstr, ".", fmtstr, "\t$fd, $fs"),
+ [(set DstRC:$fd, (OpNode SrcRC:$fs))]> {
+ let ft = 0;
+}
+
+class FFR2P<bits<6> funct, bits<5> fmt, string opstr,
+ string fmtstr, RegisterClass RC, SDNode OpNode> :
+ FFR<0x11, funct, fmt, (outs RC:$fd), (ins RC:$fs, RC:$ft),
+ !strconcat(opstr, ".", fmtstr, "\t$fd, $fs, $ft"),
+ [(set RC:$fd, (OpNode RC:$fs, RC:$ft))]>;
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 0a7a7f2..559943a 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -17,8 +17,8 @@
#include "InstPrinter/MipsInstPrinter.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/STLExtras.h"
#define GET_INSTRINFO_CTOR
@@ -28,7 +28,8 @@ using namespace llvm;
MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm)
: MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
- TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+ TM(tm), IsN64(TM.getSubtarget<MipsSubtarget>().isABI_N64()),
+ RI(*TM.getSubtargetImpl(), *this) {}
const MipsRegisterInfo &MipsInstrInfo::getRegisterInfo() const {
@@ -47,8 +48,12 @@ static bool isZeroImm(const MachineOperand &op) {
unsigned MipsInstrInfo::
isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
{
- if ((MI->getOpcode() == Mips::LW) || (MI->getOpcode() == Mips::LWC1) ||
- (MI->getOpcode() == Mips::LDC1)) {
+ unsigned Opc = MI->getOpcode();
+
+ if ((Opc == Mips::LW) || (Opc == Mips::LW_P8) || (Opc == Mips::LD) ||
+ (Opc == Mips::LD_P8) || (Opc == Mips::LWC1) || (Opc == Mips::LWC1_P8) ||
+ (Opc == Mips::LDC1) || (Opc == Mips::LDC164) ||
+ (Opc == Mips::LDC164_P8)) {
if ((MI->getOperand(1).isFI()) && // is a stack slot
(MI->getOperand(2).isImm()) && // the imm is zero
(isZeroImm(MI->getOperand(2)))) {
@@ -68,8 +73,12 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
unsigned MipsInstrInfo::
isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
{
- if ((MI->getOpcode() == Mips::SW) || (MI->getOpcode() == Mips::SWC1) ||
- (MI->getOpcode() == Mips::SDC1)) {
+ unsigned Opc = MI->getOpcode();
+
+ if ((Opc == Mips::SW) || (Opc == Mips::SW_P8) || (Opc == Mips::SD) ||
+ (Opc == Mips::SD_P8) || (Opc == Mips::SWC1) || (Opc == Mips::SWC1_P8) ||
+ (Opc == Mips::SDC1) || (Opc == Mips::SDC164) ||
+ (Opc == Mips::SDC164_P8)) {
if ((MI->getOperand(1).isFI()) && // is a stack slot
(MI->getOperand(2).isImm()) && // the imm is zero
(isZeroImm(MI->getOperand(2)))) {
@@ -94,70 +103,63 @@ copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
- bool DestCPU = Mips::CPURegsRegClass.contains(DestReg);
- bool SrcCPU = Mips::CPURegsRegClass.contains(SrcReg);
-
- // CPU-CPU is the most common.
- if (DestCPU && SrcCPU) {
- BuildMI(MBB, I, DL, get(Mips::ADDu), DestReg).addReg(Mips::ZERO)
- .addReg(SrcReg, getKillRegState(KillSrc));
- return;
- }
+ unsigned Opc = 0, ZeroReg = 0;
- // Copy to CPU from other registers.
- if (DestCPU) {
- if (Mips::CCRRegClass.contains(SrcReg))
- BuildMI(MBB, I, DL, get(Mips::CFC1), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
+ if (Mips::CPURegsRegClass.contains(SrcReg))
+ Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
+ else if (Mips::CCRRegClass.contains(SrcReg))
+ Opc = Mips::CFC1;
else if (Mips::FGR32RegClass.contains(SrcReg))
- BuildMI(MBB, I, DL, get(Mips::MFC1), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ Opc = Mips::MFC1;
else if (SrcReg == Mips::HI)
- BuildMI(MBB, I, DL, get(Mips::MFHI), DestReg);
+ Opc = Mips::MFHI, SrcReg = 0;
else if (SrcReg == Mips::LO)
- BuildMI(MBB, I, DL, get(Mips::MFLO), DestReg);
- else
- llvm_unreachable("Copy to CPU from invalid register");
- return;
+ Opc = Mips::MFLO, SrcReg = 0;
}
-
- // Copy to other registers from CPU.
- if (SrcCPU) {
+ else if (Mips::CPURegsRegClass.contains(SrcReg)) { // Copy from CPU Reg.
if (Mips::CCRRegClass.contains(DestReg))
- BuildMI(MBB, I, DL, get(Mips::CTC1), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ Opc = Mips::CTC1;
else if (Mips::FGR32RegClass.contains(DestReg))
- BuildMI(MBB, I, DL, get(Mips::MTC1), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ Opc = Mips::MTC1;
else if (DestReg == Mips::HI)
- BuildMI(MBB, I, DL, get(Mips::MTHI))
- .addReg(SrcReg, getKillRegState(KillSrc));
+ Opc = Mips::MTHI, DestReg = 0;
else if (DestReg == Mips::LO)
- BuildMI(MBB, I, DL, get(Mips::MTLO))
- .addReg(SrcReg, getKillRegState(KillSrc));
- else
- llvm_unreachable("Copy from CPU to invalid register");
- return;
+ Opc = Mips::MTLO, DestReg = 0;
}
-
- if (Mips::FGR32RegClass.contains(DestReg, SrcReg)) {
- BuildMI(MBB, I, DL, get(Mips::FMOV_S32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- return;
+ else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_S;
+ else if (Mips::AFGR64RegClass.contains(DestReg, SrcReg))
+ Opc = Mips::FMOV_D32;
+ else if (Mips::CCRRegClass.contains(DestReg, SrcReg))
+ Opc = Mips::MOVCCRToCCR;
+ else if (Mips::CPU64RegsRegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+ if (Mips::CPU64RegsRegClass.contains(SrcReg))
+ Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
+ else if (SrcReg == Mips::HI64)
+ Opc = Mips::MFHI64, SrcReg = 0;
+ else if (SrcReg == Mips::LO64)
+ Opc = Mips::MFLO64, SrcReg = 0;
}
-
- if (Mips::AFGR64RegClass.contains(DestReg, SrcReg)) {
- BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- return;
+ else if (Mips::CPU64RegsRegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+ if (DestReg == Mips::HI64)
+ Opc = Mips::MTHI64, DestReg = 0;
+ else if (DestReg == Mips::LO64)
+ Opc = Mips::MTLO64, DestReg = 0;
}
- if (Mips::CCRRegClass.contains(DestReg, SrcReg)) {
- BuildMI(MBB, I, DL, get(Mips::MOVCCRToCCR), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- return;
- }
- llvm_unreachable("Cannot copy registers");
+ assert(Opc && "Cannot copy registers");
+
+ MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc));
+
+ if (DestReg)
+ MIB.addReg(DestReg, RegState::Define);
+
+ if (ZeroReg)
+ MIB.addReg(ZeroReg);
+
+ if (SrcReg)
+ MIB.addReg(SrcReg, getKillRegState(KillSrc));
}
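The rewritten copyPhysReg collapses the per-class BuildMI calls into one "classify, then emit" step: each case now only chooses an opcode (plus, for GPR-to-GPR copies, a zero register), and a DestReg/SrcReg of 0 marks an operand the chosen instruction does not take explicitly (the HI/LO moves). A simplified standalone model of that control flow, with stand-in types rather than the LLVM API:

    #include <cassert>

    enum Opc { NoOpc, ADDu, MFHI, MFLO /* remaining opcodes elided */ };

    struct CopyPlan {
      Opc opc;
      bool explicitDest, useZero, explicitSrc;
    };

    // Same shape as copyPhysReg above: decide everything first, build once.
    CopyPlan classifyCopy(bool destIsCPU, bool srcIsCPU, bool srcIsHI) {
      CopyPlan p{NoOpc, true, false, true};
      if (destIsCPU && srcIsCPU)     { p.opc = ADDu; p.useZero = true; }
      else if (destIsCPU && srcIsHI) { p.opc = MFHI; p.explicitSrc = false; }
      // ... other register-class pairs elided ...
      assert(p.opc != NoOpc && "Cannot copy registers");
      return p;
    }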
void MipsInstrInfo::
@@ -167,31 +169,22 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
+ unsigned Opc = 0;
if (RC == Mips::CPURegsRegisterClass)
- BuildMI(MBB, I, DL, get(Mips::SW)).addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
+ Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
+ else if (RC == Mips::CPU64RegsRegisterClass)
+ Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
else if (RC == Mips::FGR32RegisterClass)
- BuildMI(MBB, I, DL, get(Mips::SWC1)).addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
- else if (RC == Mips::AFGR64RegisterClass) {
- if (!TM.getSubtarget<MipsSubtarget>().isMips1()) {
- BuildMI(MBB, I, DL, get(Mips::SDC1))
- .addReg(SrcReg, getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
- } else {
- const TargetRegisterInfo *TRI =
- MBB.getParent()->getTarget().getRegisterInfo();
- const unsigned *SubSet = TRI->getSubRegisters(SrcReg);
- BuildMI(MBB, I, DL, get(Mips::SWC1))
- .addReg(SubSet[0], getKillRegState(isKill))
- .addFrameIndex(FI).addImm(0);
- BuildMI(MBB, I, DL, get(Mips::SWC1))
- .addReg(SubSet[1], getKillRegState(isKill))
- .addFrameIndex(FI).addImm(4);
- }
- } else
- llvm_unreachable("Register class not handled!");
+ Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
+ else if (RC == Mips::AFGR64RegisterClass)
+ Opc = Mips::SDC1;
+ else if (RC == Mips::FGR64RegisterClass)
+ Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
+
+ assert(Opc && "Register class not handled!");
+ BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0);
}
void MipsInstrInfo::
@@ -202,25 +195,21 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
{
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
+ unsigned Opc = 0;
if (RC == Mips::CPURegsRegisterClass)
- BuildMI(MBB, I, DL, get(Mips::LW), DestReg).addFrameIndex(FI).addImm(0);
+ Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
+ else if (RC == Mips::CPU64RegsRegisterClass)
+ Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
else if (RC == Mips::FGR32RegisterClass)
- BuildMI(MBB, I, DL, get(Mips::LWC1), DestReg).addFrameIndex(FI).addImm(0);
- else if (RC == Mips::AFGR64RegisterClass) {
- if (!TM.getSubtarget<MipsSubtarget>().isMips1()) {
- BuildMI(MBB, I, DL, get(Mips::LDC1), DestReg).addFrameIndex(FI).addImm(0);
- } else {
- const TargetRegisterInfo *TRI =
- MBB.getParent()->getTarget().getRegisterInfo();
- const unsigned *SubSet = TRI->getSubRegisters(DestReg);
- BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[0])
- .addFrameIndex(FI).addImm(0);
- BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[1])
- .addFrameIndex(FI).addImm(4);
- }
- } else
- llvm_unreachable("Register class not handled!");
+ Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
+ else if (RC == Mips::AFGR64RegisterClass)
+ Opc = Mips::LDC1;
+ else if (RC == Mips::FGR64RegisterClass)
+ Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
+
+ assert(Opc && "Register class not handled!");
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(0);
}
MachineInstr*
@@ -237,9 +226,12 @@ MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
//===----------------------------------------------------------------------===//
static unsigned GetAnalyzableBrOpc(unsigned Opc) {
- return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ ||
- Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
- Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::J) ? Opc : 0;
+ return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ ||
+ Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
+ Opc == Mips::BEQ64 || Opc == Mips::BNE64 || Opc == Mips::BGTZ64 ||
+ Opc == Mips::BGEZ64 || Opc == Mips::BLTZ64 || Opc == Mips::BLEZ64 ||
+ Opc == Mips::BC1T || Opc == Mips::BC1F || Opc == Mips::J) ?
+ Opc : 0;
}
/// GetOppositeBranchOpc - Return the inverse of the specified
@@ -248,14 +240,20 @@ unsigned Mips::GetOppositeBranchOpc(unsigned Opc)
{
switch (Opc) {
default: llvm_unreachable("Illegal opcode!");
- case Mips::BEQ : return Mips::BNE;
- case Mips::BNE : return Mips::BEQ;
- case Mips::BGTZ : return Mips::BLEZ;
- case Mips::BGEZ : return Mips::BLTZ;
- case Mips::BLTZ : return Mips::BGEZ;
- case Mips::BLEZ : return Mips::BGTZ;
- case Mips::BC1T : return Mips::BC1F;
- case Mips::BC1F : return Mips::BC1T;
+ case Mips::BEQ : return Mips::BNE;
+ case Mips::BNE : return Mips::BEQ;
+ case Mips::BGTZ : return Mips::BLEZ;
+ case Mips::BGEZ : return Mips::BLTZ;
+ case Mips::BLTZ : return Mips::BGEZ;
+ case Mips::BLEZ : return Mips::BGTZ;
+ case Mips::BEQ64 : return Mips::BNE64;
+ case Mips::BNE64 : return Mips::BEQ64;
+ case Mips::BGTZ64 : return Mips::BLEZ64;
+ case Mips::BGEZ64 : return Mips::BLTZ64;
+ case Mips::BLTZ64 : return Mips::BGEZ64;
+ case Mips::BLEZ64 : return Mips::BGTZ64;
+ case Mips::BC1T : return Mips::BC1F;
+ case Mips::BC1F : return Mips::BC1T;
}
}
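One property the extended table keeps: branch inversion is an involution, so applying GetOppositeBranchOpc twice returns the original opcode (BEQ/BNE, BGTZ/BLEZ, BGEZ/BLTZ, in both 32- and 64-bit flavors, plus BC1T/BC1F). A tiny standalone check of the idea on a stand-in enum, not the real Mips:: opcodes:

    #include <cassert>
    #include <initializer_list>

    enum Br { BEQ, BNE, BGTZ, BLEZ, BGEZ, BLTZ };

    Br opposite(Br b) {
      switch (b) {
      case BEQ:  return BNE;   case BNE:  return BEQ;
      case BGTZ: return BLEZ;  case BLEZ: return BGTZ;
      case BGEZ: return BLTZ;  case BLTZ: return BGEZ;
      }
      return BEQ; // unreachable
    }

    int main() {
      for (Br b : {BEQ, BNE, BGTZ, BLEZ, BGEZ, BLTZ})
        assert(opposite(opposite(b)) == b); // inversion undoes itself
    }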
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 4421c48..271d248 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -72,12 +72,47 @@ namespace MipsII {
/// MO_TPREL_HI/LO - Represents the hi and low part of the offset from
// the thread pointer (Local Exec TLS).
MO_TPREL_HI,
- MO_TPREL_LO
+ MO_TPREL_LO,
+
+ // N32/64 Flags.
+ MO_GPOFF_HI,
+ MO_GPOFF_LO,
+ MO_GOT_DISP,
+ MO_GOT_PAGE,
+ MO_GOT_OFST
+ };
+
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for
+ // Mips instructions.
+ //
+
+ // Pseudo - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// FrmR - This form is for instructions of the format R.
+ FrmR = 1,
+ /// FrmI - This form is for instructions of the format I.
+ FrmI = 2,
+ /// FrmJ - This form is for instructions of the format J.
+ FrmJ = 3,
+ /// FrmFR - This form is for instructions of the format FR.
+ FrmFR = 4,
+ /// FrmFI - This form is for instructions of the format FI.
+ FrmFI = 5,
+ /// FrmOther - This form is for instructions that have no specific format.
+ FrmOther = 6,
+
+ FormMask = 15
};
}
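FormMask = 15 indicates the instruction's form is meant to live in the low four bits of its target-specific flag word, to be recovered with a mask. A hypothetical accessor written under that assumption; neither the function nor any flags layout beyond the mask is taken from the source:

    #include <cstdint>

    // Hypothetical: extract the MipsII format (FrmR, FrmI, ...) from the
    // low bits of a TSFlags-style word, per the FormMask = 15 above.
    unsigned getInstFormat(uint64_t tsFlags) {
      return static_cast<unsigned>(tsFlags & 15); // FormMask
    }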
class MipsInstrInfo : public MipsGenInstrInfo {
MipsTargetMachine &TM;
+ bool IsN64;
const MipsRegisterInfo RI;
public:
explicit MipsInstrInfo(MipsTargetMachine &TM);
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index d1a0587..06b7de7 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -34,13 +34,20 @@ def SDT_MipsMAddMSub : SDTypeProfile<0, 4,
SDTCisSameAs<1, 2>,
SDTCisSameAs<2, 3>]>;
def SDT_MipsDivRem : SDTypeProfile<0, 2,
- [SDTCisVT<0, i32>,
+ [SDTCisInt<0>,
SDTCisSameAs<0, 1>]>;
def SDT_MipsThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
def SDT_MipsDynAlloc : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
SDTCisVT<1, iPTR>]>;
+def SDT_Sync : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+
+def SDT_Ext : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>, SDTCisSameAs<2, 3>]>;
+def SDT_Ins : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>, SDTCisSameAs<2, 3>,
+ SDTCisSameAs<0, 4>]>;
// Call
def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
@@ -106,6 +113,11 @@ def MipsWrapperPIC : SDNode<"MipsISD::WrapperPIC", SDTIntUnaryOp>;
def MipsDynAlloc : SDNode<"MipsISD::DynAlloc", SDT_MipsDynAlloc,
[SDNPHasChain, SDNPInGlue]>;
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>;
+
+def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>;
+def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>;
+
//===----------------------------------------------------------------------===//
// Mips Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
@@ -113,8 +125,13 @@ def HasSEInReg : Predicate<"Subtarget.hasSEInReg()">;
def HasBitCount : Predicate<"Subtarget.hasBitCount()">;
def HasSwap : Predicate<"Subtarget.hasSwap()">;
def HasCondMov : Predicate<"Subtarget.hasCondMov()">;
-def IsMips32 : Predicate<"Subtarget.isMips32()">;
-def IsMips32r2 : Predicate<"Subtarget.isMips32r2()">;
+def HasMips32 : Predicate<"Subtarget.hasMips32()">;
+def HasMips32r2 : Predicate<"Subtarget.hasMips32r2()">;
+def HasMips64 : Predicate<"Subtarget.hasMips64()">;
+def NotMips64 : Predicate<"!Subtarget.hasMips64()">;
+def HasMips64r2 : Predicate<"Subtarget.hasMips64r2()">;
+def IsN64 : Predicate<"Subtarget.isABI_N64()">;
+def NotN64 : Predicate<"!Subtarget.isABI_N64()">;
//===----------------------------------------------------------------------===//
// Mips Operand, Complex Patterns and Transformations Definitions.
@@ -124,6 +141,7 @@ def IsMips32r2 : Predicate<"Subtarget.isMips32r2()">;
def brtarget : Operand<OtherVT>;
def calltarget : Operand<i32>;
def simm16 : Operand<i32>;
+def simm16_64 : Operand<i64>;
def shamt : Operand<i32>;
// Unsigned Operand
@@ -137,6 +155,11 @@ def mem : Operand<i32> {
let MIOperandInfo = (ops CPURegs, simm16);
}
+def mem64 : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops CPU64Regs, simm16_64);
+}
+
def mem_ea : Operand<i32> {
let PrintMethod = "printMemOperandEA";
let MIOperandInfo = (ops CPURegs, simm16);
@@ -177,36 +200,85 @@ def immZExt5 : PatLeaf<(imm), [{
def addr : ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], []>;
//===----------------------------------------------------------------------===//
+// Pattern fragments for load/store
+//===----------------------------------------------------------------------===//
+class UnalignedLoad<PatFrag Node> : PatFrag<(ops node:$ptr), (Node node:$ptr), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ return LD->getMemoryVT().getSizeInBits()/8 > LD->getAlignment();
+}]>;
+
+class AlignedLoad<PatFrag Node> : PatFrag<(ops node:$ptr), (Node node:$ptr), [{
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ return LD->getMemoryVT().getSizeInBits()/8 <= LD->getAlignment();
+}]>;
+
+class UnalignedStore<PatFrag Node> : PatFrag<(ops node:$val, node:$ptr),
+ (Node node:$val, node:$ptr), [{
+ StoreSDNode *SD = cast<StoreSDNode>(N);
+ return SD->getMemoryVT().getSizeInBits()/8 > SD->getAlignment();
+}]>;
+
+class AlignedStore<PatFrag Node> : PatFrag<(ops node:$val, node:$ptr),
+ (Node node:$val, node:$ptr), [{
+ StoreSDNode *SD = cast<StoreSDNode>(N);
+ return SD->getMemoryVT().getSizeInBits()/8 <= SD->getAlignment();
+}]>;
+
+// Load/Store PatFrags.
+def sextloadi16_a : AlignedLoad<sextloadi16>;
+def zextloadi16_a : AlignedLoad<zextloadi16>;
+def extloadi16_a : AlignedLoad<extloadi16>;
+def load_a : AlignedLoad<load>;
+def sextloadi32_a : AlignedLoad<sextloadi32>;
+def zextloadi32_a : AlignedLoad<zextloadi32>;
+def extloadi32_a : AlignedLoad<extloadi32>;
+def truncstorei16_a : AlignedStore<truncstorei16>;
+def store_a : AlignedStore<store>;
+def truncstorei32_a : AlignedStore<truncstorei32>;
+def sextloadi16_u : UnalignedLoad<sextloadi16>;
+def zextloadi16_u : UnalignedLoad<zextloadi16>;
+def extloadi16_u : UnalignedLoad<extloadi16>;
+def load_u : UnalignedLoad<load>;
+def sextloadi32_u : UnalignedLoad<sextloadi32>;
+def zextloadi32_u : UnalignedLoad<zextloadi32>;
+def extloadi32_u : UnalignedLoad<extloadi32>;
+def truncstorei16_u : UnalignedStore<truncstorei16>;
+def store_u : UnalignedStore<store>;
+def truncstorei32_u : UnalignedStore<truncstorei32>;
+
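The _a/_u fragment pairs split every memory operation on a single test: whether the access's natural width fits within the operand's known alignment. The C++ between the [{ }] braces is the entire predicate; restated over plain integers:

    // An access is "aligned" for the *_a patterns when its size in bytes
    // does not exceed the alignment of the address; otherwise the *_u
    // patterns (and the ulw/ulh-style pseudos) take over.
    bool isAlignedAccess(unsigned sizeInBits, unsigned alignInBytes) {
      return sizeInBits / 8 <= alignInBytes;
    }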
+//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
-// Arithmetic 3 register operands
-class ArithR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
- InstrItinClass itin, bit isComm = 0>:
- FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin> {
+// Arithmetic and logical instructions with 3 register operands.
+class ArithLogicR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
+ InstrItinClass itin, RegisterClass RC, bit isComm = 0>:
+ FR<op, func, (outs RC:$rd), (ins RC:$rs, RC:$rt),
+ !strconcat(instr_asm, "\t$rd, $rs, $rt"),
+ [(set RC:$rd, (OpNode RC:$rs, RC:$rt))], itin> {
+ let shamt = 0;
let isCommutable = isComm;
}
class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm,
- bit isComm = 0>:
- FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu> {
+ InstrItinClass itin, RegisterClass RC, bit isComm = 0>:
+ FR<op, func, (outs RC:$rd), (ins RC:$rs, RC:$rt),
+ !strconcat(instr_asm, "\t$rd, $rs, $rt"), [], itin> {
+ let shamt = 0;
let isCommutable = isComm;
}
-// Arithmetic 2 register operands
-class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
- Operand Od, PatLeaf imm_type> :
- FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (OpNode CPURegs:$b, imm_type:$c))], IIAlu>;
+// Arithmetic and logical instructions with 2 register operands.
+class ArithLogicI<bits<6> op, string instr_asm, SDNode OpNode,
+ Operand Od, PatLeaf imm_type, RegisterClass RC> :
+ FI<op, (outs RC:$rt), (ins RC:$rs, Od:$i),
+ !strconcat(instr_asm, "\t$rt, $rs, $i"),
+ [(set RC:$rt, (OpNode RC:$rs, imm_type:$i))], IIAlu>;
class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode,
- Operand Od, PatLeaf imm_type> :
- FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"), [], IIAlu>;
+ Operand Od, PatLeaf imm_type, RegisterClass RC> :
+ FI<op, (outs RC:$rt), (ins RC:$rs, Od:$i),
+ !strconcat(instr_asm, "\t$rt, $rs, $i"), [], IIAlu>;
// Arithmetic Multiply ADD/SUB
let rd = 0, shamt = 0, Defs = [HI, LO], Uses = [HI, LO] in
@@ -214,92 +286,134 @@ class MArithR<bits<6> func, string instr_asm, SDNode op, bit isComm = 0> :
FR<0x1c, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
!strconcat(instr_asm, "\t$rs, $rt"),
[(op CPURegs:$rs, CPURegs:$rt, LO, HI)], IIImul> {
+ let rd = 0;
+ let shamt = 0;
let isCommutable = isComm;
}
// Logical
-let isCommutable = 1 in
-class LogicR<bits<6> func, string instr_asm, SDNode OpNode>:
- FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu>;
-
-class LogicI<bits<6> op, string instr_asm, SDNode OpNode>:
- FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, uimm16:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))], IIAlu>;
-
-let isCommutable = 1 in
-class LogicNOR<bits<6> op, bits<6> func, string instr_asm>:
- FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))], IIAlu>;
+class LogicNOR<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>:
+ FR<op, func, (outs RC:$rd), (ins RC:$rs, RC:$rt),
+ !strconcat(instr_asm, "\t$rd, $rs, $rt"),
+ [(set RC:$rd, (not (or RC:$rs, RC:$rt)))], IIAlu> {
+ let shamt = 0;
+ let isCommutable = 1;
+}
// Shifts
class LogicR_shift_rotate_imm<bits<6> func, bits<5> _rs, string instr_asm,
SDNode OpNode>:
- FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$b, shamt:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt5:$c))], IIAlu> {
+ FR<0x00, func, (outs CPURegs:$rd), (ins CPURegs:$rt, shamt:$shamt),
+ !strconcat(instr_asm, "\t$rd, $rt, $shamt"),
+ [(set CPURegs:$rd, (OpNode CPURegs:$rt, (i32 immZExt5:$shamt)))], IIAlu> {
let rs = _rs;
}
-class LogicR_shift_rotate_reg<bits<6> func, bits<5> _shamt, string instr_asm,
+class LogicR_shift_rotate_reg<bits<6> func, bits<5> isRotate, string instr_asm,
SDNode OpNode>:
- FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$c, CPURegs:$b),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu> {
- let shamt = _shamt;
+ FR<0x00, func, (outs CPURegs:$rd), (ins CPURegs:$rs, CPURegs:$rt),
+ !strconcat(instr_asm, "\t$rd, $rt, $rs"),
+ [(set CPURegs:$rd, (OpNode CPURegs:$rt, CPURegs:$rs))], IIAlu> {
+ let shamt = isRotate;
}
// Load Upper Immediate
class LoadUpper<bits<6> op, string instr_asm>:
- FI< op,
- (outs CPURegs:$dst),
- (ins uimm16:$imm),
- !strconcat(instr_asm, "\t$dst, $imm"),
- [], IIAlu>;
+ FI<op, (outs CPURegs:$rt), (ins uimm16:$imm),
+ !strconcat(instr_asm, "\t$rt, $imm"), [], IIAlu> {
+ let rs = 0;
+}
// Memory Load/Store
-let canFoldAsLoad = 1, hasDelaySlot = 1 in
-class LoadM<bits<6> op, string instr_asm, PatFrag OpNode>:
- FI<op, (outs CPURegs:$dst), (ins mem:$addr),
- !strconcat(instr_asm, "\t$dst, $addr"),
- [(set CPURegs:$dst, (OpNode addr:$addr))], IILoad>;
+let canFoldAsLoad = 1 in
+class LoadM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC,
+ Operand MemOpnd, bit Pseudo>:
+ FI<op, (outs RC:$rt), (ins MemOpnd:$addr),
+ !strconcat(instr_asm, "\t$rt, $addr"),
+ [(set RC:$rt, (OpNode addr:$addr))], IILoad> {
+ let isPseudo = Pseudo;
+}
-class StoreM<bits<6> op, string instr_asm, PatFrag OpNode>:
- FI<op, (outs), (ins CPURegs:$dst, mem:$addr),
- !strconcat(instr_asm, "\t$dst, $addr"),
- [(OpNode CPURegs:$dst, addr:$addr)], IIStore>;
+class StoreM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC,
+ Operand MemOpnd, bit Pseudo>:
+ FI<op, (outs), (ins RC:$rt, MemOpnd:$addr),
+ !strconcat(instr_asm, "\t$rt, $addr"),
+ [(OpNode RC:$rt, addr:$addr)], IIStore> {
+ let isPseudo = Pseudo;
+}
+
+// 32-bit load.
+multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode,
+ bit Pseudo = 0> {
+ def #NAME# : LoadM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>,
+ Requires<[NotN64]>;
+ def _P8 : LoadM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>,
+ Requires<[IsN64]>;
+}
+
+// 64-bit load.
+multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode,
+ bit Pseudo = 0> {
+ def #NAME# : LoadM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>,
+ Requires<[NotN64]>;
+ def _P8 : LoadM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>,
+ Requires<[IsN64]>;
+}
+
+// 32-bit store.
+multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode,
+ bit Pseudo = 0> {
+ def #NAME# : StoreM<op, instr_asm, OpNode, CPURegs, mem, Pseudo>,
+ Requires<[NotN64]>;
+ def _P8 : StoreM<op, instr_asm, OpNode, CPURegs, mem64, Pseudo>,
+ Requires<[IsN64]>;
+}
+
+// 64-bit store.
+multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode,
+ bit Pseudo = 0> {
+ def #NAME# : StoreM<op, instr_asm, OpNode, CPU64Regs, mem, Pseudo>,
+ Requires<[NotN64]>;
+ def _P8 : StoreM<op, instr_asm, OpNode, CPU64Regs, mem64, Pseudo>,
+ Requires<[IsN64]>;
+}
// Conditional Branch
-let isBranch = 1, isTerminator=1, hasDelaySlot = 1 in {
-class CBranch<bits<6> op, string instr_asm, PatFrag cond_op>:
- FI<op, (outs), (ins CPURegs:$a, CPURegs:$b, brtarget:$offset),
- !strconcat(instr_asm, "\t$a, $b, $offset"),
- [(brcond (cond_op CPURegs:$a, CPURegs:$b), bb:$offset)],
- IIBranch>;
+class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>:
+ CBranchBase<op, (outs), (ins RC:$rs, RC:$rt, brtarget:$offset),
+ !strconcat(instr_asm, "\t$rs, $rt, $offset"),
+ [(brcond (i32 (cond_op RC:$rs, RC:$rt)), bb:$offset)], IIBranch> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
+}
-class CBranchZero<bits<6> op, string instr_asm, PatFrag cond_op>:
- FI<op, (outs), (ins CPURegs:$src, brtarget:$offset),
- !strconcat(instr_asm, "\t$src, $offset"),
- [(brcond (cond_op CPURegs:$src, 0), bb:$offset)],
- IIBranch>;
+class CBranchZero<bits<6> op, bits<5> _rt, string instr_asm, PatFrag cond_op,
+ RegisterClass RC>:
+ CBranchBase<op, (outs), (ins RC:$rs, brtarget:$offset),
+ !strconcat(instr_asm, "\t$rs, $offset"),
+ [(brcond (i32 (cond_op RC:$rs, 0)), bb:$offset)], IIBranch> {
+ let rt = _rt;
+ let isBranch = 1;
+ let isTerminator = 1;
+ let hasDelaySlot = 1;
}
// SetCC
-class SetCC_R<bits<6> op, bits<6> func, string instr_asm,
- PatFrag cond_op>:
- FR<op, func, (outs CPURegs:$dst), (ins CPURegs:$b, CPURegs:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (cond_op CPURegs:$b, CPURegs:$c))],
- IIAlu>;
+class SetCC_R<bits<6> op, bits<6> func, string instr_asm, PatFrag cond_op,
+ RegisterClass RC>:
+ FR<op, func, (outs CPURegs:$rd), (ins RC:$rs, RC:$rt),
+ !strconcat(instr_asm, "\t$rd, $rs, $rt"),
+ [(set CPURegs:$rd, (cond_op RC:$rs, RC:$rt))],
+ IIAlu> {
+ let shamt = 0;
+}
-class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op,
- Operand Od, PatLeaf imm_type>:
- FI<op, (outs CPURegs:$dst), (ins CPURegs:$b, Od:$c),
- !strconcat(instr_asm, "\t$dst, $b, $c"),
- [(set CPURegs:$dst, (cond_op CPURegs:$b, imm_type:$c))],
+class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op, Operand Od,
+ PatLeaf imm_type, RegisterClass RC>:
+ FI<op, (outs CPURegs:$rd), (ins RC:$rs, Od:$i),
+ !strconcat(instr_asm, "\t$rd, $rs, $i"),
+ [(set CPURegs:$rd, (cond_op RC:$rs, imm_type:$i))],
IIAlu>;
// Unconditional branch
@@ -310,8 +424,12 @@ class JumpFJ<bits<6> op, string instr_asm>:
let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1 in
class JumpFR<bits<6> op, bits<6> func, string instr_asm>:
- FR<op, func, (outs), (ins CPURegs:$target),
- !strconcat(instr_asm, "\t$target"), [(brind CPURegs:$target)], IIBranch>;
+ FR<op, func, (outs), (ins CPURegs:$rs),
+ !strconcat(instr_asm, "\t$rs"), [(brind CPURegs:$rs)], IIBranch> {
+ let rt = 0;
+ let rd = 0;
+ let shamt = 0;
+}
// Jump and Link (Call)
let isCall=1, hasDelaySlot=1,
@@ -323,76 +441,124 @@ let isCall=1, hasDelaySlot=1,
!strconcat(instr_asm, "\t$target"), [(MipsJmpLink imm:$target)],
IIBranch>;
- let rd=31 in
class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm>:
FR<op, func, (outs), (ins CPURegs:$rs, variable_ops),
- !strconcat(instr_asm, "\t$rs"), [(MipsJmpLink CPURegs:$rs)], IIBranch>;
+ !strconcat(instr_asm, "\t$rs"), [(MipsJmpLink CPURegs:$rs)], IIBranch> {
+ let rt = 0;
+ let rd = 31;
+ let shamt = 0;
+ }
class BranchLink<string instr_asm>:
FI<0x1, (outs), (ins CPURegs:$rs, brtarget:$target, variable_ops),
- !strconcat(instr_asm, "\t$rs, $target"), [], IIBranch>;
+ !strconcat(instr_asm, "\t$rs, $target"), [], IIBranch> {
+ let rt = 0;
+ }
}
// Mul, Div
-let Defs = [HI, LO] in {
- let isCommutable = 1 in
- class Mul<bits<6> func, string instr_asm, InstrItinClass itin>:
- FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b),
- !strconcat(instr_asm, "\t$a, $b"), [], itin>;
+class Mul<bits<6> func, string instr_asm, InstrItinClass itin>:
+ FR<0x00, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
+ !strconcat(instr_asm, "\t$rs, $rt"), [], itin> {
+ let rd = 0;
+ let shamt = 0;
+ let isCommutable = 1;
+ let Defs = [HI, LO];
+}
- class Div<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>:
- FR<0x00, func, (outs), (ins CPURegs:$a, CPURegs:$b),
- !strconcat(instr_asm, "\t$$zero, $a, $b"),
- [(op CPURegs:$a, CPURegs:$b)], itin>;
+class Div<SDNode op, bits<6> func, string instr_asm, InstrItinClass itin>:
+ FR<0x00, func, (outs), (ins CPURegs:$rs, CPURegs:$rt),
+ !strconcat(instr_asm, "\t$$zero, $rs, $rt"),
+ [(op CPURegs:$rs, CPURegs:$rt)], itin> {
+ let rd = 0;
+ let shamt = 0;
+ let Defs = [HI, LO];
}
// Move from Hi/Lo
class MoveFromLOHI<bits<6> func, string instr_asm>:
- FR<0x00, func, (outs CPURegs:$dst), (ins),
- !strconcat(instr_asm, "\t$dst"), [], IIHiLo>;
+ FR<0x00, func, (outs CPURegs:$rd), (ins),
+ !strconcat(instr_asm, "\t$rd"), [], IIHiLo> {
+ let rs = 0;
+ let rt = 0;
+ let shamt = 0;
+}
class MoveToLOHI<bits<6> func, string instr_asm>:
- FR<0x00, func, (outs), (ins CPURegs:$src),
- !strconcat(instr_asm, "\t$src"), [], IIHiLo>;
+ FR<0x00, func, (outs), (ins CPURegs:$rs),
+ !strconcat(instr_asm, "\t$rs"), [], IIHiLo> {
+ let rt = 0;
+ let rd = 0;
+ let shamt = 0;
+}
class EffectiveAddress<string instr_asm> :
- FI<0x09, (outs CPURegs:$dst), (ins mem_ea:$addr),
- instr_asm, [(set CPURegs:$dst, addr:$addr)], IIAlu>;
+ FI<0x09, (outs CPURegs:$rt), (ins mem_ea:$addr),
+ instr_asm, [(set CPURegs:$rt, addr:$addr)], IIAlu>;
// Count Leading Ones/Zeros in Word
class CountLeading<bits<6> func, string instr_asm, list<dag> pattern>:
- FR<0x1c, func, (outs CPURegs:$dst), (ins CPURegs:$src),
- !strconcat(instr_asm, "\t$dst, $src"), pattern, IIAlu>,
+ FR<0x1c, func, (outs CPURegs:$rd), (ins CPURegs:$rs),
+ !strconcat(instr_asm, "\t$rd, $rs"), pattern, IIAlu>,
Requires<[HasBitCount]> {
let shamt = 0;
let rt = rd;
}
// Sign Extend in Register.
-class SignExtInReg<bits<6> func, string instr_asm, ValueType vt>:
- FR<0x3f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
- !strconcat(instr_asm, "\t$dst, $src"),
- [(set CPURegs:$dst, (sext_inreg CPURegs:$src, vt))], NoItinerary>;
+class SignExtInReg<bits<5> sa, string instr_asm, ValueType vt>:
+ FR<0x3f, 0x20, (outs CPURegs:$rd), (ins CPURegs:$rt),
+ !strconcat(instr_asm, "\t$rd, $rt"),
+ [(set CPURegs:$rd, (sext_inreg CPURegs:$rt, vt))], NoItinerary> {
+ let rs = 0;
+ let shamt = sa;
+ let Predicates = [HasSEInReg];
+}
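The rewritten SignExtInReg hardwires funct 0x20 and folds the shamt/rs settings and the HasSEInReg predicate into the class, so the SEB/SEH definitions below shrink to one line each. Semantically, sext_inreg of i8/i16 is just a narrowing-then-widening cast; in C++ terms:

    #include <cstdint>

    // seb: sign-extend the low 8 bits of a GPR in place; seh: low 16 bits.
    int32_t seb(int32_t x) { return static_cast<int8_t>(x);  }
    int32_t seh(int32_t x) { return static_cast<int16_t>(x); }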
// Byte Swap
-class ByteSwap<bits<6> func, string instr_asm>:
- FR<0x1f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
- !strconcat(instr_asm, "\t$dst, $src"),
- [(set CPURegs:$dst, (bswap CPURegs:$src))], NoItinerary>;
-
-// Conditional Move
-class CondMov<bits<6> func, string instr_asm, PatLeaf MovCode>:
- FR<0x00, func, (outs CPURegs:$dst), (ins CPURegs:$F, CPURegs:$T,
- CPURegs:$cond), !strconcat(instr_asm, "\t$dst, $T, $cond"),
- [], NoItinerary>;
+class ByteSwap<bits<6> func, bits<5> sa, string instr_asm>:
+ FR<0x1f, func, (outs CPURegs:$rd), (ins CPURegs:$rt),
+ !strconcat(instr_asm, "\t$rd, $rt"),
+ [(set CPURegs:$rd, (bswap CPURegs:$rt))], NoItinerary> {
+ let rs = 0;
+ let shamt = sa;
+ let Predicates = [HasSwap];
+}
// Read Hardware
-class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$dst), (ins HWRegs:$src),
- "rdhwr\t$dst, $src", [], IIAlu> {
+class ReadHardware: FR<0x1f, 0x3b, (outs CPURegs:$rt), (ins HWRegs:$rd),
+ "rdhwr\t$rt, $rd", [], IIAlu> {
let rs = 0;
let shamt = 0;
}
+// Ext and Ins
+class ExtIns<bits<6> _funct, string instr_asm, dag outs, dag ins,
+ list<dag> pattern, InstrItinClass itin>:
+ FR<0x1f, _funct, outs, ins, !strconcat(instr_asm, " $rt, $rs, $pos, $sz"),
+ pattern, itin>, Requires<[HasMips32r2]> {
+ bits<5> pos;
+ bits<5> sz;
+ let rd = sz;
+ let shamt = pos;
+}
+
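Assuming the standard MIPS32r2 semantics for ext/ins (the class above only wires up the encoding: sz goes in the rd field, pos in shamt), MipsExt extracts a bit-field and MipsIns inserts one. A standalone sketch of those semantics:

    #include <cstdint>

    // ext rt, rs, pos, sz: take sz bits of rs starting at bit pos.
    uint32_t mipsExt(uint32_t rs, unsigned pos, unsigned sz) {
      uint32_t mask = sz < 32 ? (1u << sz) - 1 : ~0u;
      return (rs >> pos) & mask;
    }

    // ins rt, rs, pos, sz: replace sz bits of rt at bit pos with the low
    // sz bits of rs (hence the tied $src = $rt constraint on INS below).
    uint32_t mipsIns(uint32_t rt, uint32_t rs, unsigned pos, unsigned sz) {
      uint32_t mask = sz < 32 ? (1u << sz) - 1 : ~0u;
      return (rt & ~(mask << pos)) | ((rs & mask) << pos);
    }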
+// Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
+class Atomic2Ops<PatFrag Op, string Opstr> :
+ MipsPseudo<(outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
+ !strconcat("atomic_", Opstr, "\t$dst, $ptr, $incr"),
+ [(set CPURegs:$dst,
+ (Op CPURegs:$ptr, CPURegs:$incr))]>;
+
+// Atomic Compare & Swap.
+class AtomicCmpSwap<PatFrag Op, string Width> :
+ MipsPseudo<(outs CPURegs:$dst),
+ (ins CPURegs:$ptr, CPURegs:$cmp, CPURegs:$swap),
+ !strconcat("atomic_cmp_swap_", Width,
+ "\t$dst, $ptr, $cmp, $swap"),
+ [(set CPURegs:$dst,
+ (Op CPURegs:$ptr, CPURegs:$cmp, CPURegs:$swap))]>;
+
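AtomicCmpSwap's pattern pins down the value the pseudo produces: the classic compare-and-swap contract, with the real atomicity supplied later by the custom inserter (usesCustomInserter below; the LL/SC instructions it can build with appear further down in this patch). A deliberately non-atomic sketch of just the computed result:

    #include <cstdint>

    // Returns the old value; the store happens only on a match. Not
    // atomic as written; the target expansion relies on ll/sc for that.
    uint32_t cmpSwap32(uint32_t *ptr, uint32_t cmp, uint32_t swap) {
      uint32_t old = *ptr;
      if (old == cmp)
        *ptr = swap;
      return old;
    }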
//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//
@@ -427,112 +593,32 @@ def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>;
def CPRESTORE : MipsPseudo<(outs), (ins i32imm:$loc), ".cprestore\t$loc", []>;
let usesCustomInserter = 1 in {
- def ATOMIC_LOAD_ADD_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_add_8\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_add_8 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_ADD_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_add_16\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_add_16 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_ADD_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_add_32\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_add_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
- def ATOMIC_LOAD_SUB_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_sub_8\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_sub_8 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_SUB_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_sub_16\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_sub_16 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_SUB_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_sub_32\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_sub_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
- def ATOMIC_LOAD_AND_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_and_8\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_and_8 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_AND_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_and_16\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_and_16 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_AND_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_and_32\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_and_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
- def ATOMIC_LOAD_OR_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_or_8\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_or_8 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_OR_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_or_16\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_or_16 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_OR_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_or_32\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_or_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
- def ATOMIC_LOAD_XOR_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_xor_8\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_xor_8 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_XOR_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_xor_16\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_xor_16 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_XOR_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_xor_32\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_xor_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
- def ATOMIC_LOAD_NAND_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_nand_8\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_nand_8 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_NAND_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_nand_16\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_nand_16 CPURegs:$ptr, CPURegs:$incr))]>;
- def ATOMIC_LOAD_NAND_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$incr),
- "atomic_load_nand_32\t$dst, $ptr, $incr",
- [(set CPURegs:$dst, (atomic_load_nand_32 CPURegs:$ptr, CPURegs:$incr))]>;
-
- def ATOMIC_SWAP_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
- "atomic_swap_8\t$dst, $ptr, $val",
- [(set CPURegs:$dst, (atomic_swap_8 CPURegs:$ptr, CPURegs:$val))]>;
- def ATOMIC_SWAP_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
- "atomic_swap_16\t$dst, $ptr, $val",
- [(set CPURegs:$dst, (atomic_swap_16 CPURegs:$ptr, CPURegs:$val))]>;
- def ATOMIC_SWAP_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$val),
- "atomic_swap_32\t$dst, $ptr, $val",
- [(set CPURegs:$dst, (atomic_swap_32 CPURegs:$ptr, CPURegs:$val))]>;
-
- def ATOMIC_CMP_SWAP_I8 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
- "atomic_cmp_swap_8\t$dst, $ptr, $oldval, $newval",
- [(set CPURegs:$dst,
- (atomic_cmp_swap_8 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
- def ATOMIC_CMP_SWAP_I16 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
- "atomic_cmp_swap_16\t$dst, $ptr, $oldval, $newval",
- [(set CPURegs:$dst,
- (atomic_cmp_swap_16 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
- def ATOMIC_CMP_SWAP_I32 : MipsPseudo<
- (outs CPURegs:$dst), (ins CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval),
- "atomic_cmp_swap_32\t$dst, $ptr, $oldval, $newval",
- [(set CPURegs:$dst,
- (atomic_cmp_swap_32 CPURegs:$ptr, CPURegs:$oldval, CPURegs:$newval))]>;
+ def ATOMIC_LOAD_ADD_I8 : Atomic2Ops<atomic_load_add_8, "load_add_8">;
+ def ATOMIC_LOAD_ADD_I16 : Atomic2Ops<atomic_load_add_16, "load_add_16">;
+ def ATOMIC_LOAD_ADD_I32 : Atomic2Ops<atomic_load_add_32, "load_add_32">;
+ def ATOMIC_LOAD_SUB_I8 : Atomic2Ops<atomic_load_sub_8, "load_sub_8">;
+ def ATOMIC_LOAD_SUB_I16 : Atomic2Ops<atomic_load_sub_16, "load_sub_16">;
+ def ATOMIC_LOAD_SUB_I32 : Atomic2Ops<atomic_load_sub_32, "load_sub_32">;
+ def ATOMIC_LOAD_AND_I8 : Atomic2Ops<atomic_load_and_8, "load_and_8">;
+ def ATOMIC_LOAD_AND_I16 : Atomic2Ops<atomic_load_and_16, "load_and_16">;
+ def ATOMIC_LOAD_AND_I32 : Atomic2Ops<atomic_load_and_32, "load_and_32">;
+ def ATOMIC_LOAD_OR_I8 : Atomic2Ops<atomic_load_or_8, "load_or_8">;
+ def ATOMIC_LOAD_OR_I16 : Atomic2Ops<atomic_load_or_16, "load_or_16">;
+ def ATOMIC_LOAD_OR_I32 : Atomic2Ops<atomic_load_or_32, "load_or_32">;
+ def ATOMIC_LOAD_XOR_I8 : Atomic2Ops<atomic_load_xor_8, "load_xor_8">;
+ def ATOMIC_LOAD_XOR_I16 : Atomic2Ops<atomic_load_xor_16, "load_xor_16">;
+ def ATOMIC_LOAD_XOR_I32 : Atomic2Ops<atomic_load_xor_32, "load_xor_32">;
+ def ATOMIC_LOAD_NAND_I8 : Atomic2Ops<atomic_load_nand_8, "load_nand_8">;
+ def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, "load_nand_16">;
+ def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, "load_nand_32">;
+
+ def ATOMIC_SWAP_I8 : Atomic2Ops<atomic_swap_8, "swap_8">;
+ def ATOMIC_SWAP_I16 : Atomic2Ops<atomic_swap_16, "swap_16">;
+ def ATOMIC_SWAP_I32 : Atomic2Ops<atomic_swap_32, "swap_32">;
+
+ def ATOMIC_CMP_SWAP_I8 : AtomicCmpSwap<atomic_cmp_swap_8, "8">;
+ def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<atomic_cmp_swap_16, "16">;
+ def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<atomic_cmp_swap_32, "32">;
}
//===----------------------------------------------------------------------===//
@@ -544,26 +630,26 @@ let usesCustomInserter = 1 in {
//===----------------------------------------------------------------------===//
/// Arithmetic Instructions (ALU Immediate)
-def ADDiu : ArithI<0x09, "addiu", add, simm16, immSExt16>;
-def ADDi : ArithOverflowI<0x08, "addi", add, simm16, immSExt16>;
-def SLTi : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16>;
-def SLTiu : SetCC_I<0x0b, "sltiu", setult, simm16, immSExt16>;
-def ANDi : LogicI<0x0c, "andi", and>;
-def ORi : LogicI<0x0d, "ori", or>;
-def XORi : LogicI<0x0e, "xori", xor>;
+def ADDiu : ArithLogicI<0x09, "addiu", add, simm16, immSExt16, CPURegs>;
+def ADDi : ArithOverflowI<0x08, "addi", add, simm16, immSExt16, CPURegs>;
+def SLTi : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16, CPURegs>;
+def SLTiu : SetCC_I<0x0b, "sltiu", setult, simm16, immSExt16, CPURegs>;
+def ANDi : ArithLogicI<0x0c, "andi", and, uimm16, immZExt16, CPURegs>;
+def ORi : ArithLogicI<0x0d, "ori", or, uimm16, immZExt16, CPURegs>;
+def XORi : ArithLogicI<0x0e, "xori", xor, uimm16, immZExt16, CPURegs>;
def LUi : LoadUpper<0x0f, "lui">;
/// Arithmetic Instructions (3-Operand, R-Type)
-def ADDu : ArithR<0x00, 0x21, "addu", add, IIAlu, 1>;
-def SUBu : ArithR<0x00, 0x23, "subu", sub, IIAlu>;
-def ADD : ArithOverflowR<0x00, 0x20, "add", 1>;
-def SUB : ArithOverflowR<0x00, 0x22, "sub">;
-def SLT : SetCC_R<0x00, 0x2a, "slt", setlt>;
-def SLTu : SetCC_R<0x00, 0x2b, "sltu", setult>;
-def AND : LogicR<0x24, "and", and>;
-def OR : LogicR<0x25, "or", or>;
-def XOR : LogicR<0x26, "xor", xor>;
-def NOR : LogicNOR<0x00, 0x27, "nor">;
+def ADDu : ArithLogicR<0x00, 0x21, "addu", add, IIAlu, CPURegs, 1>;
+def SUBu : ArithLogicR<0x00, 0x23, "subu", sub, IIAlu, CPURegs>;
+def ADD : ArithOverflowR<0x00, 0x20, "add", IIAlu, CPURegs, 1>;
+def SUB : ArithOverflowR<0x00, 0x22, "sub", IIAlu, CPURegs>;
+def SLT : SetCC_R<0x00, 0x2a, "slt", setlt, CPURegs>;
+def SLTu : SetCC_R<0x00, 0x2b, "sltu", setult, CPURegs>;
+def AND : ArithLogicR<0x00, 0x24, "and", and, IIAlu, CPURegs, 1>;
+def OR : ArithLogicR<0x00, 0x25, "or", or, IIAlu, CPURegs, 1>;
+def XOR : ArithLogicR<0x00, 0x26, "xor", xor, IIAlu, CPURegs, 1>;
+def NOR : LogicNOR<0x00, 0x27, "nor", CPURegs>;
/// Shift Instructions
def SLL : LogicR_shift_rotate_imm<0x00, 0x00, "sll", shl>;
@@ -574,45 +660,58 @@ def SRLV : LogicR_shift_rotate_reg<0x06, 0x00, "srlv", srl>;
def SRAV : LogicR_shift_rotate_reg<0x07, 0x00, "srav", sra>;
// Rotate Instructions
-let Predicates = [IsMips32r2] in {
+let Predicates = [HasMips32r2] in {
def ROTR : LogicR_shift_rotate_imm<0x02, 0x01, "rotr", rotr>;
def ROTRV : LogicR_shift_rotate_reg<0x06, 0x01, "rotrv", rotr>;
}
/// Load and Store Instructions
-def LB : LoadM<0x20, "lb", sextloadi8>;
-def LBu : LoadM<0x24, "lbu", zextloadi8>;
-def LH : LoadM<0x21, "lh", sextloadi16>;
-def LHu : LoadM<0x25, "lhu", zextloadi16>;
-def LW : LoadM<0x23, "lw", load>;
-def SB : StoreM<0x28, "sb", truncstorei8>;
-def SH : StoreM<0x29, "sh", truncstorei16>;
-def SW : StoreM<0x2b, "sw", store>;
+/// aligned
+defm LB : LoadM32<0x20, "lb", sextloadi8>;
+defm LBu : LoadM32<0x24, "lbu", zextloadi8>;
+defm LH : LoadM32<0x21, "lh", sextloadi16_a>;
+defm LHu : LoadM32<0x25, "lhu", zextloadi16_a>;
+defm LW : LoadM32<0x23, "lw", load_a>;
+defm SB : StoreM32<0x28, "sb", truncstorei8>;
+defm SH : StoreM32<0x29, "sh", truncstorei16_a>;
+defm SW : StoreM32<0x2b, "sw", store_a>;
+
+/// unaligned
+defm ULH : LoadM32<0x21, "ulh", sextloadi16_u, 1>;
+defm ULHu : LoadM32<0x25, "ulhu", zextloadi16_u, 1>;
+defm ULW : LoadM32<0x23, "ulw", load_u, 1>;
+defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>;
+defm USW : StoreM32<0x2b, "usw", store_u, 1>;
+
+let hasSideEffects = 1 in
+def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype",
+ [(MipsSync imm:$stype)], NoItinerary>
+{
+ let opcode = 0;
+ let Inst{25-11} = 0;
+ let Inst{5-0} = 15;
+}
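The new SYNC definition is the MIPS memory barrier (a zero major opcode with funct 0xf), reached through the MipsSync node. For orientation, a sequentially consistent fence is the kind of source-level construct one would expect to lower through it; our example, not something from this patch:

    #include <atomic>

    // A full memory fence; on MIPS this is expected to come out as a
    // `sync` instruction via the MipsSync node defined above.
    void full_fence() {
      std::atomic_thread_fence(std::memory_order_seq_cst);
    }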
/// Load-linked, Store-conditional
-let hasDelaySlot = 1 in
+let mayLoad = 1 in
def LL : FI<0x30, (outs CPURegs:$dst), (ins mem:$addr),
"ll\t$dst, $addr", [], IILoad>;
-let Constraints = "$src = $dst" in
+let mayStore = 1, Constraints = "$src = $dst" in
def SC : FI<0x38, (outs CPURegs:$dst), (ins CPURegs:$src, mem:$addr),
"sc\t$src, $addr", [], IIStore>;
/// Jump and Branch Instructions
def J : JumpFJ<0x02, "j">;
-def JR : JumpFR<0x00, 0x08, "jr">;
+let isIndirectBranch = 1 in
+ def JR : JumpFR<0x00, 0x08, "jr">;
def JAL : JumpLink<0x03, "jal">;
def JALR : JumpLinkReg<0x00, 0x09, "jalr">;
-def BEQ : CBranch<0x04, "beq", seteq>;
-def BNE : CBranch<0x05, "bne", setne>;
-
-let rt=1 in
- def BGEZ : CBranchZero<0x01, "bgez", setge>;
-
-let rt=0 in {
- def BGTZ : CBranchZero<0x07, "bgtz", setgt>;
- def BLEZ : CBranchZero<0x07, "blez", setle>;
- def BLTZ : CBranchZero<0x01, "bltz", setlt>;
-}
+def BEQ : CBranch<0x04, "beq", seteq, CPURegs>;
+def BNE : CBranch<0x05, "bne", setne, CPURegs>;
+def BGEZ : CBranchZero<0x01, 1, "bgez", setge, CPURegs>;
+def BGTZ : CBranchZero<0x07, 0, "bgtz", setgt, CPURegs>;
+def BLEZ : CBranchZero<0x07, 0, "blez", setle, CPURegs>;
+def BLTZ : CBranchZero<0x01, 0, "bltz", setlt, CPURegs>;
def BGEZAL : BranchLink<"bgezal">;
def BLTZAL : BranchLink<"bltzal">;
@@ -639,40 +738,31 @@ let Uses = [LO] in
def MFLO : MoveFromLOHI<0x12, "mflo">;
/// Sign Ext In Register Instructions.
-let Predicates = [HasSEInReg] in {
- let shamt = 0x10, rs = 0 in
- def SEB : SignExtInReg<0x21, "seb", i8>;
-
- let shamt = 0x18, rs = 0 in
- def SEH : SignExtInReg<0x20, "seh", i16>;
-}
+def SEB : SignExtInReg<0x10, "seb", i8>;
+def SEH : SignExtInReg<0x18, "seh", i16>;
/// Count Leading
-def CLZ : CountLeading<0b100000, "clz",
- [(set CPURegs:$dst, (ctlz CPURegs:$src))]>;
-def CLO : CountLeading<0b100001, "clo",
- [(set CPURegs:$dst, (ctlz (not CPURegs:$src)))]>;
+def CLZ : CountLeading<0x20, "clz",
+ [(set CPURegs:$rd, (ctlz CPURegs:$rs))]>;
+def CLO : CountLeading<0x21, "clo",
+ [(set CPURegs:$rd, (ctlz (not CPURegs:$rs)))]>;
/// Byte Swap
-let Predicates = [HasSwap] in {
- let shamt = 0x3, rs = 0 in
- def WSBW : ByteSwap<0x20, "wsbw">;
-}
-
-/// Conditional Move
-def MIPS_CMOV_ZERO : PatLeaf<(i32 0)>;
-def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>;
+def WSBW : ByteSwap<0x20, 0x2, "wsbw">;
// Conditional moves:
// These instructions are expanded in
// MipsISelLowering::EmitInstrWithCustomInserter if the target does not have
// conditional move instructions.
// flag:int, data:int
-let usesCustomInserter = 1, shamt = 0, Constraints = "$F = $dst" in
- class CondMovIntInt<bits<6> funct, string instr_asm> :
- FR<0, funct, (outs CPURegs:$dst),
- (ins CPURegs:$T, CPURegs:$cond, CPURegs:$F),
- !strconcat(instr_asm, "\t$dst, $T, $cond"), [], NoItinerary>;
+class CondMovIntInt<bits<6> funct, string instr_asm> :
+ FR<0, funct, (outs CPURegs:$rd),
+ (ins CPURegs:$rs, CPURegs:$rt, CPURegs:$F),
+ !strconcat(instr_asm, "\t$rd, $rs, $rt"), [], NoItinerary> {
+ let shamt = 0;
+ let usesCustomInserter = 1;
+ let Constraints = "$F = $rd";
+}
def MOVZ_I : CondMovIntInt<0x0a, "movz">;
def MOVN_I : CondMovIntInt<0x0b, "movn">;
@@ -685,13 +775,13 @@ let addr=0 in
// instructions. The same does not happen for stack address copies, so an
// add op with mem ComplexPattern is used and the stack address copy
// can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : EffectiveAddress<"addiu\t$dst, $addr">;
+def LEA_ADDiu : EffectiveAddress<"addiu\t$rt, $addr">;
// DynAlloc node points to dynamically allocated stack space.
// $sp is added to the list of implicitly used registers to prevent dead code
// elimination from removing instructions that modify $sp.
let Uses = [SP] in
-def DynAlloc : EffectiveAddress<"addiu\t$dst, $addr">;
+def DynAlloc : EffectiveAddress<"addiu\t$rt, $addr">;
// MADD*/MSUB*
def MADD : MArithR<0, "madd", MipsMAdd, 1>;
@@ -701,10 +791,25 @@ def MSUBU : MArithR<5, "msubu", MipsMSubu>;
// MUL is an assembly macro in the currently used ISAs. In recent ISAs
// it is a real instruction.
-def MUL : ArithR<0x1c, 0x02, "mul", mul, IIImul, 1>, Requires<[IsMips32]>;
+def MUL : ArithLogicR<0x1c, 0x02, "mul", mul, IIImul, CPURegs, 1>,
+ Requires<[HasMips32]>;
def RDHWR : ReadHardware;
+def EXT : ExtIns<0, "ext", (outs CPURegs:$rt),
+ (ins CPURegs:$rs, uimm16:$pos, uimm16:$sz),
+ [(set CPURegs:$rt,
+ (MipsExt CPURegs:$rs, immZExt5:$pos, immZExt5:$sz))],
+ NoItinerary>;
+
+let Constraints = "$src = $rt" in
+def INS : ExtIns<4, "ins", (outs CPURegs:$rt),
+ (ins CPURegs:$rs, uimm16:$pos, uimm16:$sz, CPURegs:$src),
+ [(set CPURegs:$rt,
+ (MipsIns CPURegs:$rs, immZExt5:$pos, immZExt5:$sz,
+ CPURegs:$src))],
+ NoItinerary>;
+
//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
@@ -738,16 +843,20 @@ def : Pat<(MipsJmpLink (i32 texternalsym:$dst)),
// hi/lo relocs
def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
+def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>;
+def : Pat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>;
def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
(ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)),
(ADDiu CPURegs:$hi, tblockaddress:$lo)>;
def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>;
def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
(ADDiu CPURegs:$hi, tjumptable:$lo)>;
def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
(ADDiu CPURegs:$hi, tconstpool:$lo)>;
@@ -763,6 +872,7 @@ def : Pat<(add CPURegs:$gp, (MipsTlsGd tglobaltlsaddr:$in)),
// tprel hi/lo
def : Pat<(MipsTprelHi tglobaltlsaddr:$in), (LUi tglobaltlsaddr:$in)>;
+def : Pat<(MipsTprelLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
def : Pat<(add CPURegs:$hi, (MipsTprelLo tglobaltlsaddr:$lo)),
(ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
@@ -784,60 +894,67 @@ def : Pat<(not CPURegs:$in),
// extended load and stores
def : Pat<(extloadi1 addr:$src), (LBu addr:$src)>;
def : Pat<(extloadi8 addr:$src), (LBu addr:$src)>;
-def : Pat<(extloadi16 addr:$src), (LHu addr:$src)>;
+def : Pat<(extloadi16_a addr:$src), (LHu addr:$src)>;
+def : Pat<(extloadi16_u addr:$src), (ULHu addr:$src)>;
// peepholes
def : Pat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
// brcond patterns
-def : Pat<(brcond (setne CPURegs:$lhs, 0), bb:$dst),
- (BNE CPURegs:$lhs, ZERO, bb:$dst)>;
-def : Pat<(brcond (seteq CPURegs:$lhs, 0), bb:$dst),
- (BEQ CPURegs:$lhs, ZERO, bb:$dst)>;
-
-def : Pat<(brcond (setge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
- (BEQ (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setuge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
- (BEQ (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
- (BEQ (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setuge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
- (BEQ (SLTiu CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
-
-def : Pat<(brcond (setle CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
- (BEQ (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
-def : Pat<(brcond (setule CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
- (BEQ (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
-
-def : Pat<(brcond CPURegs:$cond, bb:$dst),
- (BNE CPURegs:$cond, ZERO, bb:$dst)>;
+multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
+ Instruction SLTOp, Instruction SLTuOp, Instruction SLTiOp,
+ Instruction SLTiuOp, Register ZEROReg> {
+def : Pat<(brcond (i32 (setne RC:$lhs, 0)), bb:$dst),
+ (BNEOp RC:$lhs, ZEROReg, bb:$dst)>;
+def : Pat<(brcond (i32 (seteq RC:$lhs, 0)), bb:$dst),
+ (BEQOp RC:$lhs, ZEROReg, bb:$dst)>;
+
+def : Pat<(brcond (i32 (setge RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQOp (SLTOp RC:$lhs, RC:$rhs), ZEROReg, bb:$dst)>;
+def : Pat<(brcond (i32 (setuge RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQOp (SLTuOp RC:$lhs, RC:$rhs), ZEROReg, bb:$dst)>;
+def : Pat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+          (BEQOp (SLTiOp RC:$lhs, immSExt16:$rhs), ZEROReg, bb:$dst)>;
+def : Pat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
+          (BEQOp (SLTiuOp RC:$lhs, immSExt16:$rhs), ZEROReg, bb:$dst)>;
+
+def : Pat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQOp (SLTOp RC:$rhs, RC:$lhs), ZEROReg, bb:$dst)>;
+def : Pat<(brcond (i32 (setule RC:$lhs, RC:$rhs)), bb:$dst),
+          (BEQOp (SLTuOp RC:$rhs, RC:$lhs), ZEROReg, bb:$dst)>;
+
+def : Pat<(brcond RC:$cond, bb:$dst),
+ (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
+}
+
+defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
// select patterns
multiclass MovzPats<RegisterClass RC, Instruction MOVZInst> {
- def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setge CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
(MOVZInst RC:$T, (SLT CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
- def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setuge CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
(MOVZInst RC:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
- def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setge CPURegs:$lhs, immSExt16:$rhs)), RC:$T, RC:$F),
(MOVZInst RC:$T, (SLTi CPURegs:$lhs, immSExt16:$rhs), RC:$F)>;
- def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setuge CPURegs:$lh, immSExt16:$rh)), RC:$T, RC:$F),
(MOVZInst RC:$T, (SLTiu CPURegs:$lh, immSExt16:$rh), RC:$F)>;
- def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setle CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
(MOVZInst RC:$T, (SLT CPURegs:$rhs, CPURegs:$lhs), RC:$F)>;
- def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setule CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
(MOVZInst RC:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs), RC:$F)>;
- def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+ def : Pat<(select (i32 (seteq CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
(MOVZInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
- def : Pat<(select (seteq CPURegs:$lhs, 0), RC:$T, RC:$F),
+ def : Pat<(select (i32 (seteq CPURegs:$lhs, 0)), RC:$T, RC:$F),
(MOVZInst RC:$T, CPURegs:$lhs, RC:$F)>;
}
multiclass MovnPats<RegisterClass RC, Instruction MOVNInst> {
- def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setne CPURegs:$lhs, CPURegs:$rhs)), RC:$T, RC:$F),
(MOVNInst RC:$T, (XOR CPURegs:$lhs, CPURegs:$rhs), RC:$F)>;
def : Pat<(select CPURegs:$cond, RC:$T, RC:$F),
(MOVNInst RC:$T, CPURegs:$cond, RC:$F)>;
- def : Pat<(select (setne CPURegs:$lhs, 0), RC:$T, RC:$F),
+ def : Pat<(select (i32 (setne CPURegs:$lhs, 0)), RC:$T, RC:$F),
(MOVNInst RC:$T, CPURegs:$lhs, RC:$F)>;
}
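These multiclasses lower selects onto the MOVZ/MOVN conditional moves: MOVZ copies its source into the destination only when the test register is zero (MOVN when it is nonzero), so an equality test can be fed through XOR, which yields zero exactly when its operands are equal. A small model of the seteq case (illustrative helpers, not backend code):

    #include <cassert>
    #include <cstdint>

    // MOVZ rd, rs, rt: rd gets rs if rt == 0, otherwise rd keeps its value.
    uint32_t movz(uint32_t rd, uint32_t rs, uint32_t rt) {
      return rt == 0 ? rs : rd;
    }

    // select (seteq a, b), T, F  ==>  MOVZ T, (XOR a, b), F
    uint32_t select_eq(uint32_t a, uint32_t b, uint32_t T, uint32_t F) {
      return movz(F, T, a ^ b); // F preloaded into the destination, then MOVZ
    }

    int main() {
      assert(select_eq(3, 3, 10, 20) == 10); // a == b: take T
      assert(select_eq(3, 4, 10, 20) == 20); // a != b: keep F
      return 0;
    }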
@@ -845,30 +962,48 @@ defm : MovzPats<CPURegs, MOVZ_I>;
defm : MovnPats<CPURegs, MOVN_I>;
// setcc patterns
-def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs),
- (SLTu (XOR CPURegs:$lhs, CPURegs:$rhs), 1)>;
-def : Pat<(setne CPURegs:$lhs, CPURegs:$rhs),
- (SLTu ZERO, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
-
-def : Pat<(setle CPURegs:$lhs, CPURegs:$rhs),
- (XORi (SLT CPURegs:$rhs, CPURegs:$lhs), 1)>;
-def : Pat<(setule CPURegs:$lhs, CPURegs:$rhs),
- (XORi (SLTu CPURegs:$rhs, CPURegs:$lhs), 1)>;
-
-def : Pat<(setgt CPURegs:$lhs, CPURegs:$rhs),
- (SLT CPURegs:$rhs, CPURegs:$lhs)>;
-def : Pat<(setugt CPURegs:$lhs, CPURegs:$rhs),
- (SLTu CPURegs:$rhs, CPURegs:$lhs)>;
-
-def : Pat<(setge CPURegs:$lhs, CPURegs:$rhs),
- (XORi (SLT CPURegs:$lhs, CPURegs:$rhs), 1)>;
-def : Pat<(setuge CPURegs:$lhs, CPURegs:$rhs),
- (XORi (SLTu CPURegs:$lhs, CPURegs:$rhs), 1)>;
-
-def : Pat<(setge CPURegs:$lhs, immSExt16:$rhs),
- (XORi (SLTi CPURegs:$lhs, immSExt16:$rhs), 1)>;
-def : Pat<(setuge CPURegs:$lhs, immSExt16:$rhs),
- (XORi (SLTiu CPURegs:$lhs, immSExt16:$rhs), 1)>;
+multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
+ Instruction SLTuOp, Register ZEROReg> {
+ def : Pat<(seteq RC:$lhs, RC:$rhs),
+ (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
+ def : Pat<(setne RC:$lhs, RC:$rhs),
+ (SLTuOp ZEROReg, (XOROp RC:$lhs, RC:$rhs))>;
+}
+
+multiclass SetlePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+ def : Pat<(setle RC:$lhs, RC:$rhs),
+ (XORi (SLTOp RC:$rhs, RC:$lhs), 1)>;
+ def : Pat<(setule RC:$lhs, RC:$rhs),
+ (XORi (SLTuOp RC:$rhs, RC:$lhs), 1)>;
+}
+
+multiclass SetgtPats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+ def : Pat<(setgt RC:$lhs, RC:$rhs),
+ (SLTOp RC:$rhs, RC:$lhs)>;
+ def : Pat<(setugt RC:$lhs, RC:$rhs),
+ (SLTuOp RC:$rhs, RC:$lhs)>;
+}
+
+multiclass SetgePats<RegisterClass RC, Instruction SLTOp, Instruction SLTuOp> {
+ def : Pat<(setge RC:$lhs, RC:$rhs),
+ (XORi (SLTOp RC:$lhs, RC:$rhs), 1)>;
+ def : Pat<(setuge RC:$lhs, RC:$rhs),
+ (XORi (SLTuOp RC:$lhs, RC:$rhs), 1)>;
+}
+
+multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp,
+ Instruction SLTiuOp> {
+ def : Pat<(setge RC:$lhs, immSExt16:$rhs),
+ (XORi (SLTiOp RC:$lhs, immSExt16:$rhs), 1)>;
+ def : Pat<(setuge RC:$lhs, immSExt16:$rhs),
+ (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
+}
+
+defm : SeteqPats<CPURegs, SLTiu, XOR, SLTu, ZERO>;
+defm : SetlePats<CPURegs, SLT, SLTu>;
+defm : SetgtPats<CPURegs, SLT, SLTu>;
+defm : SetgePats<CPURegs, SLT, SLTu>;
+defm : SetgeImmPats<CPURegs, SLTi, SLTiu>;
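The setcc multiclasses parameterize the old hard-coded patterns over a register class and its compare instructions, evidently so that a 64-bit variant can instantiate them later. They all rest on a few SLT/SLTu identities: a == b iff (a ^ b) <u 1, a != b iff 0 <u (a ^ b), a <= b iff !(b < a), a > b iff b < a, and a >= b iff !(a < b). A quick sanity check of those identities:

    #include <cassert>
    #include <cstdint>

    int slt(int32_t a, int32_t b)    { return a < b; }   // SLT
    int sltu(uint32_t a, uint32_t b) { return a < b; }   // SLTu

    void check(int32_t a, int32_t b) {
      assert((a == b) == sltu(a ^ b, 1));       // SeteqPats: seteq
      assert((a != b) == sltu(0, a ^ b));       // SeteqPats: setne
      assert((a <= b) == (slt(b, a) ^ 1));      // SetlePats: XORi(SLT, 1)
      assert((a >  b) == slt(b, a));            // SetgtPats
      assert((a >= b) == (slt(a, b) ^ 1));      // SetgePats
    }

    int main() {
      int32_t vals[] = {-5, -1, 0, 1, 7};
      for (int32_t a : vals)
        for (int32_t b : vals)
          check(a, b);
      return 0;
    }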
// select MipsDynAlloc
def : Pat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>;
@@ -878,4 +1013,5 @@ def : Pat<(MipsDynAlloc addr:$f), (DynAlloc addr:$f)>;
//===----------------------------------------------------------------------===//
include "MipsInstrFPU.td"
+include "Mips64InstrInfo.td"
diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp
new file mode 100644
index 0000000..28c2b48
--- /dev/null
+++ b/lib/Target/Mips/MipsJITInfo.cpp
@@ -0,0 +1,230 @@
+//===- MipsJITInfo.cpp - Implement the JIT interfaces for the Mips target -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Mips target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "MipsJITInfo.h"
+#include "MipsInstrInfo.h"
+#include "MipsRelocations.h"
+#include "MipsSubtarget.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Memory.h"
+#include <cstdlib>
+using namespace llvm;
+
+void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ report_fatal_error("MipsJITInfo::replaceMachineCodeForFunction");
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host. This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us. Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring. This code saves registers, calls
+// MipsCompilationCallbackC and restores registers.
+extern "C" {
+#if defined (__mips__)
+void MipsCompilationCallback();
+
+ asm(
+ ".text\n"
+ ".align 2\n"
+ ".globl " ASMPREFIX "MipsCompilationCallback\n"
+ ASMPREFIX "MipsCompilationCallback:\n"
+ ".ent " ASMPREFIX "MipsCompilationCallback\n"
+ ".frame $29, 32, $31\n"
+ ".set noreorder\n"
+ ".cpload $t9\n"
+
+ "addiu $sp, $sp, -60\n"
+ ".cprestore 16\n"
+
+ // Save the argument registers a0, a1, a2, a3, f12 and f14, since they may
+ // hold arguments for the real target function. We have to act as if this
+ // whole compilation callback doesn't exist as far as the caller is
+ // concerned. We also save the ra register, which holds the original return
+ // address, and the t8 register, which holds the address of the end of the
+ // function stub.
+ "sw $a0, 20($sp)\n"
+ "sw $a1, 24($sp)\n"
+ "sw $a2, 28($sp)\n"
+ "sw $a3, 32($sp)\n"
+ "sw $ra, 36($sp)\n"
+ "sw $t8, 40($sp)\n"
+ "sdc1 $f12, 44($sp)\n"
+ "sdc1 $f14, 52($sp)\n"
+
+ // t8 points at the end of the function stub. Pass the beginning of the stub
+ // to MipsCompilationCallbackC.
+ "addiu $a0, $t8, -16\n"
+ "jal " ASMPREFIX "MipsCompilationCallbackC\n"
+ "nop\n"
+
+ // Restore registers.
+ "lw $a0, 20($sp)\n"
+ "lw $a1, 24($sp)\n"
+ "lw $a2, 28($sp)\n"
+ "lw $a3, 32($sp)\n"
+ "lw $ra, 36($sp)\n"
+ "lw $t8, 40($sp)\n"
+ "ldc1 $f12, 44($sp)\n"
+ "ldc1 $f14, 52($sp)\n"
+ "addiu $sp, $sp, 60\n"
+
+ // Jump to the (newly modified) stub to invoke the real function.
+ "addiu $t8, $t8, -16\n"
+ "jr $t8\n"
+ "nop\n"
+
+ ".set reorder\n"
+ ".end " ASMPREFIX "MipsCompilationCallback\n"
+ );
+#else // host != Mips
+ void MipsCompilationCallback() {
+ llvm_unreachable(
+ "Cannot call MipsCompilationCallback() on a non-Mips arch!");
+ }
+#endif
+}
+
+/// MipsCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void MipsCompilationCallbackC(intptr_t StubAddr) {
+ // Get the address of the compiled code for this function.
+ intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+ // Rewrite the function stub so that we don't end up here every time we
+ // execute the call. We're replacing the first four instructions of the
+ // stub with code that jumps to the compiled function:
+ // lui $t9, %hi(NewVal)
+ // addiu $t9, $t9, %lo(NewVal)
+ // jr $t9
+ // nop
+
+ int Hi = ((unsigned)NewVal & 0xffff0000) >> 16;
+ if ((NewVal & 0x8000) != 0)
+ Hi++;
+ int Lo = (int)(NewVal & 0xffff);
+
+ *(intptr_t *)(StubAddr) = 0xf << 26 | 25 << 16 | Hi;
+ *(intptr_t *)(StubAddr + 4) = 9 << 26 | 25 << 21 | 25 << 16 | Lo;
+ *(intptr_t *)(StubAddr + 8) = 25 << 21 | 8;
+ *(intptr_t *)(StubAddr + 12) = 0;
+
+ sys::Memory::InvalidateInstructionCache((void*) StubAddr, 16);
+}
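The Hi/Lo computation above compensates for the fact that addiu sign-extends its 16-bit immediate: the stub rebuilds the address as (Hi << 16) + sign_extend(Lo), so when bit 15 of the address is set, Lo contributes a negative value and Hi must be pre-incremented. A sketch of the round trip:

    #include <cassert>
    #include <cstdint>

    // Split a 32-bit address into %hi/%lo halves the way the callback does,
    // then rebuild it the way lui/addiu would at run time.
    void check_hilo(uint32_t addr) {
      uint32_t hi = addr >> 16;
      if (addr & 0x8000)          // low half will sign-extend negative,
        ++hi;                     // so pre-increment the high half
      int32_t lo = (int16_t)(addr & 0xffff);  // addiu sign-extends
      assert((uint32_t)((hi << 16) + lo) == addr);
    }

    int main() {
      check_hilo(0x00400000);   // low half positive: no adjustment
      check_hilo(0x00408000);   // low half negative: hi becomes 0x0041
      check_hilo(0xffffffff);
      return 0;
    }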
+
+TargetJITInfo::LazyResolverFn MipsJITInfo::getLazyResolverFunction(
+ JITCompilerFn F) {
+ JITCompilerFunction = F;
+ return MipsCompilationCallback;
+}
+
+TargetJITInfo::StubLayout MipsJITInfo::getStubLayout() {
+ // The stub contains 4 4-byte instructions, aligned at 4 bytes. See
+ // emitFunctionStub for details.
+ StubLayout Result = { 4*4, 4 };
+ return Result;
+}
+
+void *MipsJITInfo::emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE) {
+ JCE.emitAlignment(4);
+ void *Addr = (void*) (JCE.getCurrentPCValue());
+ if (!sys::Memory::setRangeWritable(Addr, 16))
+ llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+ intptr_t EmittedAddr;
+ if (Fn != (void*)(intptr_t)MipsCompilationCallback)
+ EmittedAddr = (intptr_t)Fn;
+ else
+ EmittedAddr = (intptr_t)MipsCompilationCallback;
+
+ int Hi = ((unsigned)EmittedAddr & 0xffff0000) >> 16;
+ if ((EmittedAddr & 0x8000) != 0)
+ Hi++;
+ int Lo = (int)(EmittedAddr & 0xffff);
+
+ // lui t9, %hi(EmittedAddr)
+ // addiu t9, t9, %lo(EmittedAddr)
+ // jalr t8, t9
+ // nop
+ JCE.emitWordLE(0xf << 26 | 25 << 16 | Hi);
+ JCE.emitWordLE(9 << 26 | 25 << 21 | 25 << 16 | Lo);
+ JCE.emitWordLE(25 << 21 | 24 << 11 | 9);
+ JCE.emitWordLE(0);
+
+ sys::Memory::InvalidateInstructionCache(Addr, 16);
+ if (!sys::Memory::setRangeExecutable(Addr, 16))
+ llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+ return Addr;
+}
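The emitWordLE literals above are hand-assembled MIPS machine words: lui and addiu use the I-type layout (opcode, rs, rt, 16-bit immediate) and jalr uses the R-type layout with funct 9, with $t9 = GPR 25 and $t8 = GPR 24 as in the stub comment. A small decomposition of those literals (encoder helpers are illustrative):

    #include <cassert>
    #include <cstdint>

    // I-type: opcode(6) | rs(5) | rt(5) | imm(16)
    uint32_t iType(unsigned op, unsigned rs, unsigned rt, uint16_t imm) {
      return op << 26 | rs << 21 | rt << 16 | imm;
    }

    // R-type: 0(6) | rs(5) | rt(5) | rd(5) | shamt(5) | funct(6)
    uint32_t rType(unsigned rs, unsigned rt, unsigned rd, unsigned funct) {
      return rs << 21 | rt << 16 | rd << 11 | funct;
    }

    int main() {
      const unsigned T8 = 24, T9 = 25;
      uint16_t Hi = 0x1234, Lo = 0x5678;
      // lui $t9, Hi
      assert(iType(0x0f, 0, T9, Hi) == (0xfu << 26 | 25 << 16 | Hi));
      // addiu $t9, $t9, Lo
      assert(iType(0x09, T9, T9, Lo) == (9u << 26 | 25 << 21 | 25 << 16 | Lo));
      // jalr $t8, $t9
      assert(rType(T9, 0, T8, 9) == (25u << 21 | 24 << 11 | 9));
      return 0;
    }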
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void MipsJITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+
+ void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+ switch ((Mips::RelocationType) MR->getRelocationType()) {
+ case Mips::reloc_mips_branch:
+ ResultPtr = (((ResultPtr - (intptr_t) RelocPos) - 4) >> 2) & 0xffff;
+ *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+ break;
+
+ case Mips::reloc_mips_26:
+ ResultPtr = (ResultPtr & 0x0fffffff) >> 2;
+ *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+ break;
+
+ case Mips::reloc_mips_hi:
+ ResultPtr = ResultPtr >> 16;
+ if ((((intptr_t) (MR->getResultPointer()) & 0xffff) >> 15) == 1) {
+ ResultPtr += 1;
+ }
+ *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+ break;
+
+ case Mips::reloc_mips_lo:
+ ResultPtr = ResultPtr & 0xffff;
+ *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+ break;
+
+ default:
+ llvm_unreachable("ERROR: Unknown Mips relocation.");
+ }
+ }
+}
diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h
new file mode 100644
index 0000000..41f32a3
--- /dev/null
+++ b/lib/Target/Mips/MipsJITInfo.h
@@ -0,0 +1,70 @@
+//===- MipsJITInfo.h - Mips implementation of the JIT interface -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSJITINFO_H
+#define MIPSJITINFO_H
+
+#include "MipsMachineFunction.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+class MipsTargetMachine;
+
+class MipsJITInfo : public TargetJITInfo {
+
+ bool IsPIC;
+
+ public:
+ explicit MipsJITInfo() :
+ IsPIC(false) {}
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ // getStubLayout - Returns the size and alignment of the largest call stub
+ // on Mips.
+ virtual StubLayout getStubLayout();
+
+ /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(const Function* F, void *Fn,
+ JITCodeEmitter &JCE);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+
+ /// Initialize - Initialize internal state for the function being JITted.
+ void Initialize(const MachineFunction &MF, bool isPIC) {
+ IsPIC = isPIC;
+ }
+
+};
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index f5cc3aa..608a7d2 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -29,10 +29,10 @@ MipsMCInstLower::MipsMCInstLower(Mangler *mang, const MachineFunction &mf,
: Ctx(mf.getContext()), Mang(mang), AsmPrinter(asmprinter) {}
MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
- MachineOperandType MOTy) const {
+ MachineOperandType MOTy,
+ unsigned Offset) const {
MipsMCSymbolRefExpr::VariantKind Kind;
const MCSymbol *Symbol;
- int Offset = 0;
switch(MO.getTargetFlags()) {
default: assert(0 && "Invalid target flag!");
@@ -46,6 +46,11 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case MipsII::MO_GOTTPREL: Kind = MipsMCSymbolRefExpr::VK_Mips_GOTTPREL; break;
case MipsII::MO_TPREL_HI: Kind = MipsMCSymbolRefExpr::VK_Mips_TPREL_HI; break;
case MipsII::MO_TPREL_LO: Kind = MipsMCSymbolRefExpr::VK_Mips_TPREL_LO; break;
+ case MipsII::MO_GPOFF_HI: Kind = MipsMCSymbolRefExpr::VK_Mips_GPOFF_HI; break;
+ case MipsII::MO_GPOFF_LO: Kind = MipsMCSymbolRefExpr::VK_Mips_GPOFF_LO; break;
+ case MipsII::MO_GOT_DISP: Kind = MipsMCSymbolRefExpr::VK_Mips_GOT_DISP; break;
+ case MipsII::MO_GOT_PAGE: Kind = MipsMCSymbolRefExpr::VK_Mips_GOT_PAGE; break;
+ case MipsII::MO_GOT_OFST: Kind = MipsMCSymbolRefExpr::VK_Mips_GOT_OFST; break;
}
switch (MOTy) {
@@ -72,7 +77,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
case MachineOperand::MO_ConstantPoolIndex:
Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
if (MO.getOffset())
- Offset = MO.getOffset();
+ Offset += MO.getOffset();
break;
default:
@@ -83,36 +88,39 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
Ctx));
}
+MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO) const {
+ MachineOperandType MOTy = MO.getType();
+
+ switch (MOTy) {
+ default:
+ assert(0 && "unknown operand type");
+ break;
+ case MachineOperand::MO_Register:
+ // Ignore all implicit register operands.
+ if (MO.isImplicit()) break;
+ return MCOperand::CreateReg(MO.getReg());
+ case MachineOperand::MO_Immediate:
+ return MCOperand::CreateImm(MO.getImm());
+ case MachineOperand::MO_MachineBasicBlock:
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ case MachineOperand::MO_JumpTableIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_BlockAddress:
+ return LowerSymbolOperand(MO, MOTy, 0);
+ }
+
+ return MCOperand();
+}
+
void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
- MCOperand MCOp;
- MachineOperandType MOTy = MO.getType();
+ MCOperand MCOp = LowerOperand(MO);
- switch (MOTy) {
- default:
- MI->dump();
- llvm_unreachable("unknown operand type");
- case MachineOperand::MO_Register:
- // Ignore all implicit register operands.
- if (MO.isImplicit()) continue;
- MCOp = MCOperand::CreateReg(MO.getReg());
- break;
- case MachineOperand::MO_Immediate:
- MCOp = MCOperand::CreateImm(MO.getImm());
- break;
- case MachineOperand::MO_MachineBasicBlock:
- case MachineOperand::MO_GlobalAddress:
- case MachineOperand::MO_ExternalSymbol:
- case MachineOperand::MO_JumpTableIndex:
- case MachineOperand::MO_ConstantPoolIndex:
- case MachineOperand::MO_BlockAddress:
- MCOp = LowerSymbolOperand(MO, MOTy);
- break;
- }
-
- OutMI.addOperand(MCOp);
+ if (MCOp.isValid())
+ OutMI.addOperand(MCOp);
}
}
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index ec5201b..223f23a 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -1,4 +1,4 @@
-//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//===-- MipsMCInstLower.h - Lower MachineInstr to MCInst -------------------==//
//
// The LLVM Compiler Infrastructure
//
@@ -36,7 +36,8 @@ public:
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
private:
MCOperand LowerSymbolOperand(const MachineOperand &MO,
- MachineOperandType MOTy) const;
+ MachineOperandType MOTy, unsigned Offset) const;
+ MCOperand LowerOperand(const MachineOperand& MO) const;
};
}
diff --git a/lib/Target/Mips/MipsMCSymbolRefExpr.cpp b/lib/Target/Mips/MipsMCSymbolRefExpr.cpp
index 9a2bdae..a0a242c 100644
--- a/lib/Target/Mips/MipsMCSymbolRefExpr.cpp
+++ b/lib/Target/Mips/MipsMCSymbolRefExpr.cpp
@@ -33,6 +33,11 @@ void MipsMCSymbolRefExpr::PrintImpl(raw_ostream &OS) const {
case VK_Mips_GOTTPREL: OS << "%gottprel("; break;
case VK_Mips_TPREL_HI: OS << "%tprel_hi("; break;
case VK_Mips_TPREL_LO: OS << "%tprel_lo("; break;
+ case VK_Mips_GPOFF_HI: OS << "%hi(%neg(%gp_rel("; break;
+ case VK_Mips_GPOFF_LO: OS << "%lo(%neg(%gp_rel("; break;
+ case VK_Mips_GOT_DISP: OS << "%got_disp("; break;
+ case VK_Mips_GOT_PAGE: OS << "%got_page("; break;
+ case VK_Mips_GOT_OFST: OS << "%got_ofst("; break;
}
OS << *Symbol;
@@ -43,7 +48,9 @@ void MipsMCSymbolRefExpr::PrintImpl(raw_ostream &OS) const {
OS << Offset;
}
- if (Kind != VK_Mips_None)
+ if (Kind == VK_Mips_GPOFF_HI || Kind == VK_Mips_GPOFF_LO)
+ OS << ")))";
+ else if (Kind != VK_Mips_None)
OS << ')';
}
diff --git a/lib/Target/Mips/MipsMCSymbolRefExpr.h b/lib/Target/Mips/MipsMCSymbolRefExpr.h
index 3e69596..55e85a7 100644
--- a/lib/Target/Mips/MipsMCSymbolRefExpr.h
+++ b/lib/Target/Mips/MipsMCSymbolRefExpr.h
@@ -25,7 +25,12 @@ public:
VK_Mips_TLSGD,
VK_Mips_GOTTPREL,
VK_Mips_TPREL_HI,
- VK_Mips_TPREL_LO
+ VK_Mips_TPREL_LO,
+ VK_Mips_GPOFF_HI,
+ VK_Mips_GPOFF_LO,
+ VK_Mips_GOT_DISP,
+ VK_Mips_GOT_PAGE,
+ VK_Mips_GOT_OFST
};
private:
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index dbb7a67..bc30b6b 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -51,16 +51,12 @@ private:
mutable int DynAllocFI; // Frame index of dynamically allocated stack area.
unsigned MaxCallFrameSize;
- /// AtomicFrameIndex - To implement atomic.swap and atomic.cmp.swap
- /// intrinsics, it is necessary to use a temporary stack location.
- /// This field holds the frame index of this location.
- int AtomicFrameIndex;
public:
MipsFunctionInfo(MachineFunction& MF)
: MF(MF), SRetReturnReg(0), GlobalBaseReg(0),
VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)),
OutArgFIRange(std::make_pair(-1, 0)), GPFI(0), DynAllocFI(0),
- MaxCallFrameSize(0), AtomicFrameIndex(-1)
+ MaxCallFrameSize(0)
{}
bool isInArgFI(int FI) const {
@@ -104,9 +100,6 @@ public:
unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; }
void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
-
- int getAtomicFrameIndex() const { return AtomicFrameIndex; }
- void setAtomicFrameIndex(int Index) { AtomicFrameIndex = Index; }
};
} // end of namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index 24390da..f8c0fda 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -24,7 +24,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -44,7 +43,7 @@ using namespace llvm;
MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST,
const TargetInstrInfo &tii)
- : MipsGenRegisterInfo(), Subtarget(ST), TII(tii) {}
+ : MipsGenRegisterInfo(Mips::RA), Subtarget(ST), TII(tii) {}
/// getRegisterNumbering - Given the enum value for some register, e.g.
/// Mips::RA, return the number that it corresponds to (e.g. 31).
@@ -52,39 +51,87 @@ unsigned MipsRegisterInfo::
getRegisterNumbering(unsigned RegEnum)
{
switch (RegEnum) {
- case Mips::ZERO : case Mips::F0 : case Mips::D0 : return 0;
- case Mips::AT : case Mips::F1 : return 1;
- case Mips::V0 : case Mips::F2 : case Mips::D1 : return 2;
- case Mips::V1 : case Mips::F3 : return 3;
- case Mips::A0 : case Mips::F4 : case Mips::D2 : return 4;
- case Mips::A1 : case Mips::F5 : return 5;
- case Mips::A2 : case Mips::F6 : case Mips::D3 : return 6;
- case Mips::A3 : case Mips::F7 : return 7;
- case Mips::T0 : case Mips::F8 : case Mips::D4 : return 8;
- case Mips::T1 : case Mips::F9 : return 9;
- case Mips::T2 : case Mips::F10: case Mips::D5: return 10;
- case Mips::T3 : case Mips::F11: return 11;
- case Mips::T4 : case Mips::F12: case Mips::D6: return 12;
- case Mips::T5 : case Mips::F13: return 13;
- case Mips::T6 : case Mips::F14: case Mips::D7: return 14;
- case Mips::T7 : case Mips::F15: return 15;
- case Mips::S0 : case Mips::F16: case Mips::D8: return 16;
- case Mips::S1 : case Mips::F17: return 17;
- case Mips::S2 : case Mips::F18: case Mips::D9: return 18;
- case Mips::S3 : case Mips::F19: return 19;
- case Mips::S4 : case Mips::F20: case Mips::D10: return 20;
- case Mips::S5 : case Mips::F21: return 21;
- case Mips::S6 : case Mips::F22: case Mips::D11: return 22;
- case Mips::S7 : case Mips::F23: return 23;
- case Mips::T8 : case Mips::F24: case Mips::D12: return 24;
- case Mips::T9 : case Mips::F25: return 25;
- case Mips::K0 : case Mips::F26: case Mips::D13: return 26;
- case Mips::K1 : case Mips::F27: return 27;
- case Mips::GP : case Mips::F28: case Mips::D14: return 28;
- case Mips::SP : case Mips::F29: return 29;
- case Mips::FP : case Mips::F30: case Mips::D15: return 30;
- case Mips::RA : case Mips::F31: return 31;
- default: llvm_unreachable("Unknown register number!");
+ case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64:
+ case Mips::D0:
+ return 0;
+ case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64:
+ return 1;
+ case Mips::V0: case Mips::V0_64: case Mips::F2: case Mips::D2_64:
+ case Mips::D1:
+ return 2;
+ case Mips::V1: case Mips::V1_64: case Mips::F3: case Mips::D3_64:
+ return 3;
+ case Mips::A0: case Mips::A0_64: case Mips::F4: case Mips::D4_64:
+ case Mips::D2:
+ return 4;
+ case Mips::A1: case Mips::A1_64: case Mips::F5: case Mips::D5_64:
+ return 5;
+ case Mips::A2: case Mips::A2_64: case Mips::F6: case Mips::D6_64:
+ case Mips::D3:
+ return 6;
+ case Mips::A3: case Mips::A3_64: case Mips::F7: case Mips::D7_64:
+ return 7;
+ case Mips::T0: case Mips::T0_64: case Mips::F8: case Mips::D8_64:
+ case Mips::D4:
+ return 8;
+ case Mips::T1: case Mips::T1_64: case Mips::F9: case Mips::D9_64:
+ return 9;
+ case Mips::T2: case Mips::T2_64: case Mips::F10: case Mips::D10_64:
+ case Mips::D5:
+ return 10;
+ case Mips::T3: case Mips::T3_64: case Mips::F11: case Mips::D11_64:
+ return 11;
+ case Mips::T4: case Mips::T4_64: case Mips::F12: case Mips::D12_64:
+ case Mips::D6:
+ return 12;
+ case Mips::T5: case Mips::T5_64: case Mips::F13: case Mips::D13_64:
+ return 13;
+ case Mips::T6: case Mips::T6_64: case Mips::F14: case Mips::D14_64:
+ case Mips::D7:
+ return 14;
+ case Mips::T7: case Mips::T7_64: case Mips::F15: case Mips::D15_64:
+ return 15;
+ case Mips::S0: case Mips::S0_64: case Mips::F16: case Mips::D16_64:
+ case Mips::D8:
+ return 16;
+ case Mips::S1: case Mips::S1_64: case Mips::F17: case Mips::D17_64:
+ return 17;
+ case Mips::S2: case Mips::S2_64: case Mips::F18: case Mips::D18_64:
+ case Mips::D9:
+ return 18;
+ case Mips::S3: case Mips::S3_64: case Mips::F19: case Mips::D19_64:
+ return 19;
+ case Mips::S4: case Mips::S4_64: case Mips::F20: case Mips::D20_64:
+ case Mips::D10:
+ return 20;
+ case Mips::S5: case Mips::S5_64: case Mips::F21: case Mips::D21_64:
+ return 21;
+ case Mips::S6: case Mips::S6_64: case Mips::F22: case Mips::D22_64:
+ case Mips::D11:
+ return 22;
+ case Mips::S7: case Mips::S7_64: case Mips::F23: case Mips::D23_64:
+ return 23;
+ case Mips::T8: case Mips::T8_64: case Mips::F24: case Mips::D24_64:
+ case Mips::D12:
+ return 24;
+ case Mips::T9: case Mips::T9_64: case Mips::F25: case Mips::D25_64:
+ return 25;
+ case Mips::K0: case Mips::K0_64: case Mips::F26: case Mips::D26_64:
+ case Mips::D13:
+ return 26;
+ case Mips::K1: case Mips::K1_64: case Mips::F27: case Mips::D27_64:
+ return 27;
+ case Mips::GP: case Mips::GP_64: case Mips::F28: case Mips::D28_64:
+ case Mips::D14:
+ return 28;
+ case Mips::SP: case Mips::SP_64: case Mips::F29: case Mips::D29_64:
+ return 29;
+ case Mips::FP: case Mips::FP_64: case Mips::F30: case Mips::D30_64:
+ case Mips::D15:
+ return 30;
+ case Mips::RA: case Mips::RA_64: case Mips::F31: case Mips::D31_64:
+ return 31;
+ default: llvm_unreachable("Unknown register number!");
}
return 0; // Not reached
}
@@ -101,7 +148,7 @@ getCalleeSavedRegs(const MachineFunction *MF) const
{
// Mips callee-save register range is $16-$23, $f20-$f31
static const unsigned SingleFloatOnlyCalleeSavedRegs[] = {
- Mips::F30, Mips::F29, Mips::F28, Mips::F27, Mips::F26,
+ Mips::F31, Mips::F30, Mips::F29, Mips::F28, Mips::F27, Mips::F26,
Mips::F25, Mips::F24, Mips::F23, Mips::F22, Mips::F21, Mips::F20,
Mips::RA, Mips::FP, Mips::S7, Mips::S6, Mips::S5, Mips::S4,
Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
@@ -113,31 +160,71 @@ getCalleeSavedRegs(const MachineFunction *MF) const
Mips::S3, Mips::S2, Mips::S1, Mips::S0, 0
};
+ static const unsigned N32CalleeSavedRegs[] = {
+ Mips::D31_64, Mips::D29_64, Mips::D27_64, Mips::D25_64, Mips::D23_64,
+ Mips::D21_64,
+ Mips::RA_64, Mips::FP_64, Mips::GP_64, Mips::S7_64, Mips::S6_64,
+ Mips::S5_64, Mips::S4_64, Mips::S3_64, Mips::S2_64, Mips::S1_64,
+ Mips::S0_64, 0
+ };
+
+ static const unsigned N64CalleeSavedRegs[] = {
+ Mips::D31_64, Mips::D30_64, Mips::D29_64, Mips::D28_64, Mips::D27_64,
+ Mips::D26_64, Mips::D25_64, Mips::D24_64,
+ Mips::RA_64, Mips::FP_64, Mips::GP_64, Mips::S7_64, Mips::S6_64,
+ Mips::S5_64, Mips::S4_64, Mips::S3_64, Mips::S2_64, Mips::S1_64,
+ Mips::S0_64, 0
+ };
+
if (Subtarget.isSingleFloat())
return SingleFloatOnlyCalleeSavedRegs;
- else
+ else if (!Subtarget.hasMips64())
return Mips32CalleeSavedRegs;
+ else if (Subtarget.isABI_N32())
+ return N32CalleeSavedRegs;
+
+ assert(Subtarget.isABI_N64());
+ return N64CalleeSavedRegs;
}
BitVector MipsRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
+ static const unsigned ReservedCPURegs[] = {
+ Mips::ZERO, Mips::AT, Mips::K0, Mips::K1,
+ Mips::GP, Mips::SP, Mips::FP, Mips::RA, 0
+ };
+
+ static const unsigned ReservedCPU64Regs[] = {
+ Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64,
+ Mips::GP_64, Mips::SP_64, Mips::FP_64, Mips::RA_64, 0
+ };
+
BitVector Reserved(getNumRegs());
- Reserved.set(Mips::ZERO);
- Reserved.set(Mips::AT);
- Reserved.set(Mips::K0);
- Reserved.set(Mips::K1);
- Reserved.set(Mips::GP);
- Reserved.set(Mips::SP);
- Reserved.set(Mips::FP);
- Reserved.set(Mips::RA);
- Reserved.set(Mips::F31);
- Reserved.set(Mips::D15);
-
- // SRV4 requires that odd register can't be used.
- if (!Subtarget.isSingleFloat() && !Subtarget.isMips32())
- for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2)
- Reserved.set(FReg);
+ typedef TargetRegisterClass::iterator RegIter;
+
+ for (const unsigned *Reg = ReservedCPURegs; *Reg; ++Reg)
+ Reserved.set(*Reg);
+ if (Subtarget.hasMips64()) {
+ for (const unsigned *Reg = ReservedCPU64Regs; *Reg; ++Reg)
+ Reserved.set(*Reg);
+
+ // Reserve all registers in AFGR64.
+ for (RegIter Reg = Mips::AFGR64RegisterClass->begin();
+ Reg != Mips::AFGR64RegisterClass->end(); ++Reg)
+ Reserved.set(*Reg);
+ }
+ else {
+ // Reserve all registers in CPU64Regs & FGR64.
+ for (RegIter Reg = Mips::CPU64RegsRegisterClass->begin();
+ Reg != Mips::CPU64RegsRegisterClass->end(); ++Reg)
+ Reserved.set(*Reg);
+
+ for (RegIter Reg = Mips::FGR64RegisterClass->begin();
+ Reg != Mips::FGR64RegisterClass->end(); ++Reg)
+ Reserved.set(*Reg);
+ }
+
return Reserved;
}
@@ -245,11 +332,6 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
}
unsigned MipsRegisterInfo::
-getRARegister() const {
- return Mips::RA;
-}
-
-unsigned MipsRegisterInfo::
getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -267,12 +349,3 @@ getEHHandlerRegister() const {
llvm_unreachable("What is the exception handler register");
return 0;
}
-
-int MipsRegisterInfo::
-getDwarfRegNum(unsigned RegNum, bool isEH) const {
- return MipsGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int MipsRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
- return MipsGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 646369b..67e57dd 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -57,15 +57,11 @@ struct MipsRegisterInfo : public MipsGenRegisterInfo {
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
/// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
/// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
-
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
};
} // end namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index f0db518..925ad9e 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -10,6 +10,11 @@
//===----------------------------------------------------------------------===//
// Declarations that describe the MIPS register file
//===----------------------------------------------------------------------===//
+let Namespace = "Mips" in {
+def sub_fpeven : SubRegIndex;
+def sub_fpodd : SubRegIndex;
+def sub_32 : SubRegIndex;
+}
// We have banks of 32 registers each.
class MipsReg<string n> : Register<n> {
@@ -28,22 +33,31 @@ class MipsGPRReg<bits<5> num, string n> : MipsReg<n> {
let Num = num;
}
+// Mips 64-bit CPU Registers
+class Mips64GPRReg<bits<5> num, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<n, subregs> {
+ let Num = num;
+ let SubRegIndices = [sub_32];
+}
+
// Mips 32-bit FPU Registers
class FPR<bits<5> num, string n> : MipsReg<n> {
let Num = num;
}
// Mips 64-bit (aliased) FPU Registers
-let Namespace = "Mips" in {
-def sub_fpeven : SubRegIndex;
-def sub_fpodd : SubRegIndex;
-}
class AFPR<bits<5> num, string n, list<Register> subregs>
: MipsRegWithSubRegs<n, subregs> {
let Num = num;
let SubRegIndices = [sub_fpeven, sub_fpodd];
}
+class AFPR64<bits<5> num, string n, list<Register> subregs>
+ : MipsRegWithSubRegs<n, subregs> {
+ let Num = num;
+ let SubRegIndices = [sub_32];
+}
+
// Mips Hardware Registers
class HWR<bits<5> num, string n> : MipsReg<n> {
let Num = num;
@@ -54,6 +68,7 @@ class HWR<bits<5> num, string n> : MipsReg<n> {
//===----------------------------------------------------------------------===//
let Namespace = "Mips" in {
+ // FIXME: Fix DwarfRegNum.
// General Purpose Registers
def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<[0]>;
@@ -89,6 +104,40 @@ let Namespace = "Mips" in {
def FP : MipsGPRReg< 30, "FP">, DwarfRegNum<[30]>;
def RA : MipsGPRReg< 31, "RA">, DwarfRegNum<[31]>;
+ // General Purpose 64-bit Registers
+ def ZERO_64 : Mips64GPRReg< 0, "ZERO", [ZERO]>;
+ def AT_64 : Mips64GPRReg< 1, "AT", [AT]>;
+ def V0_64 : Mips64GPRReg< 2, "2", [V0]>;
+ def V1_64 : Mips64GPRReg< 3, "3", [V1]>;
+ def A0_64 : Mips64GPRReg< 4, "4", [A0]>;
+ def A1_64 : Mips64GPRReg< 5, "5", [A1]>;
+ def A2_64 : Mips64GPRReg< 6, "6", [A2]>;
+ def A3_64 : Mips64GPRReg< 7, "7", [A3]>;
+ def T0_64 : Mips64GPRReg< 8, "8", [T0]>;
+ def T1_64 : Mips64GPRReg< 9, "9", [T1]>;
+ def T2_64 : Mips64GPRReg< 10, "10", [T2]>;
+ def T3_64 : Mips64GPRReg< 11, "11", [T3]>;
+ def T4_64 : Mips64GPRReg< 12, "12", [T4]>;
+ def T5_64 : Mips64GPRReg< 13, "13", [T5]>;
+ def T6_64 : Mips64GPRReg< 14, "14", [T6]>;
+ def T7_64 : Mips64GPRReg< 15, "15", [T7]>;
+ def S0_64 : Mips64GPRReg< 16, "16", [S0]>;
+ def S1_64 : Mips64GPRReg< 17, "17", [S1]>;
+ def S2_64 : Mips64GPRReg< 18, "18", [S2]>;
+ def S3_64 : Mips64GPRReg< 19, "19", [S3]>;
+ def S4_64 : Mips64GPRReg< 20, "20", [S4]>;
+ def S5_64 : Mips64GPRReg< 21, "21", [S5]>;
+ def S6_64 : Mips64GPRReg< 22, "22", [S6]>;
+ def S7_64 : Mips64GPRReg< 23, "23", [S7]>;
+ def T8_64 : Mips64GPRReg< 24, "24", [T8]>;
+ def T9_64 : Mips64GPRReg< 25, "25", [T9]>;
+ def K0_64 : Mips64GPRReg< 26, "26", [K0]>;
+ def K1_64 : Mips64GPRReg< 27, "27", [K1]>;
+ def GP_64 : Mips64GPRReg< 28, "GP", [GP]>;
+ def SP_64 : Mips64GPRReg< 29, "SP", [SP]>;
+ def FP_64 : Mips64GPRReg< 30, "FP", [FP]>;
+ def RA_64 : Mips64GPRReg< 31, "RA", [RA]>;
+
/// Mips single precision FPU Registers
def F0 : FPR< 0, "F0">, DwarfRegNum<[32]>;
def F1 : FPR< 1, "F1">, DwarfRegNum<[33]>;
@@ -142,10 +191,49 @@ let Namespace = "Mips" in {
def D14 : AFPR<28, "F28", [F28, F29]>;
def D15 : AFPR<30, "F30", [F30, F31]>;
+ /// Mips double precision FPU Registers in MFP64 mode.
+ def D0_64 : AFPR64<0, "F0", [F0]>;
+ def D1_64 : AFPR64<1, "F1", [F1]>;
+ def D2_64 : AFPR64<2, "F2", [F2]>;
+ def D3_64 : AFPR64<3, "F3", [F3]>;
+ def D4_64 : AFPR64<4, "F4", [F4]>;
+ def D5_64 : AFPR64<5, "F5", [F5]>;
+ def D6_64 : AFPR64<6, "F6", [F6]>;
+ def D7_64 : AFPR64<7, "F7", [F7]>;
+ def D8_64 : AFPR64<8, "F8", [F8]>;
+ def D9_64 : AFPR64<9, "F9", [F9]>;
+ def D10_64 : AFPR64<10, "F10", [F10]>;
+ def D11_64 : AFPR64<11, "F11", [F11]>;
+ def D12_64 : AFPR64<12, "F12", [F12]>;
+ def D13_64 : AFPR64<13, "F13", [F13]>;
+ def D14_64 : AFPR64<14, "F14", [F14]>;
+ def D15_64 : AFPR64<15, "F15", [F15]>;
+ def D16_64 : AFPR64<16, "F16", [F16]>;
+ def D17_64 : AFPR64<17, "F17", [F17]>;
+ def D18_64 : AFPR64<18, "F18", [F18]>;
+ def D19_64 : AFPR64<19, "F19", [F19]>;
+ def D20_64 : AFPR64<20, "F20", [F20]>;
+ def D21_64 : AFPR64<21, "F21", [F21]>;
+ def D22_64 : AFPR64<22, "F22", [F22]>;
+ def D23_64 : AFPR64<23, "F23", [F23]>;
+ def D24_64 : AFPR64<24, "F24", [F24]>;
+ def D25_64 : AFPR64<25, "F25", [F25]>;
+ def D26_64 : AFPR64<26, "F26", [F26]>;
+ def D27_64 : AFPR64<27, "F27", [F27]>;
+ def D28_64 : AFPR64<28, "F28", [F28]>;
+ def D29_64 : AFPR64<29, "F29", [F29]>;
+ def D30_64 : AFPR64<30, "F30", [F30]>;
+ def D31_64 : AFPR64<31, "F31", [F31]>;
+
// Hi/Lo registers
def HI : Register<"hi">, DwarfRegNum<[64]>;
def LO : Register<"lo">, DwarfRegNum<[65]>;
+ let SubRegIndices = [sub_32] in {
+ def HI64 : RegisterWithSubRegs<"hi", [HI]>;
+ def LO64 : RegisterWithSubRegs<"lo", [LO]>;
+ }
+
// Status flags register
def FCR31 : Register<"31">;
@@ -167,6 +255,18 @@ def CPURegs : RegisterClass<"Mips", [i32], 32, (add
// Reserved
ZERO, AT, K0, K1, GP, SP, FP, RA)>;
+def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add
+ // Return Values and Arguments
+ V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
+ // Not preserved across procedure calls
+ T0_64, T1_64, T2_64, T3_64, T4_64, T5_64, T6_64, T7_64, T8_64, T9_64,
+ // Callee save
+ S0_64, S1_64, S2_64, S3_64, S4_64, S5_64, S6_64, S7_64,
+ // Reserved
+ ZERO_64, AT_64, K0_64, K1_64, GP_64, SP_64, FP_64, RA_64)> {
+ let SubRegClasses = [(CPURegs sub_32)];
+}
+
// 64-bit fp:
// * FGR64  - 32 64-bit registers
// * AFGR64 - 16 64-bit registers, each aliasing an even/odd pair of
//            32-bit registers (32-bit FP mode)
@@ -182,17 +282,22 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
// Not preserved across procedure calls
D2, D3, D4, D5, D8, D9,
// Callee save
- D10, D11, D12, D13, D14,
- // Reserved
- D15)> {
+ D10, D11, D12, D13, D14, D15)> {
let SubRegClasses = [(FGR32 sub_fpeven, sub_fpodd)];
}
+def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)> {
+ let SubRegClasses = [(FGR32 sub_32)];
+}
+
// Condition Register for floating point operations
def CCR : RegisterClass<"Mips", [i32], 32, (add FCR31)>;
// Hi/Lo Registers
def HILO : RegisterClass<"Mips", [i32], 32, (add HI, LO)>;
+def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)> {
+ let SubRegClasses = [(HILO sub_32)];
+}
// Hardware registers
def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>;
diff --git a/lib/Target/Mips/MipsRelocations.h b/lib/Target/Mips/MipsRelocations.h
new file mode 100644
index 0000000..66d1bfd
--- /dev/null
+++ b/lib/Target/Mips/MipsRelocations.h
@@ -0,0 +1,41 @@
+//===- MipsRelocations.h - Mips Code Relocations ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file defines the Mips target-specific relocation types
+// (for relocation-model=static).
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef MIPSRELOCATIONS_H_
+#define MIPSRELOCATIONS_H_
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace Mips{
+ enum RelocationType {
+ // reloc_mips_branch - pc relative relocation for branches. The lower 18
+ // bits of the difference between the branch target and the branch
+ // instruction, shifted right by 2.
+ reloc_mips_branch = 1,
+
+      // reloc_mips_hi - upper 16 bits of the address, incremented by 1 if the
+      // lower 16 bits of the address are negative (i.e. bit 15 is set).
+ reloc_mips_hi = 2,
+
+ // reloc_mips_lo - lower 16 bits of the address.
+ reloc_mips_lo = 3,
+
+ // reloc_mips_26 - lower 28 bits of the address, shifted right by 2.
+ reloc_mips_26 = 4
+ };
+ }
+}
+
+#endif /* MIPSRELOCATIONS_H_ */
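reloc_mips_26 matches the MIPS J-type jump format: the instruction carries bits 27..2 of the target, and the hardware forms the destination by combining that field, shifted left by 2, with the top four bits of the address of the instruction after the jump. A sketch of the pack/unpack round trip (valid only while both ends share one 256 MB region):

    #include <cassert>
    #include <cstdint>

    // Pack a jump target into the 26-bit field of a J-type instruction
    // (reloc_mips_26: lower 28 bits of the address, shifted right by 2).
    uint32_t packJ(uint32_t target) {
      return (target & 0x0fffffff) >> 2;
    }

    // What the hardware computes: the 26-bit field shifted left by 2, OR'd
    // with the top 4 bits of the address of the instruction after the jump.
    uint32_t unpackJ(uint32_t field, uint32_t pcOfDelaySlot) {
      return (pcOfDelaySlot & 0xf0000000) | (field << 2);
    }

    int main() {
      uint32_t target = 0x00400120, jumpAddr = 0x00400050;
      assert(unpackJ(packJ(target), jumpAddr + 4) == target);
      return 0;
    }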
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 6eee333..016d449 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -13,7 +13,7 @@
#include "MipsSubtarget.h"
#include "Mips.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
@@ -24,15 +24,14 @@ using namespace llvm;
MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool little) :
MipsGenSubtargetInfo(TT, CPU, FS),
- MipsArchVersion(Mips1), MipsABI(O32), IsLittle(little), IsSingleFloat(false),
- IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true),
- HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), HasMinMax(false),
- HasSwap(false), HasBitCount(false)
+ MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
+ IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
+ IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false),
+ HasMinMax(false), HasSwap(false), HasBitCount(false)
{
std::string CPUName = CPU;
if (CPUName.empty())
- CPUName = "mips1";
- MipsArchVersion = Mips1;
+ CPUName = "mips32r1";
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
@@ -40,23 +39,16 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUName);
+ // Set MipsABI if it hasn't been set yet.
+ if (MipsABI == UnknownABI)
+ MipsABI = hasMips64() ? N64 : O32;
+
+ // Check if Architecture and ABI are compatible.
+ assert(((!hasMips64() && (isABI_O32() || isABI_EABI())) ||
+ (hasMips64() && (isABI_N32() || isABI_N64()))) &&
+ "Invalid Arch & ABI pair.");
+
// Is the target system Linux?
if (TT.find("linux") == std::string::npos)
IsLinux = false;
-
- // When only the target triple is specified and is
- // a allegrex target, set the features. We also match
- // big and little endian allegrex cores (dont really
- // know if a big one exists)
- if (TT.find("mipsallegrex") != std::string::npos ||
- TT.find("psp") != std::string::npos) {
- MipsABI = EABI;
- IsSingleFloat = true;
- MipsArchVersion = Mips2;
- HasVFPU = true; // Enables Allegrex Vector FPU (not supported yet)
- HasSEInReg = true;
- HasBitCount = true;
- HasSwap = true;
- HasCondMov = true;
- }
}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 533d4af..d9dddad 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -27,14 +27,15 @@ class StringRef;
class MipsSubtarget : public MipsGenSubtargetInfo {
public:
+ // NOTE: O64 will not be supported.
enum MipsABIEnum {
- O32, O64, N32, N64, EABI
+ UnknownABI, O32, N32, N64, EABI
};
protected:
enum MipsArchEnum {
- Mips1, Mips2, Mips3, Mips4, Mips32, Mips32r2
+ Mips32, Mips32r2, Mips64, Mips64r2
};
// Mips architecture version
@@ -90,6 +91,8 @@ public:
/// ABI predicates.
bool isABI_EABI() const { return MipsABI == EABI; }
+ bool isABI_N64() const { return MipsABI == N64; }
+ bool isABI_N32() const { return MipsABI == N32; }
bool isABI_O32() const { return MipsABI == O32; }
unsigned getTargetABI() const { return MipsABI; }
@@ -102,9 +105,11 @@ public:
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
- bool isMips1() const { return MipsArchVersion == Mips1; }
- bool isMips32() const { return MipsArchVersion >= Mips32; }
- bool isMips32r2() const { return MipsArchVersion == Mips32r2; }
+ bool hasMips32() const { return MipsArchVersion >= Mips32; }
+ bool hasMips32r2() const { return MipsArchVersion == Mips32r2 ||
+ MipsArchVersion == Mips64r2; }
+ bool hasMips64() const { return MipsArchVersion >= Mips64; }
+ bool hasMips64r2() const { return MipsArchVersion == Mips64r2; }
bool isLittle() const { return IsLittle; }
bool isFP64bit() const { return IsFP64bit; }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 20b9f4e..6480da3 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -14,13 +14,15 @@
#include "Mips.h"
#include "MipsTargetMachine.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
extern "C" void LLVMInitializeMipsTarget() {
// Register the target.
- RegisterTargetMachine<MipsTargetMachine> X(TheMipsTarget);
+ RegisterTargetMachine<MipsebTargetMachine> X(TheMipsTarget);
RegisterTargetMachine<MipselTargetMachine> Y(TheMipselTarget);
+ RegisterTargetMachine<Mips64ebTargetMachine> A(TheMips64Target);
+ RegisterTargetMachine<Mips64elTargetMachine> B(TheMips64elTarget);
}
// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
@@ -31,30 +33,47 @@ extern "C" void LLVMInitializeMipsTarget() {
// an easier handling.
// Using CodeModel::Large enables different CALL behavior.
MipsTargetMachine::
-MipsTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS,
- bool isLittle=false):
- LLVMTargetMachine(T, TT, CPU, FS),
+MipsTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
+ bool isLittle):
+ LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS, isLittle),
- DataLayout(isLittle ?
- std::string("e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") :
- std::string("E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
+ DataLayout(isLittle ?
+ (Subtarget.isABI_N64() ?
+ "e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
+ "e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") :
+ (Subtarget.isABI_N64() ?
+ "E-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
+ "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
InstrInfo(*this),
FrameLowering(Subtarget),
- TLInfo(*this), TSInfo(*this) {
- // Abicall enables PIC by default
- if (getRelocationModel() == Reloc::Default) {
- if (Subtarget.isABI_O32())
- setRelocationModel(Reloc::PIC_);
- else
- setRelocationModel(Reloc::Static);
- }
+ TLInfo(*this), TSInfo(*this), JITInfo() {
}
+MipsebTargetMachine::
+MipsebTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM) :
+ MipsTargetMachine(T, TT, CPU, FS, RM, CM, false) {}
+
MipselTargetMachine::
-MipselTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS) :
- MipsTargetMachine(T, TT, CPU, FS, true) {}
+MipselTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM) :
+ MipsTargetMachine(T, TT, CPU, FS, RM, CM, true) {}
+
+Mips64ebTargetMachine::
+Mips64ebTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM) :
+ MipsTargetMachine(T, TT, CPU, FS, RM, CM, false) {}
+
+Mips64elTargetMachine::
+Mips64elTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM) :
+ MipsTargetMachine(T, TT, CPU, FS, RM, CM, true) {}
// Install an instruction selector pass using
// the ISelDag to gen Mips code.
@@ -77,7 +96,10 @@ addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel)
bool MipsTargetMachine::
addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
- PM.add(createMipsEmitGPRestorePass(*this));
+ // Do not restore $gp if target is Mips64.
+ // In N32/64, $gp is a callee-saved register.
+ if (!Subtarget.hasMips64())
+ PM.add(createMipsEmitGPRestorePass(*this));
return true;
}
@@ -86,3 +108,12 @@ addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) {
PM.add(createMipsExpandPseudoPass(*this));
return true;
}
+
+bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &JCE) {
+ // Machine code emitter pass for Mips.
+ PM.add(createMipsJITCodeEmitterPass(*this, JCE));
+ return false;
+}
+
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index a021af2..118ed10 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -22,6 +22,7 @@
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "MipsJITInfo.h"
namespace llvm {
class formatted_raw_ostream;
@@ -33,9 +34,12 @@ namespace llvm {
MipsFrameLowering FrameLowering;
MipsTargetLowering TLInfo;
MipsSelectionDAGInfo TSInfo;
+ MipsJITInfo JITInfo;
+
public:
- MipsTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS,
+ MipsTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
bool isLittle);
virtual const MipsInstrInfo *getInstrInfo() const
@@ -46,6 +50,9 @@ namespace llvm {
{ return &Subtarget; }
virtual const TargetData *getTargetData() const
{ return &DataLayout;}
+ virtual MipsJITInfo *getJITInfo()
+ { return &JITInfo; }
+
virtual const MipsRegisterInfo *getRegisterInfo() const {
return &InstrInfo.getRegisterInfo();
@@ -67,16 +74,47 @@ namespace llvm {
virtual bool addPreRegAlloc(PassManagerBase &PM,
CodeGenOpt::Level OptLevel);
virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level);
+ virtual bool addCodeEmitter(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ JITCodeEmitter &JCE);
+
};
-/// MipselTargetMachine - Mipsel target machine.
+/// MipsebTargetMachine - Mips32 big endian target machine.
+///
+class MipsebTargetMachine : public MipsTargetMachine {
+public:
+ MipsebTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
+};
+
+/// MipselTargetMachine - Mips32 little endian target machine.
///
class MipselTargetMachine : public MipsTargetMachine {
public:
- MipselTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ MipselTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
};
+/// Mips64ebTargetMachine - Mips64 big endian target machine.
+///
+class Mips64ebTargetMachine : public MipsTargetMachine {
+public:
+ Mips64ebTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
+};
+
+/// Mips64elTargetMachine - Mips64 little endian target machine.
+///
+class Mips64elTargetMachine : public MipsTargetMachine {
+public:
+ Mips64elTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
+};
} // End llvm namespace
#endif
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index cf5d1b5..05c46f5 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -79,7 +79,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
if (Kind.isMergeable1ByteCString())
return false;
- const Type *Ty = GV->getType()->getElementType();
+ Type *Ty = GV->getType()->getElementType();
return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
}
diff --git a/lib/Target/Mips/TargetInfo/CMakeLists.txt b/lib/Target/Mips/TargetInfo/CMakeLists.txt
index 6e5d56b..5692604 100644
--- a/lib/Target/Mips/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Mips/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMMipsInfo
MipsTargetInfo.cpp
)
-add_dependencies(LLVMMipsInfo MipsCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMMipsInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMMipsInfo MipsCommonTableGen)
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
index a8d6fe9..243632b 100644
--- a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -9,13 +9,23 @@
#include "Mips.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheMipsTarget, llvm::TheMipselTarget;
+Target llvm::TheMips64Target, llvm::TheMips64elTarget;
extern "C" void LLVMInitializeMipsTargetInfo() {
- RegisterTarget<Triple::mips> X(TheMipsTarget, "mips", "Mips");
+ RegisterTarget<Triple::mips,
+ /*HasJIT=*/true> X(TheMipsTarget, "mips", "Mips");
- RegisterTarget<Triple::mipsel> Y(TheMipselTarget, "mipsel", "Mipsel");
+ RegisterTarget<Triple::mipsel,
+ /*HasJIT=*/true> Y(TheMipselTarget, "mipsel", "Mipsel");
+
+ RegisterTarget<Triple::mips64,
+ /*HasJIT=*/false> A(TheMips64Target, "mips64", "Mips64 [experimental]");
+
+ RegisterTarget<Triple::mips64el,
+ /*HasJIT=*/false> B(TheMips64elTarget,
+ "mips64el", "Mips64el [experimental]");
}
diff --git a/lib/Target/PTX/CMakeLists.txt b/lib/Target/PTX/CMakeLists.txt
index ce08916..6e87b17 100644
--- a/lib/Target/PTX/CMakeLists.txt
+++ b/lib/Target/PTX/CMakeLists.txt
@@ -1,24 +1,44 @@
set(LLVM_TARGET_DEFINITIONS PTX.td)
-tablegen(PTXGenAsmWriter.inc -gen-asm-writer)
-tablegen(PTXGenCallingConv.inc -gen-callingconv)
-tablegen(PTXGenDAGISel.inc -gen-dag-isel)
-tablegen(PTXGenInstrInfo.inc -gen-instr-info)
-tablegen(PTXGenRegisterInfo.inc -gen-register-info)
-tablegen(PTXGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(PTXGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(PTXGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(PTXGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(PTXGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(PTXGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(PTXCommonTableGen)
add_llvm_target(PTXCodeGen
PTXAsmPrinter.cpp
PTXISelDAGToDAG.cpp
PTXISelLowering.cpp
PTXInstrInfo.cpp
+ PTXFPRoundingModePass.cpp
PTXFrameLowering.cpp
PTXMCAsmStreamer.cpp
+ PTXMCInstLower.cpp
PTXMFInfoExtract.cpp
+ PTXParamManager.cpp
+ PTXRegAlloc.cpp
PTXRegisterInfo.cpp
+ PTXSelectionDAGInfo.cpp
PTXSubtarget.cpp
PTXTargetMachine.cpp
)
+add_llvm_library_dependencies(LLVMPTXCodeGen
+ LLVMAnalysis
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMPTXDesc
+ LLVMPTXInfo
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(TargetInfo)
+add_subdirectory(InstPrinter)
add_subdirectory(MCTargetDesc)
+
diff --git a/lib/Target/PTX/InstPrinter/CMakeLists.txt b/lib/Target/PTX/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000..029d060
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/CMakeLists.txt
@@ -0,0 +1,13 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPTXAsmPrinter
+ PTXInstPrinter.cpp
+ )
+
+add_dependencies(LLVMPTXAsmPrinter PTXCommonTableGen)
+
+add_llvm_library_dependencies(LLVMPTXAsmPrinter
+ LLVMMC
+ LLVMSupport
+ )
+
diff --git a/lib/Target/PTX/InstPrinter/Makefile b/lib/Target/PTX/InstPrinter/Makefile
new file mode 100644
index 0000000..0ccfe44
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/PTX/InstPrinter/Makefile -----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPTXAsmPrinter
+
+# Hack: we need to include 'main' ptx target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
new file mode 100644
index 0000000..aabb404
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/PTXInstPrinter.cpp
@@ -0,0 +1,192 @@
+//===-- PTXInstPrinter.cpp - Convert PTX MCInst to assembly syntax --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PTX MCInst to a .ptx file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "PTXInstPrinter.h"
+#include "MCTargetDesc/PTXBaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define GET_INSTRUCTION_NAME
+#include "PTXGenAsmWriter.inc"
+
+PTXInstPrinter::PTXInstPrinter(const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) :
+ MCInstPrinter(MAI) {
+ // Initialize the set of available features.
+ setAvailableFeatures(STI.getFeatureBits());
+}
+
+StringRef PTXInstPrinter::getOpcodeName(unsigned Opcode) const {
+ return getInstructionName(Opcode);
+}
+
+void PTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ OS << getRegisterName(RegNo);
+}
+
+void PTXInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
+ printPredicate(MI, O);
+ switch (MI->getOpcode()) {
+ default:
+ printInstruction(MI, O);
+ break;
+ case PTX::CALL:
+ printCall(MI, O);
+ }
+ O << ";";
+ printAnnotation(O, Annot);
+}
+
+void PTXInstPrinter::printPredicate(const MCInst *MI, raw_ostream &O) {
+  // The predicate operands are normally the last two operands; for CALL
+  // they are the first two.
+ int RegIndex;
+ int OpIndex;
+
+ if (MI->getOpcode() == PTX::CALL) {
+ RegIndex = 0;
+ OpIndex = 1;
+ } else {
+ RegIndex = MI->getNumOperands()-2;
+ OpIndex = MI->getNumOperands()-1;
+ }
+
+ int PredOp = MI->getOperand(OpIndex).getImm();
+ if (PredOp == PTXPredicate::None)
+ return;
+
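+  // Print the guard: '!' for a negated predicate, '@' for a normal one,
+  // followed by the predicate register itself.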
+ if (PredOp == PTXPredicate::Negate)
+ O << '!';
+ else
+ O << '@';
+
+ printOperand(MI, RegIndex, O);
+}
+
+void PTXInstPrinter::printCall(const MCInst *MI, raw_ostream &O) {
+ O << "\tcall.uni\t";
+  // The first two operands hold the predicate, which printPredicate has
+  // already emitted.
+ unsigned Index = 2;
+ unsigned NumRets = MI->getOperand(Index++).getImm();
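+  // The printed form is e.g. "call.uni (%ret0), target, (%arg0, %arg1);",
+  // with the return list omitted when there are no return values.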
+
+ if (NumRets > 0) {
+ O << "(";
+ printOperand(MI, Index++, O);
+ for (unsigned i = 1; i < NumRets; ++i) {
+ O << ", ";
+ printOperand(MI, Index++, O);
+ }
+ O << "), ";
+ }
+
+ O << *(MI->getOperand(Index++).getExpr()) << ", (";
+
+ unsigned NumArgs = MI->getOperand(Index++).getImm();
+ if (NumArgs > 0) {
+ printOperand(MI, Index++, O);
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ O << ", ";
+ printOperand(MI, Index++, O);
+ }
+ }
+ O << ")";
+}
+
+void PTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Op.isImm()) {
+ O << Op.getImm();
+ } else if (Op.isFPImm()) {
+ double Imm = Op.getFPImm();
+ APFloat FPImm(Imm);
+ APInt FPIntImm = FPImm.bitcastToAPInt();
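+    // PTX spells a double immediate as "0D" followed by all 16 hex digits,
+    // e.g. 1.0 becomes 0D3FF0000000000000.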
+ O << "0D";
+ // PTX requires us to output the full 64 bits, even if the number is zero
+ if (FPIntImm.getZExtValue() > 0) {
+ O << FPIntImm.toString(16, false);
+ } else {
+ O << "0000000000000000";
+ }
+ } else {
+ assert(Op.isExpr() && "unknown operand kind in printOperand");
+ const MCExpr *Expr = Op.getExpr();
+ if (const MCSymbolRefExpr *SymRefExpr = dyn_cast<MCSymbolRefExpr>(Expr)) {
+ const MCSymbol &Sym = SymRefExpr->getSymbol();
+ O << Sym.getName();
+ } else {
+ O << *Op.getExpr();
+ }
+ }
+}
+
+void PTXInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ // By definition, operand OpNo+1 is an i32imm
+ const MCOperand &Op2 = MI->getOperand(OpNo+1);
+ printOperand(MI, OpNo, O);
+ if (Op2.getImm() == 0)
+ return; // don't print "+0"
+ O << "+" << Op2.getImm();
+}
+
+void PTXInstPrinter::printRoundingMode(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm() && "Rounding modes must be immediate values");
+ switch (Op.getImm()) {
+ default:
+ llvm_unreachable("Unknown rounding mode!");
+ case PTXRoundingMode::RndDefault:
+ llvm_unreachable("FP rounding-mode pass did not handle instruction!");
+ break;
+ case PTXRoundingMode::RndNone:
+ // Do not print anything.
+ break;
+ case PTXRoundingMode::RndNearestEven:
+ O << ".rn";
+ break;
+ case PTXRoundingMode::RndTowardsZero:
+ O << ".rz";
+ break;
+ case PTXRoundingMode::RndNegInf:
+ O << ".rm";
+ break;
+ case PTXRoundingMode::RndPosInf:
+ O << ".rp";
+ break;
+ case PTXRoundingMode::RndApprox:
+ O << ".approx";
+ break;
+ case PTXRoundingMode::RndNearestEvenInt:
+ O << ".rni";
+ break;
+ case PTXRoundingMode::RndTowardsZeroInt:
+ O << ".rzi";
+ break;
+ case PTXRoundingMode::RndNegInfInt:
+ O << ".rmi";
+ break;
+ case PTXRoundingMode::RndPosInfInt:
+ O << ".rpi";
+ break;
+ }
+}
+
diff --git a/lib/Target/PTX/InstPrinter/PTXInstPrinter.h b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
new file mode 100644
index 0000000..86dfd48
--- /dev/null
+++ b/lib/Target/PTX/InstPrinter/PTXInstPrinter.h
@@ -0,0 +1,47 @@
+//===-- PTXInstPrinter.h - Convert PTX MCInst to assembly syntax ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a PTX MCInst to a .ptx file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXINSTPRINTER_H
+#define PTXINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class PTXInstPrinter : public MCInstPrinter {
+public:
+ PTXInstPrinter(const MCAsmInfo &MAI, const MCSubtargetInfo &STI);
+
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+ virtual StringRef getOpcodeName(unsigned Opcode) const;
+ virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+
+ static const char *getInstructionName(unsigned Opcode);
+
+ // Autogenerated by tblgen.
+ void printInstruction(const MCInst *MI, raw_ostream &O);
+ static const char *getRegisterName(unsigned RegNo);
+
+ void printPredicate(const MCInst *MI, raw_ostream &O);
+ void printCall(const MCInst *MI, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printRoundingMode(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+}
+
+#endif
+
diff --git a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
index df0f63f..811ef4b 100644
--- a/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PTX/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,12 @@ add_llvm_library(LLVMPTXDesc
PTXMCTargetDesc.cpp
PTXMCAsmInfo.cpp
)
+
+add_llvm_library_dependencies(LLVMPTXDesc
+ LLVMMC
+ LLVMPTXInfo
+ LLVMPTXAsmPrinter
+ LLVMSupport
+ )
+
+add_dependencies(LLVMPTXDesc PTXCommonTableGen)
diff --git a/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
new file mode 100644
index 0000000..c6094be
--- /dev/null
+++ b/lib/Target/PTX/MCTargetDesc/PTXBaseInfo.h
@@ -0,0 +1,63 @@
+//===-- PTXBaseInfo.h - Top level definitions for PTX -------- --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the PTX target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXBASEINFO_H
+#define PTXBASEINFO_H
+
+#include "PTXMCTargetDesc.h"
+
+namespace llvm {
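+  // These state-space numbers double as LLVM address-space numbers on
+  // pointer types (see the .ptr handling in PTXAsmPrinter).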
+ namespace PTXStateSpace {
+ enum {
+ Global = 0, // default to global state space
+ Constant = 1,
+ Local = 2,
+ Parameter = 3,
+ Shared = 4
+ };
+ } // namespace PTXStateSpace
+
+ namespace PTXPredicate {
+ enum {
+ Normal = 0,
+ Negate = 1,
+ None = 2
+ };
+ } // namespace PTXPredicate
+
+ /// Namespace to hold all target-specific flags.
+ namespace PTXRoundingMode {
+ // Instruction Flags
+ enum {
+ // Rounding Mode Flags
+ RndMask = 15,
+ RndDefault = 0, // ---
+ RndNone = 1, // <NONE>
+ RndNearestEven = 2, // .rn
+ RndTowardsZero = 3, // .rz
+ RndNegInf = 4, // .rm
+ RndPosInf = 5, // .rp
+ RndApprox = 6, // .approx
+ RndNearestEvenInt = 7, // .rni
+ RndTowardsZeroInt = 8, // .rzi
+ RndNegInfInt = 9, // .rmi
+ RndPosInfInt = 10 // .rpi
+ };
+  } // namespace PTXRoundingMode
+} // namespace llvm
+
+#endif
+
diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
index 23f70bd..a5af3b8 100644
--- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
+++ b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp
@@ -13,10 +13,12 @@
#include "PTXMCTargetDesc.h"
#include "PTXMCAsmInfo.h"
+#include "InstPrinter/PTXInstPrinter.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "PTXGenInstrInfo.inc"
@@ -35,9 +37,11 @@ static MCInstrInfo *createPTXMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializePTXMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(ThePTX32Target, createPTXMCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(ThePTX64Target, createPTXMCInstrInfo);
+static MCRegisterInfo *createPTXMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ // PTX does not have a return address register.
+ InitPTXMCRegisterInfo(X, 0);
+ return X;
}
static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -47,14 +51,45 @@ static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializePTXMCSubtargetInfo() {
+static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
+}
+
+static MCInstPrinter *createPTXMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) {
+ assert(SyntaxVariant == 0 && "We only have one syntax variant");
+ return new PTXInstPrinter(MAI, STI);
+}
+
+extern "C" void LLVMInitializePTXTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfo<PTXMCAsmInfo> X(ThePTX32Target);
+ RegisterMCAsmInfo<PTXMCAsmInfo> Y(ThePTX64Target);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(ThePTX32Target, createPTXMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(ThePTX64Target, createPTXMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(ThePTX32Target, createPTXMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(ThePTX64Target, createPTXMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(ThePTX32Target, createPTXMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(ThePTX64Target, createPTXMCRegisterInfo);
+
+ // Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(ThePTX32Target,
createPTXMCSubtargetInfo);
TargetRegistry::RegisterMCSubtargetInfo(ThePTX64Target,
createPTXMCSubtargetInfo);
-}
-extern "C" void LLVMInitializePTXMCAsmInfo() {
- RegisterMCAsmInfo<PTXMCAsmInfo> X(ThePTX32Target);
- RegisterMCAsmInfo<PTXMCAsmInfo> Y(ThePTX64Target);
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(ThePTX32Target, createPTXMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(ThePTX64Target, createPTXMCInstPrinter);
}
diff --git a/lib/Target/PTX/Makefile b/lib/Target/PTX/Makefile
index 93dd38a..fa09634 100644
--- a/lib/Target/PTX/Makefile
+++ b/lib/Target/PTX/Makefile
@@ -13,12 +13,11 @@ TARGET = PTX
# Make sure that tblgen is run, first thing.
BUILT_SOURCES = PTXGenAsmWriter.inc \
- PTXGenCallingConv.inc \
PTXGenDAGISel.inc \
PTXGenInstrInfo.inc \
PTXGenRegisterInfo.inc \
PTXGenSubtargetInfo.inc
-DIRS = TargetInfo MCTargetDesc
+DIRS = InstPrinter TargetInfo MCTargetDesc
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PTX/PTX.h b/lib/Target/PTX/PTX.h
index 28cab24..7d46cce 100644
--- a/lib/Target/PTX/PTX.h
+++ b/lib/Target/PTX/PTX.h
@@ -15,34 +15,30 @@
#ifndef PTX_H
#define PTX_H
-#include "MCTargetDesc/PTXMCTargetDesc.h"
+#include "MCTargetDesc/PTXBaseInfo.h"
#include "llvm/Target/TargetMachine.h"
namespace llvm {
+ class MachineInstr;
+ class MCInst;
+ class PTXAsmPrinter;
class PTXTargetMachine;
class FunctionPass;
- namespace PTX {
- enum StateSpace {
- GLOBAL = 0, // default to global state space
- CONSTANT = 1,
- LOCAL = 2,
- PARAMETER = 3,
- SHARED = 4
- };
-
- enum Predicate {
- PRED_NORMAL = 0,
- PRED_NEGATE = 1
- };
- } // namespace PTX
-
FunctionPass *createPTXISelDag(PTXTargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createPTXMFInfoExtract(PTXTargetMachine &TM,
CodeGenOpt::Level OptLevel);
+ FunctionPass *createPTXFPRoundingModePass(PTXTargetMachine &TM,
+ CodeGenOpt::Level OptLevel);
+
+ FunctionPass *createPTXRegisterAllocator();
+
+ void LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ PTXAsmPrinter &AP);
+
} // namespace llvm;
#endif // PTX_H
diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td
index f6fbe9f..693bb9c 100644
--- a/lib/Target/PTX/PTX.td
+++ b/lib/Target/PTX/PTX.td
@@ -52,13 +52,13 @@ def FeatureSM12 : SubtargetFeature<"sm12", "PTXTarget", "PTX_SM_1_2",
def FeatureSM13 : SubtargetFeature<"sm13", "PTXTarget", "PTX_SM_1_3",
"Use Shader Model 1.3">;
def FeatureSM20 : SubtargetFeature<"sm20", "PTXTarget", "PTX_SM_2_0",
- "Use Shader Model 2.0">;
+ "Use Shader Model 2.0", [FeatureDouble]>;
def FeatureSM21 : SubtargetFeature<"sm21", "PTXTarget", "PTX_SM_2_1",
- "Use Shader Model 2.1">;
+ "Use Shader Model 2.1", [FeatureDouble]>;
def FeatureSM22 : SubtargetFeature<"sm22", "PTXTarget", "PTX_SM_2_2",
- "Use Shader Model 2.2">;
+ "Use Shader Model 2.2", [FeatureDouble]>;
def FeatureSM23 : SubtargetFeature<"sm23", "PTXTarget", "PTX_SM_2_3",
- "Use Shader Model 2.3">;
+ "Use Shader Model 2.3", [FeatureDouble]>;
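+// Shader Model 2.x hardware supports double precision, hence the added
+// FeatureDouble dependency on each SM 2.x feature.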
def FeatureCOMPUTE10 : SubtargetFeature<"compute10", "PTXTarget",
"PTX_COMPUTE_1_0",
@@ -74,7 +74,8 @@ def FeatureCOMPUTE13 : SubtargetFeature<"compute13", "PTXTarget",
"Use Compute Compatibility 1.3">;
def FeatureCOMPUTE20 : SubtargetFeature<"compute20", "PTXTarget",
"PTX_COMPUTE_2_0",
- "Use Compute Compatibility 2.0">;
+ "Use Compute Compatibility 2.0",
+ [FeatureDouble]>;
//===----------------------------------------------------------------------===//
// PTX supported processors
@@ -113,12 +114,6 @@ def : Proc<"fermi", [FeatureSM20, FeatureDouble]>;
include "PTXRegisterInfo.td"
//===----------------------------------------------------------------------===//
-// Calling Conventions
-//===----------------------------------------------------------------------===//
-
-include "PTXCallingConv.td"
-
-//===----------------------------------------------------------------------===//
// Instruction Descriptions
//===----------------------------------------------------------------------===//
@@ -127,9 +122,20 @@ include "PTXInstrInfo.td"
def PTXInstrInfo : InstrInfo;
//===----------------------------------------------------------------------===//
+// Assembly printer
+//===----------------------------------------------------------------------===//
+// PTX uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def PTXAsmWriter : AsmWriter {
+ string AsmWriterClassName = "InstPrinter";
+ bit isMCAsmWriter = 1;
+}
+
+//===----------------------------------------------------------------------===//
// Target Declaration
//===----------------------------------------------------------------------===//
def PTX : Target {
let InstructionSet = PTXInstrInfo;
+ let AssemblyWriters = [PTXAsmWriter];
}
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp
index 2848d54..733744b 100644
--- a/lib/Target/PTX/PTXAsmPrinter.cpp
+++ b/lib/Target/PTX/PTXAsmPrinter.cpp
@@ -15,9 +15,14 @@
#define DEBUG_TYPE "ptx-asm-printer"
#include "PTX.h"
+#include "PTXAsmPrinter.h"
#include "PTXMachineFunctionInfo.h"
+#include "PTXParamManager.h"
+#include "PTXRegisterInfo.h"
#include "PTXTargetMachine.h"
+#include "llvm/Argument.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
#include "llvm/Module.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -28,69 +33,32 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-namespace {
-class PTXAsmPrinter : public AsmPrinter {
-public:
- explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
- : AsmPrinter(TM, Streamer) {}
-
- const char *getPassName() const { return "PTX Assembly Printer"; }
-
- bool doFinalization(Module &M);
-
- virtual void EmitStartOfAsmFile(Module &M);
-
- virtual bool runOnMachineFunction(MachineFunction &MF);
-
- virtual void EmitFunctionBodyStart();
- virtual void EmitFunctionBodyEnd() { OutStreamer.EmitRawText(Twine("}")); }
-
- virtual void EmitInstruction(const MachineInstr *MI);
-
- void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
- void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
- const char *Modifier = 0);
- void printParamOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
- const char *Modifier = 0);
- void printReturnOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
- const char *Modifier = 0);
- void printPredicateOperand(const MachineInstr *MI, raw_ostream &O);
-
- unsigned GetOrCreateSourceID(StringRef FileName,
- StringRef DirName);
-
- // autogen'd.
- void printInstruction(const MachineInstr *MI, raw_ostream &OS);
- static const char *getRegisterName(unsigned RegNo);
-
-private:
- void EmitVariableDeclaration(const GlobalVariable *gv);
- void EmitFunctionDeclaration();
-
- StringMap<unsigned> SourceIdMap;
-}; // class PTXAsmPrinter
-} // namespace
-
static const char PARAM_PREFIX[] = "__param_";
static const char RETURN_PREFIX[] = "__ret_";
-static const char *getRegisterTypeName(unsigned RegNo) {
-#define TEST_REGCLS(cls, clsstr) \
- if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr;
+static const char *getRegisterTypeName(unsigned RegNo,
+ const MachineRegisterInfo& MRI) {
+ const TargetRegisterClass *TRC = MRI.getRegClass(RegNo);
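+  // Map the register's class to the PTX type suffix used in ".reg"
+  // declarations (pred, b16, b32, b64, f32, f64).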
+
+#define TEST_REGCLS(cls, clsstr) \
+ if (PTX::cls ## RegisterClass == TRC) return # clsstr;
+
TEST_REGCLS(RegPred, pred);
TEST_REGCLS(RegI16, b16);
TEST_REGCLS(RegI32, b32);
@@ -106,16 +74,16 @@ static const char *getRegisterTypeName(unsigned RegNo) {
static const char *getStateSpaceName(unsigned addressSpace) {
switch (addressSpace) {
default: llvm_unreachable("Unknown state space");
- case PTX::GLOBAL: return "global";
- case PTX::CONSTANT: return "const";
- case PTX::LOCAL: return "local";
- case PTX::PARAMETER: return "param";
- case PTX::SHARED: return "shared";
+ case PTXStateSpace::Global: return "global";
+ case PTXStateSpace::Constant: return "const";
+ case PTXStateSpace::Local: return "local";
+ case PTXStateSpace::Parameter: return "param";
+ case PTXStateSpace::Shared: return "shared";
}
return NULL;
}
-static const char *getTypeName(const Type* type) {
+static const char *getTypeName(Type* type) {
while (true) {
switch (type->getTypeID()) {
default: llvm_unreachable("Unknown type");
@@ -130,7 +98,7 @@ static const char *getTypeName(const Type* type) {
}
case Type::ArrayTyID:
case Type::PointerTyID:
- type = dyn_cast<const SequentialType>(type)->getElementType();
+ type = dyn_cast<SequentialType>(type)->getElementType();
break;
}
}
@@ -170,6 +138,7 @@ void PTXAsmPrinter::EmitStartOfAsmFile(Module &M)
{
const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
+ // Emit the PTX .version and .target attributes
OutStreamer.EmitRawText(Twine("\t.version " + ST.getPTXVersionString()));
OutStreamer.EmitRawText(Twine("\t.target " + ST.getTargetString() +
(ST.supportsDouble() ? ""
@@ -203,177 +172,118 @@ void PTXAsmPrinter::EmitStartOfAsmFile(Module &M)
EmitVariableDeclaration(i);
}
-bool PTXAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
- SetupMachineFunction(MF);
- EmitFunctionDeclaration();
- EmitFunctionBody();
- return false;
-}
-
void PTXAsmPrinter::EmitFunctionBodyStart() {
OutStreamer.EmitRawText(Twine("{"));
const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+ const PTXParamManager &PM = MFI->getParamManager();
+
+ // Print register definitions
+ std::string regDefs;
+ unsigned numRegs;
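+
+  // Each class becomes one parameterized bank declaration, e.g.
+  // ".reg .pred %p<N>;" defines predicates %p0 through %p(N-1).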
+
+ // pred
+ numRegs = MFI->getNumRegistersForClass(PTX::RegPredRegisterClass);
+ if(numRegs > 0) {
+ regDefs += "\t.reg .pred %p<";
+ regDefs += utostr(numRegs);
+ regDefs += ">;\n";
+ }
+
+ // i16
+ numRegs = MFI->getNumRegistersForClass(PTX::RegI16RegisterClass);
+ if(numRegs > 0) {
+ regDefs += "\t.reg .b16 %rh<";
+ regDefs += utostr(numRegs);
+ regDefs += ">;\n";
+ }
+
+ // i32
+ numRegs = MFI->getNumRegistersForClass(PTX::RegI32RegisterClass);
+ if(numRegs > 0) {
+ regDefs += "\t.reg .b32 %r<";
+ regDefs += utostr(numRegs);
+ regDefs += ">;\n";
+ }
+
+ // i64
+ numRegs = MFI->getNumRegistersForClass(PTX::RegI64RegisterClass);
+ if(numRegs > 0) {
+ regDefs += "\t.reg .b64 %rd<";
+ regDefs += utostr(numRegs);
+ regDefs += ">;\n";
+ }
+
+ // f32
+ numRegs = MFI->getNumRegistersForClass(PTX::RegF32RegisterClass);
+ if(numRegs > 0) {
+ regDefs += "\t.reg .f32 %f<";
+ regDefs += utostr(numRegs);
+ regDefs += ">;\n";
+ }
+
+ // f64
+ numRegs = MFI->getNumRegistersForClass(PTX::RegF64RegisterClass);
+ if(numRegs > 0) {
+ regDefs += "\t.reg .f64 %fd<";
+ regDefs += utostr(numRegs);
+ regDefs += ">;\n";
+ }
- // Print local variable definition
- for (PTXMachineFunctionInfo::reg_iterator
- i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd(); i != e; ++ i) {
- unsigned reg = *i;
-
- std::string def = "\t.reg .";
- def += getRegisterTypeName(reg);
- def += ' ';
- def += getRegisterName(reg);
- def += ';';
- OutStreamer.EmitRawText(Twine(def));
+ // Local params
+ for (PTXParamManager::param_iterator i = PM.local_begin(), e = PM.local_end();
+ i != e; ++i) {
+ regDefs += "\t.param .b";
+ regDefs += utostr(PM.getParamSize(*i));
+ regDefs += " ";
+ regDefs += PM.getParamName(*i);
+ regDefs += ";\n";
}
+ OutStreamer.EmitRawText(Twine(regDefs));
+
+
const MachineFrameInfo* FrameInfo = MF->getFrameInfo();
DEBUG(dbgs() << "Have " << FrameInfo->getNumObjects()
<< " frame object(s)\n");
for (unsigned i = 0, e = FrameInfo->getNumObjects(); i != e; ++i) {
DEBUG(dbgs() << "Size of object: " << FrameInfo->getObjectSize(i) << "\n");
if (FrameInfo->getObjectSize(i) > 0) {
- std::string def = "\t.reg .b";
- def += utostr(FrameInfo->getObjectSize(i)*8); // Convert to bits
- def += " s";
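+      // Emit the object as a byte array in local space, e.g.
+      // ".local .align 4 .b8 __local0[16];".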
+ std::string def = "\t.local .align ";
+ def += utostr(FrameInfo->getObjectAlignment(i));
+ def += " .b8";
+ def += " __local";
def += utostr(i);
+ def += "[";
+      def += utostr(FrameInfo->getObjectSize(i)); // Size in bytes
+ def += "]";
def += ";";
OutStreamer.EmitRawText(Twine(def));
}
}
-}
-
-void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
- std::string str;
- str.reserve(64);
-
- raw_string_ostream OS(str);
-
- DebugLoc DL = MI->getDebugLoc();
- if (!DL.isUnknown()) {
-
- const MDNode *S = DL.getScope(MF->getFunction()->getContext());
-
- // This is taken from DwarfDebug.cpp, which is conveniently not a public
- // LLVM class.
- StringRef Fn;
- StringRef Dir;
- unsigned Src = 1;
- if (S) {
- DIDescriptor Scope(S);
- if (Scope.isCompileUnit()) {
- DICompileUnit CU(S);
- Fn = CU.getFilename();
- Dir = CU.getDirectory();
- } else if (Scope.isFile()) {
- DIFile F(S);
- Fn = F.getFilename();
- Dir = F.getDirectory();
- } else if (Scope.isSubprogram()) {
- DISubprogram SP(S);
- Fn = SP.getFilename();
- Dir = SP.getDirectory();
- } else if (Scope.isLexicalBlock()) {
- DILexicalBlock DB(S);
- Fn = DB.getFilename();
- Dir = DB.getDirectory();
- } else
- assert(0 && "Unexpected scope info");
-
- Src = GetOrCreateSourceID(Fn, Dir);
- }
- OutStreamer.EmitDwarfLocDirective(Src, DL.getLine(), DL.getCol(),
- 0, 0, 0, Fn);
-
- const MCDwarfLoc& MDL = OutContext.getCurrentDwarfLoc();
-
- OS << "\t.loc ";
- OS << utostr(MDL.getFileNum());
- OS << " ";
- OS << utostr(MDL.getLine());
- OS << " ";
- OS << utostr(MDL.getColumn());
- OS << "\n";
- }
-
-
- // Emit predicate
- printPredicateOperand(MI, OS);
-
- // Write instruction to str
- printInstruction(MI, OS);
- OS << ';';
- OS.flush();
-
- StringRef strref = StringRef(str);
- OutStreamer.EmitRawText(strref);
-}
-
-void PTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
- raw_ostream &OS) {
- const MachineOperand &MO = MI->getOperand(opNum);
-
- switch (MO.getType()) {
- default:
- llvm_unreachable("<unknown operand type>");
- break;
- case MachineOperand::MO_GlobalAddress:
- OS << *Mang->getSymbol(MO.getGlobal());
- break;
- case MachineOperand::MO_Immediate:
- OS << (long) MO.getImm();
- break;
- case MachineOperand::MO_MachineBasicBlock:
- OS << *MO.getMBB()->getSymbol();
- break;
- case MachineOperand::MO_Register:
- OS << getRegisterName(MO.getReg());
- break;
- case MachineOperand::MO_FPImmediate:
- APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt();
- bool isFloat = MO.getFPImm()->getType()->getTypeID() == Type::FloatTyID;
- // Emit 0F for 32-bit floats and 0D for 64-bit doubles.
- if (isFloat) {
- OS << "0F";
- }
- else {
- OS << "0D";
- }
- // Emit the encoded floating-point value.
- if (constFP.getZExtValue() > 0) {
- OS << constFP.toString(16, false);
- }
- else {
- OS << "00000000";
- // If We have a double-precision zero, pad to 8-bytes.
- if (!isFloat) {
- OS << "00000000";
- }
- }
- break;
- }
-}
-
-void PTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
- raw_ostream &OS, const char *Modifier) {
- printOperand(MI, opNum, OS);
- if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0)
- return; // don't print "+0"
-
- OS << "+";
- printOperand(MI, opNum+1, OS);
+ //unsigned Index = 1;
+ // Print parameter passing params
+ //for (PTXMachineFunctionInfo::param_iterator
+ // i = MFI->paramBegin(), e = MFI->paramEnd(); i != e; ++i) {
+ // std::string def = "\t.param .b";
+ // def += utostr(*i);
+ // def += " __ret_";
+ // def += utostr(Index);
+ // Index++;
+ // def += ";";
+ // OutStreamer.EmitRawText(Twine(def));
+ //}
}
-void PTXAsmPrinter::printParamOperand(const MachineInstr *MI, int opNum,
- raw_ostream &OS, const char *Modifier) {
- OS << PARAM_PREFIX << (int) MI->getOperand(opNum).getImm() + 1;
+void PTXAsmPrinter::EmitFunctionBodyEnd() {
+ OutStreamer.EmitRawText(Twine("}"));
}
-void PTXAsmPrinter::printReturnOperand(const MachineInstr *MI, int opNum,
- raw_ostream &OS, const char *Modifier) {
- OS << RETURN_PREFIX << (int) MI->getOperand(opNum).getImm() + 1;
+void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ MCInst TmpInst;
+ LowerPTXMachineInstrToMCInst(MI, TmpInst, *this);
+ OutStreamer.EmitInstruction(TmpInst);
}
void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
@@ -400,14 +310,14 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
unsigned alignment = gv->getAlignment();
if (alignment != 0) {
decl += ".align ";
- decl += utostr(Log2_32(gv->getAlignment()));
+ decl += utostr(gv->getAlignment());
decl += " ";
}
if (PointerType::classof(gv->getType())) {
- const PointerType* pointerTy = dyn_cast<const PointerType>(gv->getType());
- const Type* elementTy = pointerTy->getElementType();
+ PointerType* pointerTy = dyn_cast<PointerType>(gv->getType());
+ Type* elementTy = pointerTy->getElementType();
decl += ".b8 ";
decl += gvsym->getName();
@@ -417,14 +327,14 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
{
assert(elementTy->isArrayTy() && "Only pointers to arrays are supported");
- const ArrayType* arrayTy = dyn_cast<const ArrayType>(elementTy);
+ ArrayType* arrayTy = dyn_cast<ArrayType>(elementTy);
elementTy = arrayTy->getElementType();
unsigned numElements = arrayTy->getNumElements();
while (elementTy->isArrayTy()) {
- arrayTy = dyn_cast<const ArrayType>(elementTy);
+ arrayTy = dyn_cast<ArrayType>(elementTy);
elementTy = arrayTy->getElementType();
numElements *= arrayTy->getNumElements();
@@ -447,7 +357,7 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
if (gv->hasInitializer())
{
- const Constant *C = gv->getInitializer();
+ const Constant *C = gv->getInitializer();
if (const ConstantArray *CA = dyn_cast<ConstantArray>(C))
{
decl += " = {";
@@ -484,7 +394,7 @@ void PTXAsmPrinter::EmitVariableDeclaration(const GlobalVariable *gv) {
OutStreamer.AddBlankLine();
}
-void PTXAsmPrinter::EmitFunctionDeclaration() {
+void PTXAsmPrinter::EmitFunctionEntryLabel() {
// The function label could have already been emitted if two symbols end up
// conflicting due to asm renaming. Detect this and emit an error.
if (!CurrentFnSym->isUndefined()) {
@@ -494,25 +404,39 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
}
const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+ const PTXParamManager &PM = MFI->getParamManager();
const bool isKernel = MFI->isKernel();
const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
+ const MachineRegisterInfo& MRI = MF->getRegInfo();
std::string decl = isKernel ? ".entry" : ".func";
- unsigned cnt = 0;
-
if (!isKernel) {
decl += " (";
- for (PTXMachineFunctionInfo::ret_iterator
- i = MFI->retRegBegin(), e = MFI->retRegEnd(), b = i;
- i != e; ++i) {
- if (i != b) {
- decl += ", ";
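+    // Return values are declared either in .param space or as registers,
+    // depending on how the subtarget passes device-function results.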
+ if (ST.useParamSpaceForDeviceArgs()) {
+ for (PTXParamManager::param_iterator i = PM.ret_begin(), e = PM.ret_end(),
+ b = i; i != e; ++i) {
+ if (i != b) {
+ decl += ", ";
+ }
+
+ decl += ".param .b";
+ decl += utostr(PM.getParamSize(*i));
+ decl += " ";
+ decl += PM.getParamName(*i);
+ }
+ } else {
+ for (PTXMachineFunctionInfo::reg_iterator
+ i = MFI->retreg_begin(), e = MFI->retreg_end(), b = i;
+ i != e; ++i) {
+ if (i != b) {
+ decl += ", ";
+ }
+ decl += ".reg .";
+ decl += getRegisterTypeName(*i, MRI);
+ decl += " ";
+ decl += MFI->getRegisterName(*i);
}
- decl += ".reg .";
- decl += getRegisterTypeName(*i);
- decl += " ";
- decl += getRegisterName(*i);
}
decl += ")";
}
@@ -523,26 +447,65 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
decl += " (";
- cnt = 0;
+ const Function *F = MF->getFunction();
// Print parameters
- for (PTXMachineFunctionInfo::reg_iterator
- i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i;
- i != e; ++i) {
- if (i != b) {
- decl += ", ";
- }
- if (isKernel || ST.useParamSpaceForDeviceArgs()) {
+ if (isKernel || ST.useParamSpaceForDeviceArgs()) {
+ /*for (PTXParamManager::param_iterator i = PM.arg_begin(), e = PM.arg_end(),
+ b = i; i != e; ++i) {
+ if (i != b) {
+ decl += ", ";
+ }
+
decl += ".param .b";
- decl += utostr(*i);
+ decl += utostr(PM.getParamSize(*i));
decl += " ";
- decl += PARAM_PREFIX;
- decl += utostr(++cnt);
- } else {
+ decl += PM.getParamName(*i);
+ }*/
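+  // Each kernel argument is declared as e.g. ".param .b64 .ptr .global
+  // __param_1"; the .ptr attribute is emitted only for pointer arguments
+  // on subtargets that support it.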
+ int Counter = 1;
+ for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end(),
+ b = i; i != e; ++i) {
+ if (i != b)
+ decl += ", ";
+ const Type *ArgType = (*i).getType();
+ decl += ".param .b";
+ if (ArgType->isPointerTy()) {
+ if (ST.is64Bit())
+ decl += "64";
+ else
+ decl += "32";
+ } else {
+ decl += utostr(ArgType->getPrimitiveSizeInBits());
+ }
+ if (ArgType->isPointerTy() && ST.emitPtrAttribute()) {
+ const PointerType *PtrType = dyn_cast<const PointerType>(ArgType);
+ decl += " .ptr";
+ switch (PtrType->getAddressSpace()) {
+ default:
+ llvm_unreachable("Unknown address space in argument");
+ case PTXStateSpace::Global:
+ decl += " .global";
+ break;
+ case PTXStateSpace::Shared:
+ decl += " .shared";
+ break;
+ }
+ }
+ decl += " __param_";
+ decl += utostr(Counter++);
+ }
+ } else {
+ for (PTXMachineFunctionInfo::reg_iterator
+ i = MFI->argreg_begin(), e = MFI->argreg_end(), b = i;
+ i != e; ++i) {
+ if (i != b) {
+ decl += ", ";
+ }
+
decl += ".reg .";
- decl += getRegisterTypeName(*i);
+ decl += getRegisterTypeName(*i, MRI);
decl += " ";
- decl += getRegisterName(*i);
+ decl += MFI->getRegisterName(*i);
}
}
decl += ")";
@@ -550,25 +513,6 @@ void PTXAsmPrinter::EmitFunctionDeclaration() {
OutStreamer.EmitRawText(Twine(decl));
}
-void PTXAsmPrinter::
-printPredicateOperand(const MachineInstr *MI, raw_ostream &O) {
- int i = MI->findFirstPredOperandIdx();
- if (i == -1)
- llvm_unreachable("missing predicate operand");
-
- unsigned reg = MI->getOperand(i).getReg();
- int predOp = MI->getOperand(i+1).getImm();
-
- DEBUG(dbgs() << "predicate: (" << reg << ", " << predOp << ")\n");
-
- if (reg != PTX::NoRegister) {
- O << '@';
- if (predOp == PTX::PRED_NEGATE)
- O << '!';
- O << getRegisterName(reg);
- }
-}
-
unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName,
StringRef DirName) {
// If FE did not provide a file name, then assume stdin.
@@ -596,10 +540,58 @@ unsigned PTXAsmPrinter::GetOrCreateSourceID(StringRef FileName,
return SrcId;
}
-#include "PTXGenAsmWriter.inc"
+MCOperand PTXAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+ const MCSymbol *Symbol) {
+ const MCExpr *Expr;
+ Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+ return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand PTXAsmPrinter::lowerOperand(const MachineOperand &MO) {
+ MCOperand MCOp;
+ const PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+ const MCExpr *Expr;
+ const char *RegSymbolName;
+ switch (MO.getType()) {
+ default:
+ llvm_unreachable("Unknown operand type");
+ case MachineOperand::MO_Register:
+ // We create register operands as symbols, since the PTXInstPrinter class
+ // has no way to map virtual registers back to a name without some ugly
+ // hacks.
+ // FIXME: Figure out a better way to handle virtual register naming.
+ RegSymbolName = MFI->getRegisterName(MO.getReg());
+ Expr = MCSymbolRefExpr::Create(RegSymbolName, MCSymbolRefExpr::VK_None,
+ OutContext);
+ MCOp = MCOperand::CreateExpr(Expr);
+ break;
+ case MachineOperand::MO_Immediate:
+ MCOp = MCOperand::CreateImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+ MO.getMBB()->getSymbol(), OutContext));
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+ break;
+ case MachineOperand::MO_FPImmediate:
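+    // MCOperand can only carry a double, so convert the APFloat up front.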
+ APFloat Val = MO.getFPImm()->getValueAPF();
+ bool ignored;
+ Val.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &ignored);
+ MCOp = MCOperand::CreateFPImm(Val.convertToDouble());
+ break;
+ }
+
+ return MCOp;
+}
// Force static initialization.
extern "C" void LLVMInitializePTXAsmPrinter() {
RegisterAsmPrinter<PTXAsmPrinter> X(ThePTX32Target);
RegisterAsmPrinter<PTXAsmPrinter> Y(ThePTX64Target);
}
+
diff --git a/lib/Target/PTX/PTXAsmPrinter.h b/lib/Target/PTX/PTXAsmPrinter.h
new file mode 100644
index 0000000..538c080
--- /dev/null
+++ b/lib/Target/PTX/PTXAsmPrinter.h
@@ -0,0 +1,57 @@
+//===-- PTXAsmPrinter.h - Print machine code to a PTX file ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// PTX Assembly printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXASMPRINTER_H
+#define PTXASMPRINTER_H
+
+#include "PTX.h"
+#include "PTXTargetMachine.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MCOperand;
+
+class LLVM_LIBRARY_VISIBILITY PTXAsmPrinter : public AsmPrinter {
+public:
+ explicit PTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+ : AsmPrinter(TM, Streamer) {}
+
+ const char *getPassName() const { return "PTX Assembly Printer"; }
+
+ bool doFinalization(Module &M);
+
+ virtual void EmitStartOfAsmFile(Module &M);
+ virtual void EmitFunctionBodyStart();
+ virtual void EmitFunctionBodyEnd();
+ virtual void EmitFunctionEntryLabel();
+ virtual void EmitInstruction(const MachineInstr *MI);
+
+ unsigned GetOrCreateSourceID(StringRef FileName,
+ StringRef DirName);
+
+ MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+ MCOperand lowerOperand(const MachineOperand &MO);
+
+private:
+ void EmitVariableDeclaration(const GlobalVariable *gv);
+ void EmitFunctionDeclaration();
+
+ StringMap<unsigned> SourceIdMap;
+}; // class PTXAsmPrinter
+} // namespace llvm
+
+#endif
+
diff --git a/lib/Target/PTX/PTXCallingConv.td b/lib/Target/PTX/PTXCallingConv.td
deleted file mode 100644
index 3e3ff48..0000000
--- a/lib/Target/PTX/PTXCallingConv.td
+++ /dev/null
@@ -1,29 +0,0 @@
-
-//===--- PTXCallingConv.td - Calling Conventions -----------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This describes the calling conventions for the PTX architecture.
-//
-//===----------------------------------------------------------------------===//
-
-// PTX Formal Parameter Calling Convention
-def CC_PTX : CallingConv<[
- CCIfType<[i1], CCAssignToReg<[P12, P13, P14, P15, P16, P17, P18, P19, P20, P21, P22, P23, P24, P25, P26, P27, P28, P29, P30, P31, P32, P33, P34, P35, P36, P37, P38, P39, P40, P41, P42, P43, P44, P45, P46, P47, P48, P49, P50, P51, P52, P53, P54, P55, P56, P57, P58, P59, P60, P61, P62, P63, P64, P65, P66, P67, P68, P69, P70, P71, P72, P73, P74, P75, P76, P77, P78, P79, P80, P81, P82, P83, P84, P85, P86, P87, P88, P89, P90, P91, P92, P93, P94, P95, P96, P97, P98, P99, P100, P101, P102, P103, P104, P105, P106, P107, P108, P109, P110, P111, P112, P113, P114, P115, P116, P117, P118, P119, P120, P121, P122, P123, P124, P125, P126, P127]>>,
- CCIfType<[i16], CCAssignToReg<[RH12, RH13, RH14, RH15, RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23, RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31, RH32, RH33, RH34, RH35, RH36, RH37, RH38, RH39, RH40, RH41, RH42, RH43, RH44, RH45, RH46, RH47, RH48, RH49, RH50, RH51, RH52, RH53, RH54, RH55, RH56, RH57, RH58, RH59, RH60, RH61, RH62, RH63, RH64, RH65, RH66, RH67, RH68, RH69, RH70, RH71, RH72, RH73, RH74, RH75, RH76, RH77, RH78, RH79, RH80, RH81, RH82, RH83, RH84, RH85, RH86, RH87, RH88, RH89, RH90, RH91, RH92, RH93, RH94, RH95, RH96, RH97, RH98, RH99, RH100, RH101, RH102, RH103, RH104, RH105, RH106, RH107, RH108, RH109, RH110, RH111, RH112, RH113, RH114, RH115, RH116, RH117, RH118, RH119, RH120, RH121, RH122, RH123, RH124, RH125, RH126, RH127]>>,
- CCIfType<[i32,f32], CCAssignToReg<[R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127]>>,
- CCIfType<[i64,f64], CCAssignToReg<[RD12, RD13, RD14, RD15, RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23, RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31, RD32, RD33, RD34, RD35, RD36, RD37, RD38, RD39, RD40, RD41, RD42, RD43, RD44, RD45, RD46, RD47, RD48, RD49, RD50, RD51, RD52, RD53, RD54, RD55, RD56, RD57, RD58, RD59, RD60, RD61, RD62, RD63, RD64, RD65, RD66, RD67, RD68, RD69, RD70, RD71, RD72, RD73, RD74, RD75, RD76, RD77, RD78, RD79, RD80, RD81, RD82, RD83, RD84, RD85, RD86, RD87, RD88, RD89, RD90, RD91, RD92, RD93, RD94, RD95, RD96, RD97, RD98, RD99, RD100, RD101, RD102, RD103, RD104, RD105, RD106, RD107, RD108, RD109, RD110, RD111, RD112, RD113, RD114, RD115, RD116, RD117, RD118, RD119, RD120, RD121, RD122, RD123, RD124, RD125, RD126, RD127]>>
-]>;
-
-// PTX Return Value Calling Convention
-def RetCC_PTX : CallingConv<[
- CCIfType<[i1], CCAssignToReg<[P0, P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11]>>,
- CCIfType<[i16], CCAssignToReg<[RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7, RH8, RH9, RH10, RH11]>>,
- CCIfType<[i32,f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11]>>,
- CCIfType<[i64,f64], CCAssignToReg<[RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7, RD8, RD9, RD10, RD11]>>
-]>;
diff --git a/lib/Target/PTX/PTXFPRoundingModePass.cpp b/lib/Target/PTX/PTXFPRoundingModePass.cpp
new file mode 100644
index 0000000..0b653e0
--- /dev/null
+++ b/lib/Target/PTX/PTXFPRoundingModePass.cpp
@@ -0,0 +1,179 @@
+//===-- PTXFPRoundingModePass.cpp - Assign rounding modes pass ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a machine function pass that sets appropriate FP rounding
+// modes for all relevant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-fp-rounding-mode"
+
+#include "PTX.h"
+#include "PTXTargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+// NOTE: PTXFPRoundingModePass should be executed just before emission.
+
+namespace llvm {
+ /// PTXFPRoundingModePass - Pass to assign appropriate FP rounding modes to
+ /// all FP instructions. Essentially, this pass just looks for all FP
+ /// instructions that have a rounding mode set to RndDefault, and sets an
+ /// appropriate rounding mode based on the target device.
+ ///
+ class PTXFPRoundingModePass : public MachineFunctionPass {
+ private:
+ static char ID;
+
+ typedef std::pair<unsigned, unsigned> RndModeDesc;
+
+ PTXTargetMachine& TargetMachine;
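+    // Instrs maps an opcode to (rounding-mode operand index, mode to use
+    // when the instruction still carries RndDefault).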
+ DenseMap<unsigned, RndModeDesc> Instrs;
+
+ public:
+ PTXFPRoundingModePass(PTXTargetMachine &TM, CodeGenOpt::Level OptLevel)
+ : MachineFunctionPass(ID),
+ TargetMachine(TM) {
+ initializeMap();
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "PTX FP Rounding Mode Pass";
+ }
+
+ private:
+
+ void initializeMap();
+ void processInstruction(MachineInstr &MI);
+ }; // class PTXFPRoundingModePass
+} // namespace llvm
+
+using namespace llvm;
+
+char PTXFPRoundingModePass::ID = 0;
+
+bool PTXFPRoundingModePass::runOnMachineFunction(MachineFunction &MF) {
+ // Look at each basic block
+ for (MachineFunction::iterator bbi = MF.begin(), bbe = MF.end(); bbi != bbe;
+ ++bbi) {
+ MachineBasicBlock &MBB = *bbi;
+ // Look at each instruction
+ for (MachineBasicBlock::iterator ii = MBB.begin(), ie = MBB.end();
+ ii != ie; ++ii) {
+ MachineInstr &MI = *ii;
+ processInstruction(MI);
+ }
+ }
+ return false;
+}
+
+void PTXFPRoundingModePass::initializeMap() {
+ using namespace PTXRoundingMode;
+ const PTXSubtarget& ST = TargetMachine.getSubtarget<PTXSubtarget>();
+
+ // Build a map of default rounding mode for all instructions that need a
+ // rounding mode.
+ Instrs[PTX::FADDrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FADDri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FADDrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FADDri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FSUBrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FSUBri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FSUBrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FSUBri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FMULrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FMULri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FMULrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FMULri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+
+ Instrs[PTX::FNEGrr32] = std::make_pair(1U, (unsigned)RndNone);
+ Instrs[PTX::FNEGri32] = std::make_pair(1U, (unsigned)RndNone);
+ Instrs[PTX::FNEGrr64] = std::make_pair(1U, (unsigned)RndNone);
+ Instrs[PTX::FNEGri64] = std::make_pair(1U, (unsigned)RndNone);
+
+ unsigned FDivRndMode = ST.fdivNeedsRoundingMode() ? RndNearestEven : RndNone;
+ Instrs[PTX::FDIVrr32] = std::make_pair(1U, FDivRndMode);
+ Instrs[PTX::FDIVri32] = std::make_pair(1U, FDivRndMode);
+ Instrs[PTX::FDIVrr64] = std::make_pair(1U, FDivRndMode);
+ Instrs[PTX::FDIVri64] = std::make_pair(1U, FDivRndMode);
+
+ unsigned FMADRndMode = ST.fmadNeedsRoundingMode() ? RndNearestEven : RndNone;
+ Instrs[PTX::FMADrrr32] = std::make_pair(1U, FMADRndMode);
+ Instrs[PTX::FMADrri32] = std::make_pair(1U, FMADRndMode);
+ Instrs[PTX::FMADrii32] = std::make_pair(1U, FMADRndMode);
+ Instrs[PTX::FMADrrr64] = std::make_pair(1U, FMADRndMode);
+ Instrs[PTX::FMADrri64] = std::make_pair(1U, FMADRndMode);
+ Instrs[PTX::FMADrii64] = std::make_pair(1U, FMADRndMode);
+
+ Instrs[PTX::FSQRTrr32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FSQRTri32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FSQRTrr64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::FSQRTri64] = std::make_pair(1U, (unsigned)RndNearestEven);
+
+ Instrs[PTX::FSINrr32] = std::make_pair(1U, (unsigned)RndApprox);
+ Instrs[PTX::FSINri32] = std::make_pair(1U, (unsigned)RndApprox);
+ Instrs[PTX::FSINrr64] = std::make_pair(1U, (unsigned)RndApprox);
+ Instrs[PTX::FSINri64] = std::make_pair(1U, (unsigned)RndApprox);
+ Instrs[PTX::FCOSrr32] = std::make_pair(1U, (unsigned)RndApprox);
+ Instrs[PTX::FCOSri32] = std::make_pair(1U, (unsigned)RndApprox);
+ Instrs[PTX::FCOSrr64] = std::make_pair(1U, (unsigned)RndApprox);
+ Instrs[PTX::FCOSri64] = std::make_pair(1U, (unsigned)RndApprox);
+
+ Instrs[PTX::CVTu16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTs16f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTu16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTs16f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTu32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTs32f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTu32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTs32f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTu64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTs64f32] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTu64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+ Instrs[PTX::CVTs64f64] = std::make_pair(1U, (unsigned)RndTowardsZeroInt);
+
+ Instrs[PTX::CVTf32u16] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf32s16] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf32u32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf32s32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf32u64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf32s64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf32f64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf64u16] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf64s16] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf64u32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf64s32] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf64u64] = std::make_pair(1U, (unsigned)RndNearestEven);
+ Instrs[PTX::CVTf64s64] = std::make_pair(1U, (unsigned)RndNearestEven);
+}
+
+void PTXFPRoundingModePass::processInstruction(MachineInstr &MI) {
+ // Is this an instruction that needs a rounding mode?
+ if (Instrs.count(MI.getOpcode())) {
+ const RndModeDesc &Desc = Instrs[MI.getOpcode()];
+ // Get the rounding mode operand
+ MachineOperand &Op = MI.getOperand(Desc.first);
+ // Update the rounding mode if needed
+ if (Op.getImm() == PTXRoundingMode::RndDefault) {
+ Op.setImm(Desc.second);
+ }
+ }
+}
+
+FunctionPass *llvm::createPTXFPRoundingModePass(PTXTargetMachine &TM,
+ CodeGenOpt::Level OptLevel) {
+ return new PTXFPRoundingModePass(TM, OptLevel);
+}
+
diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp
index 9adfa62..5c7ee29 100644
--- a/lib/Target/PTX/PTXISelDAGToDAG.cpp
+++ b/lib/Target/PTX/PTXISelDAGToDAG.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "PTX.h"
+#include "PTXMachineFunctionInfo.h"
#include "PTXTargetMachine.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Support/Debug.h"
@@ -37,6 +39,7 @@ class PTXDAGToDAGISel : public SelectionDAGISel {
bool SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2);
bool SelectADDRri(SDValue &Addr, SDValue &Base, SDValue &Offset);
bool SelectADDRii(SDValue &Addr, SDValue &Base, SDValue &Offset);
+ bool SelectADDRlocal(SDValue &Addr, SDValue &Base, SDValue &Offset);
// Include the pieces auto'gened from the target description
#include "PTXGenDAGISel.inc"
@@ -46,6 +49,10 @@ class PTXDAGToDAGISel : public SelectionDAGISel {
// pattern (PTXbrcond bb:$d, ...) in PTXInstrInfo.td
SDNode *SelectBRCOND(SDNode *Node);
+ SDNode *SelectREADPARAM(SDNode *Node);
+ SDNode *SelectWRITEPARAM(SDNode *Node);
+ SDNode *SelectFrameIndex(SDNode *Node);
+
bool isImm(const SDValue &operand);
bool SelectImm(const SDValue &operand, SDValue &imm);
@@ -68,6 +75,12 @@ SDNode *PTXDAGToDAGISel::Select(SDNode *Node) {
switch (Node->getOpcode()) {
case ISD::BRCOND:
return SelectBRCOND(Node);
+ case PTXISD::READ_PARAM:
+ return SelectREADPARAM(Node);
+ case PTXISD::WRITE_PARAM:
+ return SelectWRITEPARAM(Node);
+ case ISD::FrameIndex:
+ return SelectFrameIndex(Node);
default:
return SelectCode(Node);
}
@@ -79,7 +92,7 @@ SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) {
SDValue Chain = Node->getOperand(0);
SDValue Pred = Node->getOperand(1);
SDValue Target = Node->getOperand(2); // branch target
- SDValue PredOp = CurDAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32);
+ SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::Normal, MVT::i32);
DebugLoc dl = Node->getDebugLoc();
assert(Target.getOpcode() == ISD::BasicBlock);
@@ -90,6 +103,97 @@ SDNode *PTXDAGToDAGISel::SelectBRCOND(SDNode *Node) {
return CurDAG->getMachineNode(PTX::BRAdp, dl, MVT::Other, Ops, 4);
}
+SDNode *PTXDAGToDAGISel::SelectREADPARAM(SDNode *Node) {
+ SDValue Chain = Node->getOperand(0);
+ SDValue Index = Node->getOperand(1);
+
+ int OpCode;
+
+ // Get the type of parameter we are reading
+ EVT VT = Node->getValueType(0);
+ assert(VT.isSimple() && "READ_PARAM only implemented for MVT types");
+
+ MVT Type = VT.getSimpleVT();
+
+ if (Type == MVT::i1)
+ OpCode = PTX::READPARAMPRED;
+ else if (Type == MVT::i16)
+ OpCode = PTX::READPARAMI16;
+ else if (Type == MVT::i32)
+ OpCode = PTX::READPARAMI32;
+ else if (Type == MVT::i64)
+ OpCode = PTX::READPARAMI64;
+ else if (Type == MVT::f32)
+ OpCode = PTX::READPARAMF32;
+ else {
+ assert(Type == MVT::f64 && "Unexpected type!");
+ OpCode = PTX::READPARAMF64;
+ }
+
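+  // Parameter reads are never predicated: attach a null predicate register
+  // and PTXPredicate::None to match the instruction's operand pattern.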
+ SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
+ SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32);
+ DebugLoc dl = Node->getDebugLoc();
+
+ SDValue Ops[] = { Index, Pred, PredOp, Chain };
+ return CurDAG->getMachineNode(OpCode, dl, VT, Ops, 4);
+}
+
+SDNode *PTXDAGToDAGISel::SelectWRITEPARAM(SDNode *Node) {
+
+ SDValue Chain = Node->getOperand(0);
+ SDValue Value = Node->getOperand(1);
+
+ int OpCode;
+
+ //Node->dumpr(CurDAG);
+
+ // Get the type of parameter we are writing
+ EVT VT = Value->getValueType(0);
+ assert(VT.isSimple() && "WRITE_PARAM only implemented for MVT types");
+
+ MVT Type = VT.getSimpleVT();
+
+ if (Type == MVT::i1)
+ OpCode = PTX::WRITEPARAMPRED;
+ else if (Type == MVT::i16)
+ OpCode = PTX::WRITEPARAMI16;
+ else if (Type == MVT::i32)
+ OpCode = PTX::WRITEPARAMI32;
+ else if (Type == MVT::i64)
+ OpCode = PTX::WRITEPARAMI64;
+ else if (Type == MVT::f32)
+ OpCode = PTX::WRITEPARAMF32;
+ else if (Type == MVT::f64)
+ OpCode = PTX::WRITEPARAMF64;
+ else
+ llvm_unreachable("Invalid type in SelectWRITEPARAM");
+
+ SDValue Pred = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
+ SDValue PredOp = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32);
+ DebugLoc dl = Node->getDebugLoc();
+
+ SDValue Ops[] = { Value, Pred, PredOp, Chain };
+ SDNode* Ret = CurDAG->getMachineNode(OpCode, dl, MVT::Other, Ops, 4);
+
+ //dbgs() << "SelectWRITEPARAM produced:\n\t";
+ //Ret->dumpr(CurDAG);
+
+ return Ret;
+}
+
+SDNode *PTXDAGToDAGISel::SelectFrameIndex(SDNode *Node) {
+ int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+ //dbgs() << "Selecting FrameIndex at index " << FI << "\n";
+ //SDValue TFI = CurDAG->getTargetFrameIndex(FI, Node->getValueType(0));
+
+ PTXMachineFunctionInfo *MFI = MF->getInfo<PTXMachineFunctionInfo>();
+
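+  // Replace the frame index with the symbol that names its ".local" array
+  // (see PTXMachineFunctionInfo::getFrameSymbol).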
+ SDValue FrameSymbol = CurDAG->getTargetExternalSymbol(MFI->getFrameSymbol(FI),
+ Node->getValueType(0));
+
+ return FrameSymbol.getNode();
+}
+
// Match memory operand of the form [reg+reg]
bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) {
if (Addr.getOpcode() != ISD::ADD || Addr.getNumOperands() < 2 ||
@@ -107,14 +211,54 @@ bool PTXDAGToDAGISel::SelectADDRrr(SDValue &Addr, SDValue &R1, SDValue &R2) {
// Match memory operand of the form [reg], [imm+reg], and [reg+imm]
bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base,
SDValue &Offset) {
- if (Addr.getOpcode() != ISD::ADD) {
+ // FrameIndex addresses are handled separately
+ //errs() << "SelectADDRri: ";
+ //Addr.getNode()->dumpr();
+ if (isa<FrameIndexSDNode>(Addr)) {
+ //errs() << "Failure\n";
+ return false;
+ }
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ Base = Addr.getOperand(0);
+ if (isa<FrameIndexSDNode>(Base)) {
+ //errs() << "Failure\n";
+ return false;
+ }
+    // isBaseWithConstantOffset guarantees that operand 1 is a constant, so
+    // cast<> is safe here; dyn_cast<> left CN unchecked before the deref.
+    ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1));
+    Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+ //errs() << "Success\n";
+ return true;
+ }
+
+ /*if (Addr.getNumOperands() == 1) {
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+ errs() << "Success\n";
+ return true;
+ }*/
+
+ //errs() << "SelectADDRri fails on: ";
+ //Addr.getNode()->dumpr();
+
+ if (isImm(Addr)) {
+ //errs() << "Failure\n";
+ return false;
+ }
+
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+
+ //errs() << "Success\n";
+ return true;
+
+ /*if (Addr.getOpcode() != ISD::ADD) {
// let SelectADDRii handle the [imm] case
if (isImm(Addr))
return false;
// it is [reg]
assert(Addr.getValueType().isSimple() && "Type must be simple");
-
Base = Addr;
Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
@@ -136,7 +280,7 @@ bool PTXDAGToDAGISel::SelectADDRri(SDValue &Addr, SDValue &Base,
}
// neither [reg+imm] nor [imm+reg]
- return false;
+ return false;*/
}
// Match memory operand of the form [imm+imm] and [imm]
@@ -160,6 +304,36 @@ bool PTXDAGToDAGISel::SelectADDRii(SDValue &Addr, SDValue &Base,
return false;
}
+// Match memory operand of the form [reg], [imm+reg], and [reg+imm]
+bool PTXDAGToDAGISel::SelectADDRlocal(SDValue &Addr, SDValue &Base,
+ SDValue &Offset) {
+ //errs() << "SelectADDRlocal: ";
+ //Addr.getNode()->dumpr();
+ if (isa<FrameIndexSDNode>(Addr)) {
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, Addr.getValueType().getSimpleVT());
+ //errs() << "Success\n";
+ return true;
+ }
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ Base = Addr.getOperand(0);
+ if (!isa<FrameIndexSDNode>(Base)) {
+ //errs() << "Failure\n";
+ return false;
+ }
+    // As above, operand 1 is guaranteed to be a constant here, so use cast<>.
+    ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1));
+    Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+ //errs() << "Offset: ";
+ //Offset.getNode()->dumpr();
+ //errs() << "Success\n";
+ return true;
+ }
+
+ //errs() << "Failure\n";
+ return false;
+}
+
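Taken together, the checks above split local (frame) addresses off from the generic register+immediate form: SelectADDRri matches [reg] and [reg+imm] while rejecting frame-index bases, SelectADDRlocal accepts only [frame_index] and [frame_index+imm], and SelectADDRii (above) keeps handling [imm] and [imm+imm]. A minimal sketch of the resulting predicate (isLocalFrameAddress is a hypothetical helper, not part of this patch):

static bool isLocalFrameAddress(SelectionDAG *CurDAG, SDValue Addr) {
  if (isa<FrameIndexSDNode>(Addr))
    return true;                                        // [frame_index]
  if (CurDAG->isBaseWithConstantOffset(Addr))
    return isa<FrameIndexSDNode>(Addr.getOperand(0));   // [frame_index+imm]
  return false;                                         // not a local address
}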
bool PTXDAGToDAGISel::isImm(const SDValue &operand) {
return ConstantSDNode::classof(operand.getNode());
}
diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp
index 6fcf710..3307d91 100644
--- a/lib/Target/PTX/PTXISelLowering.cpp
+++ b/lib/Target/PTX/PTXISelLowering.cpp
@@ -16,23 +16,19 @@
#include "PTXMachineFunctionInfo.h"
#include "PTXRegisterInfo.h"
#include "PTXSubtarget.h"
+#include "llvm/Function.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
-// Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-#include "PTXGenCallingConv.inc"
-
-//===----------------------------------------------------------------------===//
// TargetLowering Implementation
//===----------------------------------------------------------------------===//
@@ -47,57 +43,58 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
addRegisterClass(MVT::f64, PTX::RegF64RegisterClass);
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
setMinFunctionAlignment(2);
-
+
////////////////////////////////////
/////////// Expansion //////////////
////////////////////////////////////
-
+
// (any/zero/sign) extload => load + (any/zero/sign) extend
-
+
setLoadExtAction(ISD::EXTLOAD, MVT::i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
-
+
// f32 extload => load + fextend
-
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
// f64 truncstore => trunc + store
-
- setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
+
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
// sign_extend_inreg => sign_extend
-
+
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
-
+
// br_cc => brcond
-
+
setOperationAction(ISD::BR_CC, MVT::Other, Expand);
// select_cc => setcc
-
+
setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
-
+
////////////////////////////////////
//////////// Legal /////////////////
////////////////////////////////////
-
+
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
-
+
////////////////////////////////////
//////////// Custom ////////////////
////////////////////////////////////
-
+
// customise setcc to use bitwise logic if possible
-
+
setOperationAction(ISD::SETCC, MVT::i1, Custom);
// customize translation of memory addresses
-
+
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -105,7 +102,7 @@ PTXTargetLowering::PTXTargetLowering(TargetMachine &TM)
computeRegisterProperties();
}
-MVT::SimpleValueType PTXTargetLowering::getSetCCResultType(EVT VT) const {
+EVT PTXTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i1;
}
@@ -130,10 +127,16 @@ const char *PTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "PTXISD::LOAD_PARAM";
case PTXISD::STORE_PARAM:
return "PTXISD::STORE_PARAM";
+ case PTXISD::READ_PARAM:
+ return "PTXISD::READ_PARAM";
+ case PTXISD::WRITE_PARAM:
+ return "PTXISD::WRITE_PARAM";
case PTXISD::EXIT:
return "PTXISD::EXIT";
case PTXISD::RET:
return "PTXISD::RET";
+ case PTXISD::CALL:
+ return "PTXISD::CALL";
}
}
@@ -149,7 +152,7 @@ SDValue PTXTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- // Look for X == 0, X == 1, X != 0, or X != 1
+ // Look for X == 0, X == 1, X != 0, or X != 1
// We can simplify these to bitwise logic
if (Op1.getOpcode() == ISD::Constant &&
@@ -197,6 +200,7 @@ SDValue PTXTargetLowering::
MachineFunction &MF = DAG.getMachineFunction();
const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+ PTXParamManager &PM = MFI->getParamManager();
switch (CallConv) {
default:
@@ -216,68 +220,34 @@ SDValue PTXTargetLowering::
if (MFI->isKernel() || ST.useParamSpaceForDeviceArgs()) {
// We just need to emit the proper LOAD_PARAM ISDs
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-
assert((!MFI->isKernel() || Ins[i].VT != MVT::i1) &&
"Kernels cannot take pred operands");
+ unsigned ParamSize = Ins[i].VT.getStoreSizeInBits();
+ unsigned Param = PM.addArgumentParam(ParamSize);
+ const std::string &ParamName = PM.getParamName(Param);
+ SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+ MVT::Other);
SDValue ArgValue = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain,
- DAG.getTargetConstant(i, MVT::i32));
+ ParamValue);
InVals.push_back(ArgValue);
-
- // Instead of storing a physical register in our argument list, we just
- // store the total size of the parameter, in bits. The ASM printer
- // knows how to process this.
- MFI->addArgReg(Ins[i].VT.getStoreSizeInBits());
}
}
else {
- // For device functions, we use the PTX calling convention to do register
- // assignments then create CopyFromReg ISDs for the allocated registers
-
- SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), ArgLocs,
- *DAG.getContext());
-
- CCInfo.AnalyzeFormalArguments(Ins, CC_PTX);
-
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-
- CCValAssign& VA = ArgLocs[i];
- EVT RegVT = VA.getLocVT();
- TargetRegisterClass* TRC = 0;
-
- assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
-
- // Determine which register class we need
- if (RegVT == MVT::i1) {
- TRC = PTX::RegPredRegisterClass;
- }
- else if (RegVT == MVT::i16) {
- TRC = PTX::RegI16RegisterClass;
- }
- else if (RegVT == MVT::i32) {
- TRC = PTX::RegI32RegisterClass;
- }
- else if (RegVT == MVT::i64) {
- TRC = PTX::RegI64RegisterClass;
- }
- else if (RegVT == MVT::f32) {
- TRC = PTX::RegF32RegisterClass;
- }
- else if (RegVT == MVT::f64) {
- TRC = PTX::RegF64RegisterClass;
- }
- else {
- llvm_unreachable("Unknown parameter type");
- }
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ EVT RegVT = Ins[i].VT;
+ TargetRegisterClass* TRC = getRegClassFor(RegVT);
+ // Use a unique index in the instruction to prevent instruction folding.
+ // Yes, this is a hack.
+ SDValue Index = DAG.getTargetConstant(i, MVT::i32);
unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
- MF.getRegInfo().addLiveIn(VA.getLocReg(), Reg);
+ SDValue ArgValue = DAG.getNode(PTXISD::READ_PARAM, dl, RegVT, Chain,
+ Index);
- SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
InVals.push_back(ArgValue);
- MFI->addArgReg(VA.getLocReg());
+ MFI->addArgReg(Reg);
}
}
@@ -301,41 +271,66 @@ SDValue PTXTargetLowering::
assert(Outs.size() == 0 && "Kernel must return void.");
return DAG.getNode(PTXISD::EXIT, dl, MVT::Other, Chain);
case CallingConv::PTX_Device:
- //assert(Outs.size() <= 1 && "Can at most return one value.");
+    assert(Outs.size() <= 1 && "Can return at most one value.");
break;
}
MachineFunction& MF = DAG.getMachineFunction();
PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+ PTXParamManager &PM = MFI->getParamManager();
SDValue Flag;
+ const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
- // Even though we could use the .param space for return arguments for
- // device functions if SM >= 2.0 and the number of return arguments is
- // only 1, we just always use registers since this makes the codegen
- // easier.
- SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
-
- CCInfo.AnalyzeReturn(Outs, RetCC_PTX);
-
- for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
- CCValAssign& VA = RVLocs[i];
-
- assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
+ if (ST.useParamSpaceForDeviceArgs()) {
+ assert(Outs.size() < 2 && "Device functions can return at most one value");
+
+ if (Outs.size() == 1) {
+ unsigned ParamSize = OutVals[0].getValueType().getSizeInBits();
+ unsigned Param = PM.addReturnParam(ParamSize);
+ const std::string &ParamName = PM.getParamName(Param);
+ SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+ MVT::Other);
+ Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+ ParamValue, OutVals[0]);
+ }
+ } else {
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ EVT RegVT = Outs[i].VT;
+ TargetRegisterClass* TRC = 0;
- unsigned Reg = VA.getLocReg();
+ // Determine which register class we need
+ if (RegVT == MVT::i1) {
+ TRC = PTX::RegPredRegisterClass;
+ }
+ else if (RegVT == MVT::i16) {
+ TRC = PTX::RegI16RegisterClass;
+ }
+ else if (RegVT == MVT::i32) {
+ TRC = PTX::RegI32RegisterClass;
+ }
+ else if (RegVT == MVT::i64) {
+ TRC = PTX::RegI64RegisterClass;
+ }
+ else if (RegVT == MVT::f32) {
+ TRC = PTX::RegF32RegisterClass;
+ }
+ else if (RegVT == MVT::f64) {
+ TRC = PTX::RegF64RegisterClass;
+ }
+ else {
+ llvm_unreachable("Unknown parameter type");
+ }
- DAG.getMachineFunction().getRegInfo().addLiveOut(Reg);
+ unsigned Reg = MF.getRegInfo().createVirtualRegister(TRC);
- Chain = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i], Flag);
+ SDValue Copy = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i]/*, Flag*/);
+ SDValue OutReg = DAG.getRegister(Reg, RegVT);
- // Guarantee that all emitted copies are stuck together,
- // avoiding something bad
- Flag = Chain.getValue(1);
+ Chain = DAG.getNode(PTXISD::WRITE_PARAM, dl, MVT::Other, Copy, OutReg);
- MFI->addRetReg(Reg);
+ MFI->addRetReg(Reg);
+ }
}
if (Flag.getNode() == 0) {
@@ -345,3 +340,83 @@ SDValue PTXTargetLowering::
return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain, Flag);
}
}
+
+SDValue
+PTXTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool &isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ MachineFunction& MF = DAG.getMachineFunction();
+ PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
+ PTXParamManager &PM = MFI->getParamManager();
+
+ assert(getTargetMachine().getSubtarget<PTXSubtarget>().callsAreHandled() &&
+ "Calls are not handled for the target device");
+
+ std::vector<SDValue> Ops;
+ // The layout of the ops will be [Chain, #Ins, Ins, Callee, #Outs, Outs]
+ Ops.resize(Outs.size() + Ins.size() + 4);
+
+ Ops[0] = Chain;
+
+ // Identify the callee function
+ const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+ assert(cast<Function>(GV)->getCallingConv() == CallingConv::PTX_Device &&
+ "PTX function calls must be to PTX device functions");
+ Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
+ Ops[Ins.size()+2] = Callee;
+
+ // Generate STORE_PARAM nodes for each function argument. In PTX, function
+ // arguments are explicitly stored into .param variables and passed as
+ // arguments. There is no register/stack-based calling convention in PTX.
+ Ops[Ins.size()+3] = DAG.getTargetConstant(OutVals.size(), MVT::i32);
+ for (unsigned i = 0; i != OutVals.size(); ++i) {
+ unsigned Size = OutVals[i].getValueType().getSizeInBits();
+ unsigned Param = PM.addLocalParam(Size);
+ const std::string &ParamName = PM.getParamName(Param);
+ SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+ MVT::Other);
+ Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, MVT::Other, Chain,
+ ParamValue, OutVals[i]);
+ Ops[i+Ins.size()+4] = ParamValue;
+ }
+
+ std::vector<SDValue> InParams;
+
+ // Generate list of .param variables to hold the return value(s).
+ Ops[1] = DAG.getTargetConstant(Ins.size(), MVT::i32);
+ for (unsigned i = 0; i < Ins.size(); ++i) {
+ unsigned Size = Ins[i].VT.getStoreSizeInBits();
+ unsigned Param = PM.addLocalParam(Size);
+ const std::string &ParamName = PM.getParamName(Param);
+ SDValue ParamValue = DAG.getTargetExternalSymbol(ParamName.c_str(),
+ MVT::Other);
+ Ops[i+2] = ParamValue;
+ InParams.push_back(ParamValue);
+ }
+
+  // Re-store the chain: the STORE_PARAM nodes above updated it, so the value
+  // assigned at the top of this function is stale.
+  Ops[0] = Chain;
+
+ // Create the CALL node.
+ Chain = DAG.getNode(PTXISD::CALL, dl, MVT::Other, &Ops[0], Ops.size());
+
+ // Create the LOAD_PARAM nodes that retrieve the function return value(s).
+ for (unsigned i = 0; i < Ins.size(); ++i) {
+ SDValue Load = DAG.getNode(PTXISD::LOAD_PARAM, dl, Ins[i].VT, Chain,
+ InParams[i]);
+ InVals.push_back(Load);
+ }
+
+ return Chain;
+}
+
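For concreteness, with one return value and two arguments (Ins.size() == 1, Outs.size() == 2) the layout comment above works out as follows; the __ret/__arg names stand in for whatever PTXParamManager actually generates:

// Ops.size() == Outs.size() + Ins.size() + 4 == 7
// Ops[0] = Chain     (re-stored after the STORE_PARAM nodes update it)
// Ops[1] = 1         (#Ins, i32 target constant)
// Ops[2] = __ret0    (return-value .param symbol)
// Ops[3] = Callee    (Ops[Ins.size()+2])
// Ops[4] = 2         (#Outs, Ops[Ins.size()+3])
// Ops[5] = __arg0    (Ops[0 + Ins.size() + 4])
// Ops[6] = __arg1    (Ops[1 + Ins.size() + 4])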
+unsigned PTXTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT) {
+ // All arguments consist of one "register," regardless of the type.
+ return 1;
+}
+
diff --git a/lib/Target/PTX/PTXISelLowering.h b/lib/Target/PTX/PTXISelLowering.h
index 4318541..4d25665 100644
--- a/lib/Target/PTX/PTXISelLowering.h
+++ b/lib/Target/PTX/PTXISelLowering.h
@@ -26,9 +26,12 @@ namespace PTXISD {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
LOAD_PARAM,
STORE_PARAM,
+ READ_PARAM,
+ WRITE_PARAM,
EXIT,
RET,
- COPY_ADDRESS
+ COPY_ADDRESS,
+ CALL
};
} // namespace PTXISD
@@ -60,7 +63,19 @@ class PTXTargetLowering : public TargetLowering {
DebugLoc dl,
SelectionDAG &DAG) const;
- virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ virtual SDValue
+ LowerCall(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool &isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ virtual EVT getSetCCResultType(EVT VT) const;
+
+ virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT);
private:
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/PTX/PTXInstrFormats.td b/lib/Target/PTX/PTXInstrFormats.td
index 8cee351..397fdc3 100644
--- a/lib/Target/PTX/PTXInstrFormats.td
+++ b/lib/Target/PTX/PTXInstrFormats.td
@@ -7,12 +7,39 @@
//
//===----------------------------------------------------------------------===//
-// PTX Predicate operand, default to (0, 0) = (zero-reg, always).
+
+// Rounding Mode Specifier
+/*class RoundingMode<bits<3> val> {
+ bits<3> Value = val;
+}
+
+def RndDefault : RoundingMode<0>;
+def RndNearestEven : RoundingMode<1>;
+def RndNearestZero : RoundingMode<2>;
+def RndNegInf : RoundingMode<3>;
+def RndPosInf : RoundingMode<4>;
+def RndApprox : RoundingMode<5>;*/
+
+
+// Rounding Mode Operand
+def RndMode : Operand<i32> {
+ let PrintMethod = "printRoundingMode";
+}
+
+def RndDefault : PatLeaf<(i32 0)>;
+
+// PTX Predicate operand; defaults to (zero_reg, 2) = (no register, PTXPredicate::None).
// Leave PrintMethod empty; predicate printing is defined elsewhere.
def pred : PredicateOperand<OtherVT, (ops RegPred, i32imm),
- (ops (i1 zero_reg), (i32 0))>;
+ (ops (i1 zero_reg), (i32 2))>;
+def RndModeOperand : Operand<OtherVT> {
+ let MIOperandInfo = (ops i32imm);
+}
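The RndMode operand prints through a printRoundingMode method on the asm printer. A sketch of what such a method could look like, assuming the encoding hinted at by the commented-out RoundingMode defs above (0 = default, 1 = nearest-even, ...); the signature and the exact mapping here are assumptions, not taken from this patch:

// Hypothetical printer; the real method lives in the PTX asm printer.
void PTXAsmPrinter::printRoundingMode(const MachineInstr *MI, int opNum,
                                      raw_ostream &OS) {
  switch (MI->getOperand(opNum).getImm()) {
  case 0: break;                 // RndDefault: print no suffix
  case 1: OS << ".rn"; break;    // round to nearest even
  case 2: OS << ".rz"; break;    // round toward zero
  case 3: OS << ".rm"; break;    // round toward minus infinity
  case 4: OS << ".rp"; break;    // round toward plus infinity
  case 5: OS << ".approx"; break;
  default: llvm_unreachable("Unknown rounding mode");
  }
}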
+
+// Instruction Types
let Namespace = "PTX" in {
+
class InstPTX<dag oops, dag iops, string asmstr, list<dag> pattern>
: Instruction {
dag OutOperandList = oops;
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
index 425265a..1b947a5 100644
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/lib/Target/PTX/PTXInstrInfo.cpp
@@ -16,10 +16,11 @@
#include "PTX.h"
#include "PTXInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#define GET_INSTRINFO_CTOR
@@ -47,8 +48,13 @@ void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DstReg, unsigned SrcReg,
bool KillSrc) const {
- for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i) {
- if (map[i].cls->contains(DstReg, SrcReg)) {
+
+ const MachineRegisterInfo& MRI = MBB.getParent()->getRegInfo();
+ //assert(MRI.getRegClass(SrcReg) == MRI.getRegClass(DstReg) &&
+ // "Invalid register copy between two register classes");
+
+ for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++i) {
+ if (map[i].cls == MRI.getRegClass(DstReg)) {
const MCInstrDesc &MCID = get(map[i].opcode);
MachineInstr *MI = BuildMI(MBB, I, DL, MCID, DstReg).
addReg(SrcReg, getKillRegState(KillSrc));
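The loop above now keys the copy map on the destination's register class from MachineRegisterInfo rather than testing both registers for class membership. The map itself is defined elsewhere in this file; the entries below are an assumed sketch of its shape (opcode names other than MOVPREDrr are guesses), not the actual table:

// Hypothetical shape of the class-to-opcode copy map used by copyPhysReg.
static const struct {
  const TargetRegisterClass *cls;
  int opcode;
} map[] = {
  { PTX::RegPredRegisterClass, PTX::MOVPREDrr },
  { PTX::RegI16RegisterClass,  PTX::MOVU16rr  },
  { PTX::RegI32RegisterClass,  PTX::MOVU32rr  },
  { PTX::RegI64RegisterClass,  PTX::MOVU64rr  },
  { PTX::RegF32RegisterClass,  PTX::MOVF32rr  },
  { PTX::RegF64RegisterClass,  PTX::MOVF64rr  },
};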
@@ -161,7 +167,7 @@ DefinesPredicate(MachineInstr *MI,
return false;
Pred.push_back(MO);
- Pred.push_back(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+ Pred.push_back(MachineOperand::CreateImm(PTXPredicate::None));
return true;
}
@@ -277,7 +283,7 @@ InsertBranch(MachineBasicBlock &MBB,
BuildMI(&MBB, DL, get(PTX::BRAdp))
.addMBB(TBB).addReg(Cond[0].getReg()).addImm(Cond[1].getImm());
BuildMI(&MBB, DL, get(PTX::BRAd))
- .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL);
+ .addMBB(FBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
return 2;
} else if (Cond.size()) {
BuildMI(&MBB, DL, get(PTX::BRAdp))
@@ -285,7 +291,7 @@ InsertBranch(MachineBasicBlock &MBB,
return 1;
} else {
BuildMI(&MBB, DL, get(PTX::BRAd))
- .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTX::PRED_NORMAL);
+ .addMBB(TBB).addReg(PTX::NoRegister).addImm(PTXPredicate::None);
return 1;
}
}
@@ -296,34 +302,7 @@ void PTXInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
unsigned SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- MachineInstr& MI = *MII;
- DebugLoc DL = MI.getDebugLoc();
-
- DEBUG(dbgs() << "storeRegToStackSlot: " << MI);
-
- int OpCode;
-
- // Select the appropriate opcode based on the register class
- if (RC == PTX::RegI16RegisterClass) {
- OpCode = PTX::STACKSTOREI16;
- } else if (RC == PTX::RegI32RegisterClass) {
- OpCode = PTX::STACKSTOREI32;
- } else if (RC == PTX::RegI64RegisterClass) {
- OpCode = PTX::STACKSTOREI32;
- } else if (RC == PTX::RegF32RegisterClass) {
- OpCode = PTX::STACKSTOREF32;
- } else if (RC == PTX::RegF64RegisterClass) {
- OpCode = PTX::STACKSTOREF64;
- } else {
- llvm_unreachable("Unknown PTX register class!");
- }
-
- // Build the store instruction (really a mov)
- MachineInstrBuilder MIB = BuildMI(MBB, MII, DL, get(OpCode));
- MIB.addFrameIndex(FrameIdx);
- MIB.addReg(SrcReg);
-
- AddDefaultPredicate(MIB);
+  llvm_unreachable("storeRegToStackSlot should not be called for PTX");
}
void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
@@ -331,34 +310,7 @@ void PTXInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- MachineInstr& MI = *MII;
- DebugLoc DL = MI.getDebugLoc();
-
- DEBUG(dbgs() << "loadRegToStackSlot: " << MI);
-
- int OpCode;
-
- // Select the appropriate opcode based on the register class
- if (RC == PTX::RegI16RegisterClass) {
- OpCode = PTX::STACKLOADI16;
- } else if (RC == PTX::RegI32RegisterClass) {
- OpCode = PTX::STACKLOADI32;
- } else if (RC == PTX::RegI64RegisterClass) {
- OpCode = PTX::STACKLOADI32;
- } else if (RC == PTX::RegF32RegisterClass) {
- OpCode = PTX::STACKLOADF32;
- } else if (RC == PTX::RegF64RegisterClass) {
- OpCode = PTX::STACKLOADF64;
- } else {
- llvm_unreachable("Unknown PTX register class!");
- }
-
- // Build the load instruction (really a mov)
- MachineInstrBuilder MIB = BuildMI(MBB, MII, DL, get(OpCode));
- MIB.addReg(DestReg);
- MIB.addFrameIndex(FrameIdx);
-
- AddDefaultPredicate(MIB);
+  llvm_unreachable("loadRegFromStackSlot should not be called for PTX");
}
// static helper routines
@@ -367,7 +319,7 @@ MachineSDNode *PTXInstrInfo::
GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
DebugLoc dl, EVT VT, SDValue Op1) {
SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1);
- SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32);
+ SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
SDValue ops[] = { Op1, predReg, predOp };
return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops));
}
@@ -376,7 +328,7 @@ MachineSDNode *PTXInstrInfo::
GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
DebugLoc dl, EVT VT, SDValue Op1, SDValue Op2) {
SDValue predReg = DAG->getRegister(PTX::NoRegister, MVT::i1);
- SDValue predOp = DAG->getTargetConstant(PTX::PRED_NORMAL, MVT::i32);
+ SDValue predOp = DAG->getTargetConstant(PTXPredicate::None, MVT::i32);
SDValue ops[] = { Op1, Op2, predReg, predOp };
return DAG->getMachineNode(Opcode, dl, VT, ops, array_lengthof(ops));
}
@@ -384,7 +336,7 @@ GetPTXMachineNode(SelectionDAG *DAG, unsigned Opcode,
void PTXInstrInfo::AddDefaultPredicate(MachineInstr *MI) {
if (MI->findFirstPredOperandIdx() == -1) {
MI->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false));
- MI->addOperand(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+ MI->addOperand(MachineOperand::CreateImm(PTXPredicate::None));
}
}
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td
index 6bfe906..a3fcea9 100644
--- a/lib/Target/PTX/PTXInstrInfo.td
+++ b/lib/Target/PTX/PTXInstrInfo.td
@@ -21,10 +21,6 @@ include "PTXInstrFormats.td"
// Code Generation Predicates
//===----------------------------------------------------------------------===//
-// Addressing
-def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">;
-def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">;
-
// Shader Model Support
def FDivNeedsRoundingMode : Predicate<"getSubtarget().fdivNeedsRoundingMode()">;
def FDivNoRoundingMode : Predicate<"!getSubtarget().fdivNeedsRoundingMode()">;
@@ -43,130 +39,19 @@ def DoesNotSupportPTX23 : Predicate<"!getSubtarget().supportsPTX23()">;
def SupportsFMA : Predicate<"getSubtarget().supportsFMA()">;
def DoesNotSupportFMA : Predicate<"!getSubtarget().supportsFMA()">;
-//===----------------------------------------------------------------------===//
-// Instruction Pattern Stuff
-//===----------------------------------------------------------------------===//
-def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::GLOBAL;
- return false;
-}]>;
-
-def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::CONSTANT;
- return false;
-}]>;
-
-def load_local : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::LOCAL;
- return false;
-}]>;
-
-def load_parameter : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::PARAMETER;
- return false;
-}]>;
-
-def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::SHARED;
- return false;
-}]>;
-
-def store_global
- : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::GLOBAL;
- return false;
-}]>;
-
-def store_local
- : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::LOCAL;
- return false;
-}]>;
-
-def store_parameter
- : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::PARAMETER;
- return false;
-}]>;
-
-def store_shared
- : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
- const Value *Src;
- const PointerType *PT;
- if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
- (PT = dyn_cast<PointerType>(Src->getType())))
- return PT->getAddressSpace() == PTX::SHARED;
- return false;
-}]>;
-
-// Addressing modes.
-def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
-def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>;
-def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>;
-def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>;
-def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>;
-def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>;
-
-// Address operands
-def MEMri32 : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops RegI32, i32imm);
-}
-def MEMri64 : Operand<i64> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops RegI64, i64imm);
-}
-def MEMii32 : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops i32imm, i32imm);
-}
-def MEMii64 : Operand<i64> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops i64imm, i64imm);
-}
-// The operand here does not correspond to an actual address, so we
-// can use i32 in 64-bit address modes.
-def MEMpi : Operand<i32> {
- let PrintMethod = "printParamOperand";
- let MIOperandInfo = (ops i32imm);
-}
-def MEMret : Operand<i32> {
- let PrintMethod = "printReturnOperand";
- let MIOperandInfo = (ops i32imm);
-}
+
+// def SDT_PTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+// def SDT_PTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// def PTXcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PTXCallSeqStart,
+// [SDNPHasChain, SDNPOutGlue]>;
+// def PTXcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PTXCallSeqEnd,
+// [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def PTXcall : SDNode<"PTXISD::CALL", SDTNone,
+ [SDNPHasChain, SDNPVariadic, SDNPOptInGlue, SDNPOutGlue]>;
+
// Branch & call targets have OtherVT type.
def brtarget : Operand<OtherVT>;
@@ -189,87 +74,73 @@ def PTXret
def PTXcopyaddress
: SDNode<"PTXISD::COPY_ADDRESS", SDTypeProfile<1, 1, []>, []>;
-// Load/store .param space
-def PTXloadparam
- : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
-def PTXstoreparam
- : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>,
- [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+
//===----------------------------------------------------------------------===//
// Instruction Class Templates
//===----------------------------------------------------------------------===//
+// For floating-point instructions, we cannot simply embed the pattern in the
+// instruction definition: the rounding mode must be supplied as an explicit
+// operand, and there is no obvious way to insert constant operands into
+// instructions directly from pattern matches.
+
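With the patterns dropped, instruction selection has to supply the RndMode operand (and the default predicate) explicitly when it builds one of these instructions. A minimal sketch, assuming FADDrr32 from the FADD multiclass defined below and the predicate convention used in PTXISelDAGToDAG.cpp above; this is not the actual selection code from this patch:

// Hypothetical: build an f32 add with an explicit default rounding mode.
SDNode *SelectFADDrr32(SelectionDAG *CurDAG, SDNode *Node) {
  DebugLoc dl = Node->getDebugLoc();
  SDValue RndMode = CurDAG->getTargetConstant(0, MVT::i32); // RndDefault
  SDValue Pred    = CurDAG->getRegister(PTX::NoRegister, MVT::i1);
  SDValue PredOp  = CurDAG->getTargetConstant(PTXPredicate::None, MVT::i32);
  SDValue Ops[]   = { RndMode, Node->getOperand(0), Node->getOperand(1),
                      Pred, PredOp };
  return CurDAG->getMachineNode(PTX::FADDrr32, dl, MVT::f32, Ops, 5);
}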
//===- Floating-Point Instructions - 2 Operand Form -----------------------===//
-multiclass PTX_FLOAT_2OP<string opcstr, SDNode opnode> {
+multiclass PTX_FLOAT_2OP<string opcstr> {
def rr32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a),
- !strconcat(opcstr, ".f32\t$d, $a"),
- [(set RegF32:$d, (opnode RegF32:$a))]>;
+ (ins RndMode:$r, RegF32:$a),
+ !strconcat(opcstr, "$r.f32\t$d, $a"), []>;
def ri32 : InstPTX<(outs RegF32:$d),
- (ins f32imm:$a),
- !strconcat(opcstr, ".f32\t$d, $a"),
- [(set RegF32:$d, (opnode fpimm:$a))]>;
+ (ins RndMode:$r, f32imm:$a),
+ !strconcat(opcstr, "$r.f32\t$d, $a"), []>;
def rr64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a),
- !strconcat(opcstr, ".f64\t$d, $a"),
- [(set RegF64:$d, (opnode RegF64:$a))]>;
+ (ins RndMode:$r, RegF64:$a),
+ !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
def ri64 : InstPTX<(outs RegF64:$d),
- (ins f64imm:$a),
- !strconcat(opcstr, ".f64\t$d, $a"),
- [(set RegF64:$d, (opnode fpimm:$a))]>;
+ (ins RndMode:$r, f64imm:$a),
+ !strconcat(opcstr, "$r.f64\t$d, $a"), []>;
}
//===- Floating-Point Instructions - 3 Operand Form -----------------------===//
-multiclass PTX_FLOAT_3OP<string opcstr, SDNode opnode> {
+multiclass PTX_FLOAT_3OP<string opcstr> {
def rr32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, RegF32:$b),
- !strconcat(opcstr, ".f32\t$d, $a, $b"),
- [(set RegF32:$d, (opnode RegF32:$a, RegF32:$b))]>;
+ (ins RndMode:$r, RegF32:$a, RegF32:$b),
+ !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>;
def ri32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, f32imm:$b),
- !strconcat(opcstr, ".f32\t$d, $a, $b"),
- [(set RegF32:$d, (opnode RegF32:$a, fpimm:$b))]>;
+ (ins RndMode:$r, RegF32:$a, f32imm:$b),
+ !strconcat(opcstr, "$r.f32\t$d, $a, $b"), []>;
def rr64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, RegF64:$b),
- !strconcat(opcstr, ".f64\t$d, $a, $b"),
- [(set RegF64:$d, (opnode RegF64:$a, RegF64:$b))]>;
+ (ins RndMode:$r, RegF64:$a, RegF64:$b),
+ !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
def ri64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, f64imm:$b),
- !strconcat(opcstr, ".f64\t$d, $a, $b"),
- [(set RegF64:$d, (opnode RegF64:$a, fpimm:$b))]>;
+ (ins RndMode:$r, RegF64:$a, f64imm:$b),
+ !strconcat(opcstr, "$r.f64\t$d, $a, $b"), []>;
}
//===- Floating-Point Instructions - 4 Operand Form -----------------------===//
-multiclass PTX_FLOAT_4OP<string opcstr, SDNode opnode1, SDNode opnode2> {
+multiclass PTX_FLOAT_4OP<string opcstr> {
def rrr32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, RegF32:$b, RegF32:$c),
- !strconcat(opcstr, ".f32\t$d, $a, $b, $c"),
- [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a,
- RegF32:$b),
- RegF32:$c))]>;
+ (ins RndMode:$r, RegF32:$a, RegF32:$b, RegF32:$c),
+ !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
def rri32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, RegF32:$b, f32imm:$c),
- !strconcat(opcstr, ".f32\t$d, $a, $b, $c"),
- [(set RegF32:$d, (opnode2 (opnode1 RegF32:$a,
- RegF32:$b),
- fpimm:$c))]>;
+ (ins RndMode:$r, RegF32:$a, RegF32:$b, f32imm:$c),
+ !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
+ def rii32 : InstPTX<(outs RegF32:$d),
+ (ins RndMode:$r, RegF32:$a, f32imm:$b, f32imm:$c),
+ !strconcat(opcstr, "$r.f32\t$d, $a, $b, $c"), []>;
def rrr64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, RegF64:$b, RegF64:$c),
- !strconcat(opcstr, ".f64\t$d, $a, $b, $c"),
- [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a,
- RegF64:$b),
- RegF64:$c))]>;
+ (ins RndMode:$r, RegF64:$a, RegF64:$b, RegF64:$c),
+ !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
def rri64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, RegF64:$b, f64imm:$c),
- !strconcat(opcstr, ".f64\t$d, $a, $b, $c"),
- [(set RegF64:$d, (opnode2 (opnode1 RegF64:$a,
- RegF64:$b),
- fpimm:$c))]>;
+ (ins RndMode:$r, RegF64:$a, RegF64:$b, f64imm:$c),
+ !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
+ def rii64 : InstPTX<(outs RegF64:$d),
+ (ins RndMode:$r, RegF64:$a, f64imm:$b, f64imm:$c),
+ !strconcat(opcstr, "$r.f64\t$d, $a, $b, $c"), []>;
}
-multiclass INT3<string opcstr, SDNode opnode> {
+//===- Integer Instructions - 3 Operand Form ------------------------------===//
+multiclass PTX_INT3<string opcstr, SDNode opnode> {
def rr16 : InstPTX<(outs RegI16:$d),
(ins RegI16:$a, RegI16:$b),
!strconcat(opcstr, ".u16\t$d, $a, $b"),
@@ -296,6 +167,35 @@ multiclass INT3<string opcstr, SDNode opnode> {
[(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
}
+//===- Integer Instructions - 3 Operand Form (Signed) ---------------------===//
+multiclass PTX_INT3_SIGNED<string opcstr, SDNode opnode> {
+ def rr16 : InstPTX<(outs RegI16:$d),
+ (ins RegI16:$a, RegI16:$b),
+ !strconcat(opcstr, ".s16\t$d, $a, $b"),
+ [(set RegI16:$d, (opnode RegI16:$a, RegI16:$b))]>;
+ def ri16 : InstPTX<(outs RegI16:$d),
+ (ins RegI16:$a, i16imm:$b),
+ !strconcat(opcstr, ".s16\t$d, $a, $b"),
+ [(set RegI16:$d, (opnode RegI16:$a, imm:$b))]>;
+ def rr32 : InstPTX<(outs RegI32:$d),
+ (ins RegI32:$a, RegI32:$b),
+ !strconcat(opcstr, ".s32\t$d, $a, $b"),
+ [(set RegI32:$d, (opnode RegI32:$a, RegI32:$b))]>;
+ def ri32 : InstPTX<(outs RegI32:$d),
+ (ins RegI32:$a, i32imm:$b),
+ !strconcat(opcstr, ".s32\t$d, $a, $b"),
+ [(set RegI32:$d, (opnode RegI32:$a, imm:$b))]>;
+ def rr64 : InstPTX<(outs RegI64:$d),
+ (ins RegI64:$a, RegI64:$b),
+ !strconcat(opcstr, ".s64\t$d, $a, $b"),
+ [(set RegI64:$d, (opnode RegI64:$a, RegI64:$b))]>;
+ def ri64 : InstPTX<(outs RegI64:$d),
+ (ins RegI64:$a, i64imm:$b),
+ !strconcat(opcstr, ".s64\t$d, $a, $b"),
+ [(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
+}
+
+//===- Bitwise Logic Instructions - 3 Operand Form ------------------------===//
multiclass PTX_LOGIC<string opcstr, SDNode opnode> {
def ripreds : InstPTX<(outs RegPred:$d),
(ins RegPred:$a, i1imm:$b),
@@ -331,7 +231,8 @@ multiclass PTX_LOGIC<string opcstr, SDNode opnode> {
[(set RegI64:$d, (opnode RegI64:$a, imm:$b))]>;
}
-multiclass INT3ntnc<string opcstr, SDNode opnode> {
+//===- Integer Shift Instructions - 3 Operand Form ------------------------===//
+multiclass PTX_INT3ntnc<string opcstr, SDNode opnode> {
def rr16 : InstPTX<(outs RegI16:$d),
(ins RegI16:$a, RegI16:$b),
!strconcat(opcstr, "16\t$d, $a, $b"),
@@ -370,6 +271,7 @@ multiclass INT3ntnc<string opcstr, SDNode opnode> {
[(set RegI64:$d, (opnode imm:$a, RegI64:$b))]>;
}
+//===- Set Predicate Instructions (Int) - 3/4 Operand Forms ---------------===//
multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls,
CondCode cmp, string cmpstr> {
// TODO support 5-operand format: p|q, a, b, c
@@ -385,56 +287,77 @@ multiclass PTX_SETP_I<RegisterClass RC, string regclsname, Operand immcls,
def rr_and_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
+ !strconcat("setp.", cmpstr, ".and.", regclsname,
+ "\t$p, $a, $b, $c"),
[(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
def ri_and_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
+ !strconcat("setp.", cmpstr, ".and.", regclsname,
+ "\t$p, $a, $b, $c"),
+ [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
+ RegPred:$c))]>;
def rr_or_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
+ !strconcat("setp.", cmpstr, ".or.", regclsname,
+ "\t$p, $a, $b, $c"),
[(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
def ri_or_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
+ !strconcat("setp.", cmpstr, ".or.", regclsname,
+ "\t$p, $a, $b, $c"),
[(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
def rr_xor_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
+ !strconcat("setp.", cmpstr, ".xor.", regclsname,
+ "\t$p, $a, $b, $c"),
[(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), RegPred:$c))]>;
def ri_xor_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), RegPred:$c))]>;
+ !strconcat("setp.", cmpstr, ".xor.", regclsname,
+ "\t$p, $a, $b, $c"),
+ [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
+ RegPred:$c))]>;
def rr_and_not_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".and.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (and (setcc RC:$a, RC:$b, cmp),
+ (not RegPred:$c)))]>;
def ri_and_not_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".and.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (and (setcc RC:$a, imm:$b, cmp),
+ (not RegPred:$c)))]>;
def rr_or_not_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".or.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (or (setcc RC:$a, RC:$b, cmp),
+ (not RegPred:$c)))]>;
def ri_or_not_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".or.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (or (setcc RC:$a, imm:$b, cmp),
+ (not RegPred:$c)))]>;
def rr_xor_not_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".xor.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, cmp),
+ (not RegPred:$c)))]>;
def ri_xor_not_r
: InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".xor.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (xor (setcc RC:$a, imm:$b, cmp),
+ (not RegPred:$c)))]>;
}
-multiclass PTX_SETP_FP<RegisterClass RC, string regclsname,
+//===- Set Predicate Instructions (FP) - 3/4 Operand Form -----------------===//
+multiclass PTX_SETP_FP<RegisterClass RC, string regclsname, Operand immcls,
CondCode ucmp, CondCode ocmp, string cmpstr> {
// TODO support 5-operand format: p|q, a, b, c
@@ -447,137 +370,110 @@ multiclass PTX_SETP_FP<RegisterClass RC, string regclsname,
!strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
[(set RegPred:$p, (setcc RC:$a, RC:$b, ocmp))]>;
+ def ri_u
+ : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
+ !strconcat("setp.", cmpstr, "u.", regclsname, "\t$p, $a, $b"),
+ [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ucmp))]>;
+ def ri_o
+ : InstPTX<(outs RegPred:$p), (ins RC:$a, immcls:$b),
+ !strconcat("setp.", cmpstr, ".", regclsname, "\t$p, $a, $b"),
+ [(set RegPred:$p, (setcc RC:$a, fpimm:$b, ocmp))]>;
+
def rr_and_r_u
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+ !strconcat("setp.", cmpstr, "u.and.", regclsname,
+ "\t$p, $a, $b, $c"),
+ [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
+ RegPred:$c))]>;
def rr_and_r_o
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+ !strconcat("setp.", cmpstr, ".and.", regclsname,
+ "\t$p, $a, $b, $c"),
+ [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
+ RegPred:$c))]>;
def rr_or_r_u
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, $c"),
+ !strconcat("setp.", cmpstr, "u.or.", regclsname,
+ "\t$p, $a, $b, $c"),
[(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
def rr_or_r_o
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, $c"),
+ !strconcat("setp.", cmpstr, ".or.", regclsname,
+ "\t$p, $a, $b, $c"),
[(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
def rr_xor_r_u
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), RegPred:$c))]>;
+ !strconcat("setp.", cmpstr, "u.xor.", regclsname,
+ "\t$p, $a, $b, $c"),
+ [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
+ RegPred:$c))]>;
def rr_xor_r_o
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, $c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), RegPred:$c))]>;
+ !strconcat("setp.", cmpstr, ".xor.", regclsname,
+ "\t$p, $a, $b, $c"),
+ [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
+ RegPred:$c))]>;
def rr_and_not_r_u
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.and.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, "u.and.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ucmp),
+ (not RegPred:$c)))]>;
def rr_and_not_r_o
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".and.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".and.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (and (setcc RC:$a, RC:$b, ocmp),
+ (not RegPred:$c)))]>;
def rr_or_not_r_u
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.or.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, "u.or.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ucmp),
+ (not RegPred:$c)))]>;
def rr_or_not_r_o
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".or.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".or.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (or (setcc RC:$a, RC:$b, ocmp),
+ (not RegPred:$c)))]>;
def rr_xor_not_r_u
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, "u.xor.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, "u.xor.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ucmp),
+ (not RegPred:$c)))]>;
def rr_xor_not_r_o
: InstPTX<(outs RegPred:$p), (ins RC:$a, RC:$b, RegPred:$c),
- !strconcat("setp.", cmpstr, ".xor.", regclsname, "\t$p, $a, $b, !$c"),
- [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp), (not RegPred:$c)))]>;
+ !strconcat("setp.", cmpstr, ".xor.", regclsname,
+ "\t$p, $a, $b, !$c"),
+ [(set RegPred:$p, (xor (setcc RC:$a, RC:$b, ocmp),
+ (not RegPred:$c)))]>;
}
-multiclass PTX_SELP<RegisterClass RC, string regclsname> {
+//===- Select Predicate Instructions - 4 Operand Form ---------------------===//
+multiclass PTX_SELP<RegisterClass RC, string regclsname, Operand immcls,
+ SDNode immnode> {
def rr
: InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, RC:$c),
!strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
[(set RC:$r, (select RegPred:$a, RC:$b, RC:$c))]>;
+ def ri
+ : InstPTX<(outs RC:$r), (ins RegPred:$a, RC:$b, immcls:$c),
+ !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+ [(set RC:$r, (select RegPred:$a, RC:$b, immnode:$c))]>;
+ def ii
+ : InstPTX<(outs RC:$r), (ins RegPred:$a, immcls:$b, immcls:$c),
+ !strconcat("selp.", regclsname, "\t$r, $b, $c, $a"),
+ [(set RC:$r, (select RegPred:$a, immnode:$b, immnode:$c))]>;
}
-multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_load> {
- def rr32 : InstPTX<(outs RC:$d),
- (ins MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRrr32:$a))]>, Requires<[Use32BitAddresses]>;
- def rr64 : InstPTX<(outs RC:$d),
- (ins MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRrr64:$a))]>, Requires<[Use64BitAddresses]>;
- def ri32 : InstPTX<(outs RC:$d),
- (ins MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRri32:$a))]>, Requires<[Use32BitAddresses]>;
- def ri64 : InstPTX<(outs RC:$d),
- (ins MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRri64:$a))]>, Requires<[Use64BitAddresses]>;
- def ii32 : InstPTX<(outs RC:$d),
- (ins MEMii32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRii32:$a))]>, Requires<[Use32BitAddresses]>;
- def ii64 : InstPTX<(outs RC:$d),
- (ins MEMii64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
- [(set RC:$d, (pat_load ADDRii64:$a))]>, Requires<[Use64BitAddresses]>;
-}
-
-multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> {
- defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>;
- defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>;
- defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>;
- defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>;
- defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>;
-}
-
-multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_store> {
- def rr32 : InstPTX<(outs),
- (ins RC:$d, MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRrr32:$a)]>, Requires<[Use32BitAddresses]>;
- def rr64 : InstPTX<(outs),
- (ins RC:$d, MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRrr64:$a)]>, Requires<[Use64BitAddresses]>;
- def ri32 : InstPTX<(outs),
- (ins RC:$d, MEMri32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRri32:$a)]>, Requires<[Use32BitAddresses]>;
- def ri64 : InstPTX<(outs),
- (ins RC:$d, MEMri64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRri64:$a)]>, Requires<[Use64BitAddresses]>;
- def ii32 : InstPTX<(outs),
- (ins RC:$d, MEMii32:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRii32:$a)]>, Requires<[Use32BitAddresses]>;
- def ii64 : InstPTX<(outs),
- (ins RC:$d, MEMii64:$a),
- !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
- [(pat_store RC:$d, ADDRii64:$a)]>, Requires<[Use64BitAddresses]>;
-}
-multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
- defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>;
- defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>;
- defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>;
- defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>;
- defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>;
-}
//===----------------------------------------------------------------------===//
// Instructions
@@ -585,118 +481,61 @@ multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
///===- Integer Arithmetic Instructions -----------------------------------===//
-defm ADD : INT3<"add", add>;
-defm SUB : INT3<"sub", sub>;
-defm MUL : INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies
-defm DIV : INT3<"div", udiv>;
-defm REM : INT3<"rem", urem>;
+defm ADD : PTX_INT3<"add", add>;
+defm SUB : PTX_INT3<"sub", sub>;
+defm MUL : PTX_INT3<"mul.lo", mul>; // FIXME: Allow 32x32 -> 64 multiplies
+defm DIV : PTX_INT3<"div", udiv>;
+defm SDIV : PTX_INT3_SIGNED<"div", sdiv>;
+defm REM : PTX_INT3<"rem", urem>;
///===- Floating-Point Arithmetic Instructions ----------------------------===//
-// Standard Unary Operations
-defm FNEG : PTX_FLOAT_2OP<"neg", fneg>;
+// FNEG
+defm FNEG : PTX_FLOAT_2OP<"neg">;
// Standard Binary Operations
-defm FADD : PTX_FLOAT_3OP<"add.rn", fadd>;
-defm FSUB : PTX_FLOAT_3OP<"sub.rn", fsub>;
-defm FMUL : PTX_FLOAT_3OP<"mul.rn", fmul>;
-
-// For floating-point division:
-// SM_13+ defaults to .rn for f32 and f64,
-// SM10 must *not* provide a rounding
-
-// TODO:
-// - Allow user selection of rounding modes for fdiv
-// - Add support for -prec-div=false (.approx)
-
-def FDIVrr32SM13 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, RegF32:$b),
- "div.rn.f32\t$d, $a, $b",
- [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>,
- Requires<[FDivNeedsRoundingMode]>;
-def FDIVri32SM13 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, f32imm:$b),
- "div.rn.f32\t$d, $a, $b",
- [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>,
- Requires<[FDivNeedsRoundingMode]>;
-def FDIVrr32SM10 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, RegF32:$b),
- "div.f32\t$d, $a, $b",
- [(set RegF32:$d, (fdiv RegF32:$a, RegF32:$b))]>,
- Requires<[FDivNoRoundingMode]>;
-def FDIVri32SM10 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a, f32imm:$b),
- "div.f32\t$d, $a, $b",
- [(set RegF32:$d, (fdiv RegF32:$a, fpimm:$b))]>,
- Requires<[FDivNoRoundingMode]>;
-
-def FDIVrr64SM13 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, RegF64:$b),
- "div.rn.f64\t$d, $a, $b",
- [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>,
- Requires<[FDivNeedsRoundingMode]>;
-def FDIVri64SM13 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, f64imm:$b),
- "div.rn.f64\t$d, $a, $b",
- [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>,
- Requires<[FDivNeedsRoundingMode]>;
-def FDIVrr64SM10 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, RegF64:$b),
- "div.f64\t$d, $a, $b",
- [(set RegF64:$d, (fdiv RegF64:$a, RegF64:$b))]>,
- Requires<[FDivNoRoundingMode]>;
-def FDIVri64SM10 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a, f64imm:$b),
- "div.f64\t$d, $a, $b",
- [(set RegF64:$d, (fdiv RegF64:$a, fpimm:$b))]>,
- Requires<[FDivNoRoundingMode]>;
-
-
+defm FADD : PTX_FLOAT_3OP<"add">;
+defm FSUB : PTX_FLOAT_3OP<"sub">;
+defm FMUL : PTX_FLOAT_3OP<"mul">;
+defm FDIV : PTX_FLOAT_3OP<"div">;
// Multi-operation hybrid instructions
+defm FMAD : PTX_FLOAT_4OP<"mad">, Requires<[SupportsFMA]>;
-// The selection of mad/fma is tricky. In some cases, they are the *same*
-// instruction, but in other cases we may prefer one or the other. Also,
-// different PTX versions differ on whether rounding mode flags are required.
-// In the short term, mad is supported on all PTX versions and we use a
-// default rounding mode no matter what shader model or PTX version.
-// TODO: Allow the rounding mode to be selectable through llc.
-defm FMADSM13 : PTX_FLOAT_4OP<"mad.rn", fmul, fadd>,
- Requires<[FMadNeedsRoundingMode, SupportsFMA]>;
-defm FMAD : PTX_FLOAT_4OP<"mad", fmul, fadd>,
- Requires<[FMadNoRoundingMode, SupportsFMA]>;
///===- Floating-Point Intrinsic Instructions -----------------------------===//
-def FSQRT32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a),
- "sqrt.rn.f32\t$d, $a",
- [(set RegF32:$d, (fsqrt RegF32:$a))]>;
-
-def FSQRT64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a),
- "sqrt.rn.f64\t$d, $a",
- [(set RegF64:$d, (fsqrt RegF64:$a))]>;
-
-def FSIN32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a),
- "sin.approx.f32\t$d, $a",
- [(set RegF32:$d, (fsin RegF32:$a))]>;
+// SQRT
+def FSQRTrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+ "sqrt$r.f32\t$d, $a", []>;
+def FSQRTri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+ "sqrt$r.f32\t$d, $a", []>;
+def FSQRTrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+ "sqrt$r.f64\t$d, $a", []>;
+def FSQRTri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+ "sqrt$r.f64\t$d, $a", []>;
+
+// SIN
+def FSINrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+ "sin$r.f32\t$d, $a", []>;
+def FSINri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+ "sin$r.f32\t$d, $a", []>;
+def FSINrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+ "sin$r.f64\t$d, $a", []>;
+def FSINri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+ "sin$r.f64\t$d, $a", []>;
+
+// COS
+def FCOSrr32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF32:$a),
+ "cos$r.f32\t$d, $a", []>;
+def FCOSri32 : InstPTX<(outs RegF32:$d), (ins RndMode:$r, f32imm:$a),
+ "cos$r.f32\t$d, $a", []>;
+def FCOSrr64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegF64:$a),
+ "cos$r.f64\t$d, $a", []>;
+def FCOSri64 : InstPTX<(outs RegF64:$d), (ins RndMode:$r, f64imm:$a),
+ "cos$r.f64\t$d, $a", []>;
-def FSIN64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a),
- "sin.approx.f64\t$d, $a",
- [(set RegF64:$d, (fsin RegF64:$a))]>;
-def FCOS32 : InstPTX<(outs RegF32:$d),
- (ins RegF32:$a),
- "cos.approx.f32\t$d, $a",
- [(set RegF32:$d, (fcos RegF32:$a))]>;
-
-def FCOS64 : InstPTX<(outs RegF64:$d),
- (ins RegF64:$a),
- "cos.approx.f64\t$d, $a",
- [(set RegF64:$d, (fcos RegF64:$a))]>;
///===- Comparison and Selection Instructions -----------------------------===//
@@ -744,35 +583,35 @@ defm SETPGEs64 : PTX_SETP_I<RegI64, "s64", i64imm, SETGE, "ge">;
// Compare f32
-defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", SETUEQ, SETOEQ, "eq">;
-defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", SETUNE, SETONE, "ne">;
-defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", SETULT, SETOLT, "lt">;
-defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", SETULE, SETOLE, "le">;
-defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", SETUGT, SETOGT, "gt">;
-defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", SETUGE, SETOGE, "ge">;
+defm SETPEQf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUEQ, SETOEQ, "eq">;
+defm SETPNEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUNE, SETONE, "ne">;
+defm SETPLTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULT, SETOLT, "lt">;
+defm SETPLEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETULE, SETOLE, "le">;
+defm SETPGTf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGT, SETOGT, "gt">;
+defm SETPGEf32 : PTX_SETP_FP<RegF32, "f32", f32imm, SETUGE, SETOGE, "ge">;
// Compare f64
-defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", SETUEQ, SETOEQ, "eq">;
-defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", SETUNE, SETONE, "ne">;
-defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", SETULT, SETOLT, "lt">;
-defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", SETULE, SETOLE, "le">;
-defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", SETUGT, SETOGT, "gt">;
-defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", SETUGE, SETOGE, "ge">;
+defm SETPEQf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUEQ, SETOEQ, "eq">;
+defm SETPNEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUNE, SETONE, "ne">;
+defm SETPLTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULT, SETOLT, "lt">;
+defm SETPLEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETULE, SETOLE, "le">;
+defm SETPGTf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGT, SETOGT, "gt">;
+defm SETPGEf64 : PTX_SETP_FP<RegF64, "f64", f64imm, SETUGE, SETOGE, "ge">;
// .selp
-defm PTX_SELPu16 : PTX_SELP<RegI16, "u16">;
-defm PTX_SELPu32 : PTX_SELP<RegI32, "u32">;
-defm PTX_SELPu64 : PTX_SELP<RegI64, "u64">;
-defm PTX_SELPf32 : PTX_SELP<RegF32, "f32">;
-defm PTX_SELPf64 : PTX_SELP<RegF64, "f64">;
+defm SELPi16 : PTX_SELP<RegI16, "u16", i16imm, imm>;
+defm SELPi32 : PTX_SELP<RegI32, "u32", i32imm, imm>;
+defm SELPi64 : PTX_SELP<RegI64, "u64", i64imm, imm>;
+defm SELPf32 : PTX_SELP<RegF32, "f32", f32imm, fpimm>;
+defm SELPf64 : PTX_SELP<RegF64, "f64", f64imm, fpimm>;
///===- Logic and Shift Instructions --------------------------------------===//
-defm SHL : INT3ntnc<"shl.b", PTXshl>;
-defm SRL : INT3ntnc<"shr.u", PTXsrl>;
-defm SRA : INT3ntnc<"shr.s", PTXsra>;
+defm SHL : PTX_INT3ntnc<"shl.b", PTXshl>;
+defm SRL : PTX_INT3ntnc<"shr.u", PTXsrl>;
+defm SRA : PTX_INT3ntnc<"shr.s", PTXsra>;
defm AND : PTX_LOGIC<"and", and>;
defm OR : PTX_LOGIC<"or", or>;
@@ -780,6 +619,24 @@ defm XOR : PTX_LOGIC<"xor", xor>;
///===- Data Movement and Conversion Instructions -------------------------===//
+// any_extend
+// Implement the anyext instruction in terms of the PTX cvt instructions.
+//def : Pat<(i32 (anyext RegI16:$a)), (CVT_u32_u16 RegI16:$a)>;
+//def : Pat<(i64 (anyext RegI16:$a)), (CVT_u64_u16 RegI16:$a)>;
+//def : Pat<(i64 (anyext RegI32:$a)), (CVT_u64_u32 RegI32:$a)>;
+
+// bitconvert
+// These instructions implement the bit-wise conversion between integer and
+// floating-point types.
+def MOVi32f32
+ : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "mov.b32\t$d, $a", []>;
+def MOVf32i32
+ : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "mov.b32\t$d, $a", []>;
+def MOVi64f64
+ : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "mov.b64\t$d, $a", []>;
+def MOVf64i64
+ : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "mov.b64\t$d, $a", []>;
+
let neverHasSideEffects = 1 in {
def MOVPREDrr
: InstPTX<(outs RegPred:$d), (ins RegPred:$a), "mov.pred\t$d, $a", []>;
@@ -825,278 +682,332 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
[(set RegI64:$d, (PTXcopyaddress tglobaladdr:$a))]>;
}
-// Loads
-defm LDg : PTX_LD_ALL<"ld.global", load_global>;
-defm LDc : PTX_LD_ALL<"ld.const", load_constant>;
-defm LDl : PTX_LD_ALL<"ld.local", load_local>;
-defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
+// PTX cvt instructions
+// Note: not all of these may actually be used; we just define all possible
+// patterns here (that make sense).
+// FIXME: Can we collapse this somehow into a multiclass def?
+
+// To i16
+def CVTu16u32
+ : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a", []>;
+def CVTu16u64
+ : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a", []>;
+def CVTu16f32
+ : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
+ "cvt$r.u16.f32\t$d, $a", []>;
+def CVTs16f32
+ : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF32:$a),
+ "cvt$r.s16.f32\t$d, $a", []>;
+def CVTu16f64
+ : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
+ "cvt$r.u16.f64\t$d, $a", []>;
+def CVTs16f64
+ : InstPTX<(outs RegI16:$d), (ins RndMode:$r, RegF64:$a),
+ "cvt$r.s16.f64\t$d, $a", []>;
+
+// To i32
+def CVTu32u16
+ : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a", []>;
+def CVTs32s16
+ : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.s32.s16\t$d, $a", []>;
+def CVTu32u64
+ : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a", []>;
+def CVTu32f32
+ : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
+ "cvt$r.u32.f32\t$d, $a", []>;
+def CVTs32f32
+ : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF32:$a),
+ "cvt$r.s32.f32\t$d, $a", []>;
+def CVTu32f64
+ : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
+ "cvt$r.u32.f64\t$d, $a", []>;
+def CVTs32f64
+ : InstPTX<(outs RegI32:$d), (ins RndMode:$r, RegF64:$a),
+ "cvt$r.s32.f64\t$d, $a", []>;
+
+// To i64
+def CVTu64u16
+ : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a", []>;
+def CVTs64s16
+ : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.s64.s16\t$d, $a", []>;
+def CVTu64u32
+ : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a", []>;
+def CVTs64s32
+ : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.s64.s32\t$d, $a", []>;
+def CVTu64f32
+ : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
+ "cvt$r.u64.f32\t$d, $a", []>;
+def CVTs64f32
+ : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF32:$a),
+ "cvt$r.s64.f32\t$d, $a", []>;
+def CVTu64f64
+ : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
+ "cvt$r.u64.f64\t$d, $a", []>;
+def CVTs64f64
+ : InstPTX<(outs RegI64:$d), (ins RndMode:$r, RegF64:$a),
+ "cvt$r.s64.f64\t$d, $a", []>;
+
+// To f32
+def CVTf32u16
+ : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
+ "cvt$r.f32.u16\t$d, $a", []>;
+def CVTf32s16
+ : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI16:$a),
+ "cvt$r.f32.s16\t$d, $a", []>;
+def CVTf32u32
+ : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
+ "cvt$r.f32.u32\t$d, $a", []>;
+def CVTf32s32
+ : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI32:$a),
+ "cvt$r.f32.s32\t$d, $a", []>;
+def CVTf32u64
+ : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
+ "cvt$r.f32.u64\t$d, $a", []>;
+def CVTf32s64
+ : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegI64:$a),
+ "cvt$r.f32.s64\t$d, $a", []>;
+def CVTf32f64
+ : InstPTX<(outs RegF32:$d), (ins RndMode:$r, RegF64:$a),
+ "cvt$r.f32.f64\t$d, $a", []>;
+
+// To f64
+def CVTf64u16
+ : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
+ "cvt$r.f64.u16\t$d, $a", []>;
+def CVTf64s16
+ : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI16:$a),
+ "cvt$r.f64.s16\t$d, $a", []>;
+def CVTf64u32
+ : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
+ "cvt$r.f64.u32\t$d, $a", []>;
+def CVTf64s32
+ : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI32:$a),
+ "cvt$r.f64.s32\t$d, $a", []>;
+def CVTf64u64
+ : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
+ "cvt$r.f64.u64\t$d, $a", []>;
+def CVTf64s64
+ : InstPTX<(outs RegF64:$d), (ins RndMode:$r, RegI64:$a),
+ "cvt$r.f64.s64\t$d, $a", []>;
+def CVTf64f32
+ : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a", []>;
+
+///===- Control Flow Instructions -----------------------------------------===//
-// These instructions are used to load/store from the .param space for
-// device and kernel parameters
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+ def BRAd
+ : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>;
+}
-let hasSideEffects = 1 in {
- def LDpiPred : InstPTX<(outs RegPred:$d), (ins MEMpi:$a),
- "ld.param.pred\t$d, [$a]",
- [(set RegPred:$d, (PTXloadparam timm:$a))]>;
- def LDpiU16 : InstPTX<(outs RegI16:$d), (ins MEMpi:$a),
- "ld.param.u16\t$d, [$a]",
- [(set RegI16:$d, (PTXloadparam timm:$a))]>;
- def LDpiU32 : InstPTX<(outs RegI32:$d), (ins MEMpi:$a),
- "ld.param.u32\t$d, [$a]",
- [(set RegI32:$d, (PTXloadparam timm:$a))]>;
- def LDpiU64 : InstPTX<(outs RegI64:$d), (ins MEMpi:$a),
- "ld.param.u64\t$d, [$a]",
- [(set RegI64:$d, (PTXloadparam timm:$a))]>;
- def LDpiF32 : InstPTX<(outs RegF32:$d), (ins MEMpi:$a),
- "ld.param.f32\t$d, [$a]",
- [(set RegF32:$d, (PTXloadparam timm:$a))]>;
- def LDpiF64 : InstPTX<(outs RegF64:$d), (ins MEMpi:$a),
- "ld.param.f64\t$d, [$a]",
- [(set RegF64:$d, (PTXloadparam timm:$a))]>;
-
- def STpiPred : InstPTX<(outs), (ins MEMret:$d, RegPred:$a),
- "st.param.pred\t[$d], $a",
- [(PTXstoreparam timm:$d, RegPred:$a)]>;
- def STpiU16 : InstPTX<(outs), (ins MEMret:$d, RegI16:$a),
- "st.param.u16\t[$d], $a",
- [(PTXstoreparam timm:$d, RegI16:$a)]>;
- def STpiU32 : InstPTX<(outs), (ins MEMret:$d, RegI32:$a),
- "st.param.u32\t[$d], $a",
- [(PTXstoreparam timm:$d, RegI32:$a)]>;
- def STpiU64 : InstPTX<(outs), (ins MEMret:$d, RegI64:$a),
- "st.param.u64\t[$d], $a",
- [(PTXstoreparam timm:$d, RegI64:$a)]>;
- def STpiF32 : InstPTX<(outs), (ins MEMret:$d, RegF32:$a),
- "st.param.f32\t[$d], $a",
- [(PTXstoreparam timm:$d, RegF32:$a)]>;
- def STpiF64 : InstPTX<(outs), (ins MEMret:$d, RegF64:$a),
- "st.param.f64\t[$d], $a",
- [(PTXstoreparam timm:$d, RegF64:$a)]>;
+let isBranch = 1, isTerminator = 1 in {
+  // FIXME: The pattern part is blank because we do not yet know how to use
+  // the first operand of PredicateOperand (a RegPred register) here.
+ def BRAdp
+ : InstPTX<(outs), (ins brtarget:$d), "bra\t$d",
+ [/*(brcond pred:$_p, bb:$d)*/]>;
}
-// Stores
-defm STg : PTX_ST_ALL<"st.global", store_global>;
-defm STl : PTX_ST_ALL<"st.local", store_local>;
-defm STs : PTX_ST_ALL<"st.shared", store_shared>;
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+ def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>;
+ def RET : InstPTX<(outs), (ins), "ret", [(PTXret)]>;
+}
-// defm STp : PTX_ST_ALL<"st.param", store_parameter>;
-// defm LDp : PTX_LD_ALL<"ld.param", load_parameter>;
-// TODO: Do something with st.param if/when it is needed.
+let hasSideEffects = 1 in {
+ def CALL : InstPTX<(outs), (ins), "call", [(PTXcall)]>;
+}
-// Conversion to pred
-// PTX does not directly support converting to a predicate type, so we fake it
-// by performing a greater-than test between the value and zero. This follows
-// the C convention that any non-zero value is equivalent to 'true'.
-def CVT_pred_u16
- : InstPTX<(outs RegPred:$d), (ins RegI16:$a), "setp.gt.u16\t$d, $a, 0",
- [(set RegPred:$d, (trunc RegI16:$a))]>;
+///===- Parameter Passing Pseudo-Instructions -----------------------------===//
+
+def READPARAMPRED : InstPTX<(outs RegPred:$a), (ins i32imm:$b),
+ "mov.pred\t$a, %param$b", []>;
+def READPARAMI16 : InstPTX<(outs RegI16:$a), (ins i32imm:$b),
+ "mov.b16\t$a, %param$b", []>;
+def READPARAMI32 : InstPTX<(outs RegI32:$a), (ins i32imm:$b),
+ "mov.b32\t$a, %param$b", []>;
+def READPARAMI64 : InstPTX<(outs RegI64:$a), (ins i32imm:$b),
+ "mov.b64\t$a, %param$b", []>;
+def READPARAMF32 : InstPTX<(outs RegF32:$a), (ins i32imm:$b),
+ "mov.f32\t$a, %param$b", []>;
+def READPARAMF64 : InstPTX<(outs RegF64:$a), (ins i32imm:$b),
+ "mov.f64\t$a, %param$b", []>;
+
+def WRITEPARAMPRED : InstPTX<(outs), (ins RegPred:$a), "//w", []>;
+def WRITEPARAMI16 : InstPTX<(outs), (ins RegI16:$a), "//w", []>;
+def WRITEPARAMI32 : InstPTX<(outs), (ins RegI32:$a), "//w", []>;
+def WRITEPARAMI64 : InstPTX<(outs), (ins RegI64:$a), "//w", []>;
+def WRITEPARAMF32 : InstPTX<(outs), (ins RegF32:$a), "//w", []>;
+def WRITEPARAMF64 : InstPTX<(outs), (ins RegF64:$a), "//w", []>;
-def CVT_pred_u32
- : InstPTX<(outs RegPred:$d), (ins RegI32:$a), "setp.gt.u32\t$d, $a, 0",
- [(set RegPred:$d, (trunc RegI32:$a))]>;
-def CVT_pred_u64
- : InstPTX<(outs RegPred:$d), (ins RegI64:$a), "setp.gt.u64\t$d, $a, 0",
- [(set RegPred:$d, (trunc RegI64:$a))]>;
+//===----------------------------------------------------------------------===//
+// Instruction Selection Patterns
+//===----------------------------------------------------------------------===//
-def CVT_pred_f32
- : InstPTX<(outs RegPred:$d), (ins RegF32:$a), "setp.gt.f32\t$d, $a, 0",
- [(set RegPred:$d, (fp_to_uint RegF32:$a))]>;
+// FADD
+def : Pat<(f32 (fadd RegF32:$a, RegF32:$b)),
+ (FADDrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fadd RegF32:$a, fpimm:$b)),
+ (FADDri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fadd RegF64:$a, RegF64:$b)),
+ (FADDrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fadd RegF64:$a, fpimm:$b)),
+ (FADDri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FSUB
+def : Pat<(f32 (fsub RegF32:$a, RegF32:$b)),
+ (FSUBrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fsub RegF32:$a, fpimm:$b)),
+ (FSUBri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fsub RegF64:$a, RegF64:$b)),
+ (FSUBrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fsub RegF64:$a, fpimm:$b)),
+ (FSUBri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FMUL
+def : Pat<(f32 (fmul RegF32:$a, RegF32:$b)),
+ (FMULrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fmul RegF32:$a, fpimm:$b)),
+ (FMULri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fmul RegF64:$a, RegF64:$b)),
+ (FMULrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fmul RegF64:$a, fpimm:$b)),
+ (FMULri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FDIV
+def : Pat<(f32 (fdiv RegF32:$a, RegF32:$b)),
+ (FDIVrr32 RndDefault, RegF32:$a, RegF32:$b)>;
+def : Pat<(f32 (fdiv RegF32:$a, fpimm:$b)),
+ (FDIVri32 RndDefault, RegF32:$a, fpimm:$b)>;
+def : Pat<(f64 (fdiv RegF64:$a, RegF64:$b)),
+ (FDIVrr64 RndDefault, RegF64:$a, RegF64:$b)>;
+def : Pat<(f64 (fdiv RegF64:$a, fpimm:$b)),
+ (FDIVri64 RndDefault, RegF64:$a, fpimm:$b)>;
+
+// FMUL+FADD
+def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), RegF32:$c)),
+ (FMADrrr32 RndDefault, RegF32:$a, RegF32:$b, RegF32:$c)>;
+def : Pat<(f32 (fadd (fmul RegF32:$a, RegF32:$b), fpimm:$c)),
+ (FMADrri32 RndDefault, RegF32:$a, RegF32:$b, fpimm:$c)>;
+def : Pat<(f32 (fadd (fmul RegF32:$a, fpimm:$b), fpimm:$c)),
+          (FMADrri32 RndDefault, RegF32:$a, fpimm:$b, fpimm:$c)>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), RegF64:$c)),
+ (FMADrrr64 RndDefault, RegF64:$a, RegF64:$b, RegF64:$c)>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, RegF64:$b), fpimm:$c)),
+ (FMADrri64 RndDefault, RegF64:$a, RegF64:$b, fpimm:$c)>;
+def : Pat<(f64 (fadd (fmul RegF64:$a, fpimm:$b), fpimm:$c)),
+ (FMADrri64 RndDefault, RegF64:$a, fpimm:$b, fpimm:$c)>;
+
+// FNEG
+def : Pat<(f32 (fneg RegF32:$a)), (FNEGrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fneg fpimm:$a)), (FNEGri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fneg RegF64:$a)), (FNEGrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fneg fpimm:$a)), (FNEGri64 RndDefault, fpimm:$a)>;
+
+// FSQRT
+def : Pat<(f32 (fsqrt RegF32:$a)), (FSQRTrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fsqrt fpimm:$a)), (FSQRTri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fsqrt RegF64:$a)), (FSQRTrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fsqrt fpimm:$a)), (FSQRTri64 RndDefault, fpimm:$a)>;
+
+// FSIN
+def : Pat<(f32 (fsin RegF32:$a)), (FSINrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fsin fpimm:$a)), (FSINri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fsin RegF64:$a)), (FSINrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fsin fpimm:$a)), (FSINri64 RndDefault, fpimm:$a)>;
+
+// FCOS
+def : Pat<(f32 (fcos RegF32:$a)), (FCOSrr32 RndDefault, RegF32:$a)>;
+def : Pat<(f32 (fcos fpimm:$a)), (FCOSri32 RndDefault, fpimm:$a)>;
+def : Pat<(f64 (fcos RegF64:$a)), (FCOSrr64 RndDefault, RegF64:$a)>;
+def : Pat<(f64 (fcos fpimm:$a)), (FCOSri64 RndDefault, fpimm:$a)>;
+
+// Type conversion notes:
+// - PTX does not directly support converting a predicate to a value, so we
+// use a select instruction to select either 0 or 1 (integer or fp) based
+// on the truth value of the predicate.
+// - PTX does not directly support converting to a predicate type, so we fake it
+// by performing a greater-than test between the value and zero. This follows
+// the C convention that any non-zero value is equivalent to 'true'.
-def CVT_pred_f64
- : InstPTX<(outs RegPred:$d), (ins RegF64:$a), "setp.gt.f64\t$d, $a, 0",
- [(set RegPred:$d, (fp_to_uint RegF64:$a))]>;
+// Conversion to pred
+def : Pat<(i1 (trunc RegI16:$a)), (SETPGTu16ri RegI16:$a, 0)>;
+def : Pat<(i1 (trunc RegI32:$a)), (SETPGTu32ri RegI32:$a, 0)>;
+def : Pat<(i1 (trunc RegI64:$a)), (SETPGTu64ri RegI64:$a, 0)>;
+def : Pat<(i1 (fp_to_uint RegF32:$a)), (SETPGTu32ri (MOVi32f32 RegF32:$a), 0)>;
+def : Pat<(i1 (fp_to_uint RegF64:$a)), (SETPGTu64ri (MOVi64f64 RegF64:$a), 0)>;
// Conversion to u16
-// PTX does not directly support converting a predicate to a value, so we
-// use a select instruction to select either 0 or 1 (integer or fp) based
-// on the truth value of the predicate.
-def CVT_u16_preda
- : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.u16\t$d, 1, 0, $a",
- [(set RegI16:$d, (anyext RegPred:$a))]>;
-
-def CVT_u16_pred
- : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.u16\t$d, 1, 0, $a",
- [(set RegI16:$d, (zext RegPred:$a))]>;
-
-def CVT_u16_preds
- : InstPTX<(outs RegI16:$d), (ins RegPred:$a), "selp.u16\t$d, 1, 0, $a",
- [(set RegI16:$d, (sext RegPred:$a))]>;
-
-def CVT_u16_u32
- : InstPTX<(outs RegI16:$d), (ins RegI32:$a), "cvt.u16.u32\t$d, $a",
- [(set RegI16:$d, (trunc RegI32:$a))]>;
-
-def CVT_u16_u64
- : InstPTX<(outs RegI16:$d), (ins RegI64:$a), "cvt.u16.u64\t$d, $a",
- [(set RegI16:$d, (trunc RegI64:$a))]>;
-
-def CVT_u16_f32
- : InstPTX<(outs RegI16:$d), (ins RegF32:$a), "cvt.rzi.u16.f32\t$d, $a",
- [(set RegI16:$d, (fp_to_uint RegF32:$a))]>;
-
-def CVT_u16_f64
- : InstPTX<(outs RegI16:$d), (ins RegF64:$a), "cvt.rzi.u16.f64\t$d, $a",
- [(set RegI16:$d, (fp_to_uint RegF64:$a))]>;
+def : Pat<(i16 (anyext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>;
+def : Pat<(i16 (sext RegPred:$a)), (SELPi16ii RegPred:$a, 0xFFFF, 0)>;
+def : Pat<(i16 (zext RegPred:$a)), (SELPi16ii RegPred:$a, 1, 0)>;
+def : Pat<(i16 (trunc RegI32:$a)), (CVTu16u32 RegI32:$a)>;
+def : Pat<(i16 (trunc RegI64:$a)), (CVTu16u64 RegI64:$a)>;
+def : Pat<(i16 (fp_to_uint RegF32:$a)), (CVTu16f32 RndDefault, RegF32:$a)>;
+def : Pat<(i16 (fp_to_sint RegF32:$a)), (CVTs16f32 RndDefault, RegF32:$a)>;
+def : Pat<(i16 (fp_to_uint RegF64:$a)), (CVTu16f64 RndDefault, RegF64:$a)>;
+def : Pat<(i16 (fp_to_sint RegF64:$a)), (CVTs16f64 RndDefault, RegF64:$a)>;
// Conversion to u32
-
-def CVT_u32_pred
- : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "selp.u32\t$d, 1, 0, $a",
- [(set RegI32:$d, (zext RegPred:$a))]>;
-
-def CVT_u32_b16
- : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a",
- [(set RegI32:$d, (anyext RegI16:$a))]>;
-
-def CVT_u32_u16
- : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.u16\t$d, $a",
- [(set RegI32:$d, (zext RegI16:$a))]>;
-
-def CVT_u32_preds
- : InstPTX<(outs RegI32:$d), (ins RegPred:$a), "selp.u32\t$d, 1, 0, $a",
- [(set RegI32:$d, (sext RegPred:$a))]>;
-
-def CVT_u32_s16
- : InstPTX<(outs RegI32:$d), (ins RegI16:$a), "cvt.u32.s16\t$d, $a",
- [(set RegI32:$d, (sext RegI16:$a))]>;
-
-def CVT_u32_u64
- : InstPTX<(outs RegI32:$d), (ins RegI64:$a), "cvt.u32.u64\t$d, $a",
- [(set RegI32:$d, (trunc RegI64:$a))]>;
-
-def CVT_u32_f32
- : InstPTX<(outs RegI32:$d), (ins RegF32:$a), "cvt.rzi.u32.f32\t$d, $a",
- [(set RegI32:$d, (fp_to_uint RegF32:$a))]>;
-
-def CVT_u32_f64
- : InstPTX<(outs RegI32:$d), (ins RegF64:$a), "cvt.rzi.u32.f64\t$d, $a",
- [(set RegI32:$d, (fp_to_uint RegF64:$a))]>;
+def : Pat<(i32 (anyext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>;
+def : Pat<(i32 (sext RegPred:$a)), (SELPi32ii RegPred:$a, 0xFFFFFFFF, 0)>;
+def : Pat<(i32 (zext RegPred:$a)), (SELPi32ii RegPred:$a, 1, 0)>;
+def : Pat<(i32 (anyext RegI16:$a)), (CVTu32u16 RegI16:$a)>;
+def : Pat<(i32 (sext RegI16:$a)), (CVTs32s16 RegI16:$a)>;
+def : Pat<(i32 (zext RegI16:$a)), (CVTu32u16 RegI16:$a)>;
+def : Pat<(i32 (trunc RegI64:$a)), (CVTu32u64 RegI64:$a)>;
+def : Pat<(i32 (fp_to_uint RegF32:$a)), (CVTu32f32 RndDefault, RegF32:$a)>;
+def : Pat<(i32 (fp_to_sint RegF32:$a)), (CVTs32f32 RndDefault, RegF32:$a)>;
+def : Pat<(i32 (fp_to_uint RegF64:$a)), (CVTu32f64 RndDefault, RegF64:$a)>;
+def : Pat<(i32 (fp_to_sint RegF64:$a)), (CVTs32f64 RndDefault, RegF64:$a)>;
+def : Pat<(i32 (bitconvert RegF32:$a)), (MOVi32f32 RegF32:$a)>;
// Conversion to u64
-
-def CVT_u64_pred
- : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "selp.u64\t$d, 1, 0, $a",
- [(set RegI64:$d, (zext RegPred:$a))]>;
-
-def CVT_u64_preds
- : InstPTX<(outs RegI64:$d), (ins RegPred:$a), "selp.u64\t$d, 1, 0, $a",
- [(set RegI64:$d, (sext RegPred:$a))]>;
-
-def CVT_u64_u16
- : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.u16\t$d, $a",
- [(set RegI64:$d, (zext RegI16:$a))]>;
-
-def CVT_u64_s16
- : InstPTX<(outs RegI64:$d), (ins RegI16:$a), "cvt.u64.s16\t$d, $a",
- [(set RegI64:$d, (sext RegI16:$a))]>;
-
-def CVT_u64_u32
- : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.u32\t$d, $a",
- [(set RegI64:$d, (zext RegI32:$a))]>;
-
-def CVT_u64_s32
- : InstPTX<(outs RegI64:$d), (ins RegI32:$a), "cvt.u64.s32\t$d, $a",
- [(set RegI64:$d, (sext RegI32:$a))]>;
-
-def CVT_u64_f32
- : InstPTX<(outs RegI64:$d), (ins RegF32:$a), "cvt.rzi.u64.f32\t$d, $a",
- [(set RegI64:$d, (fp_to_uint RegF32:$a))]>;
-
-def CVT_u64_f64
- : InstPTX<(outs RegI64:$d), (ins RegF64:$a), "cvt.rzi.u64.f64\t$d, $a",
- [(set RegI64:$d, (fp_to_uint RegF64:$a))]>;
+def : Pat<(i64 (anyext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>;
+def : Pat<(i64 (sext RegPred:$a)), (SELPi64ii RegPred:$a,
+ 0xFFFFFFFFFFFFFFFF, 0)>;
+def : Pat<(i64 (zext RegPred:$a)), (SELPi64ii RegPred:$a, 1, 0)>;
+def : Pat<(i64 (anyext RegI16:$a)), (CVTu64u16 RegI16:$a)>;
+def : Pat<(i64 (sext RegI16:$a)), (CVTs64s16 RegI16:$a)>;
+def : Pat<(i64 (zext RegI16:$a)), (CVTu64u16 RegI16:$a)>;
+def : Pat<(i64 (anyext RegI32:$a)), (CVTu64u32 RegI32:$a)>;
+def : Pat<(i64 (sext RegI32:$a)), (CVTs64s32 RegI32:$a)>;
+def : Pat<(i64 (zext RegI32:$a)), (CVTu64u32 RegI32:$a)>;
+def : Pat<(i64 (fp_to_uint RegF32:$a)), (CVTu64f32 RndDefault, RegF32:$a)>;
+def : Pat<(i64 (fp_to_sint RegF32:$a)), (CVTs64f32 RndDefault, RegF32:$a)>;
+def : Pat<(i64 (fp_to_uint RegF64:$a)), (CVTu64f64 RndDefault, RegF64:$a)>;
+def : Pat<(i64 (fp_to_sint RegF64:$a)), (CVTs64f64 RndDefault, RegF64:$a)>;
+def : Pat<(i64 (bitconvert RegF64:$a)), (MOVi64f64 RegF64:$a)>;
// Conversion to f32
-
-def CVT_f32_pred
- : InstPTX<(outs RegF32:$d), (ins RegPred:$a),
- "selp.f32\t$d, 0F3F800000, 0F00000000, $a", // 1.0
- [(set RegF32:$d, (uint_to_fp RegPred:$a))]>;
-
-def CVT_f32_u16
- : InstPTX<(outs RegF32:$d), (ins RegI16:$a), "cvt.rn.f32.u16\t$d, $a",
- [(set RegF32:$d, (uint_to_fp RegI16:$a))]>;
-
-def CVT_f32_u32
- : InstPTX<(outs RegF32:$d), (ins RegI32:$a), "cvt.rn.f32.u32\t$d, $a",
- [(set RegF32:$d, (uint_to_fp RegI32:$a))]>;
-
-def CVT_f32_u64
- : InstPTX<(outs RegF32:$d), (ins RegI64:$a), "cvt.rn.f32.u64\t$d, $a",
- [(set RegF32:$d, (uint_to_fp RegI64:$a))]>;
-
-def CVT_f32_f64
- : InstPTX<(outs RegF32:$d), (ins RegF64:$a), "cvt.rn.f32.f64\t$d, $a",
- [(set RegF32:$d, (fround RegF64:$a))]>;
+def : Pat<(f32 (uint_to_fp RegPred:$a)), (SELPf32rr RegPred:$a,
+ (MOVf32i32 0x3F800000), (MOVf32i32 0))>;
+def : Pat<(f32 (uint_to_fp RegI16:$a)), (CVTf32u16 RndDefault, RegI16:$a)>;
+def : Pat<(f32 (sint_to_fp RegI16:$a)), (CVTf32s16 RndDefault, RegI16:$a)>;
+def : Pat<(f32 (uint_to_fp RegI32:$a)), (CVTf32u32 RndDefault, RegI32:$a)>;
+def : Pat<(f32 (sint_to_fp RegI32:$a)), (CVTf32s32 RndDefault, RegI32:$a)>;
+def : Pat<(f32 (uint_to_fp RegI64:$a)), (CVTf32u64 RndDefault, RegI64:$a)>;
+def : Pat<(f32 (sint_to_fp RegI64:$a)), (CVTf32s64 RndDefault, RegI64:$a)>;
+def : Pat<(f32 (fround RegF64:$a)), (CVTf32f64 RndDefault, RegF64:$a)>;
+def : Pat<(f32 (bitconvert RegI32:$a)), (MOVf32i32 RegI32:$a)>;
// Conversion to f64
+def : Pat<(f64 (uint_to_fp RegPred:$a)), (SELPf64rr RegPred:$a,
+          (MOVf64i64 0x3FF0000000000000), (MOVf64i64 0))>; // 1.0 in f64 bits
+def : Pat<(f64 (uint_to_fp RegI16:$a)), (CVTf64u16 RndDefault, RegI16:$a)>;
+def : Pat<(f64 (sint_to_fp RegI16:$a)), (CVTf64s16 RndDefault, RegI16:$a)>;
+def : Pat<(f64 (uint_to_fp RegI32:$a)), (CVTf64u32 RndDefault, RegI32:$a)>;
+def : Pat<(f64 (sint_to_fp RegI32:$a)), (CVTf64s32 RndDefault, RegI32:$a)>;
+def : Pat<(f64 (uint_to_fp RegI64:$a)), (CVTf64u64 RndDefault, RegI64:$a)>;
+def : Pat<(f64 (sint_to_fp RegI64:$a)), (CVTf64s64 RndDefault, RegI64:$a)>;
+def : Pat<(f64 (fextend RegF32:$a)), (CVTf64f32 RegF32:$a)>;
+def : Pat<(f64 (bitconvert RegI64:$a)), (MOVf64i64 RegI64:$a)>;
-def CVT_f64_pred
- : InstPTX<(outs RegF64:$d), (ins RegPred:$a),
- "selp.f64\t$d, 0D3F80000000000000, 0D0000000000000000, $a", // 1.0
- [(set RegF64:$d, (uint_to_fp RegPred:$a))]>;
-
-def CVT_f64_u16
- : InstPTX<(outs RegF64:$d), (ins RegI16:$a), "cvt.rn.f64.u16\t$d, $a",
- [(set RegF64:$d, (uint_to_fp RegI16:$a))]>;
-
-def CVT_f64_u32
- : InstPTX<(outs RegF64:$d), (ins RegI32:$a), "cvt.rn.f64.u32\t$d, $a",
- [(set RegF64:$d, (uint_to_fp RegI32:$a))]>;
-
-def CVT_f64_u64
- : InstPTX<(outs RegF64:$d), (ins RegI64:$a), "cvt.rn.f64.u64\t$d, $a",
- [(set RegF64:$d, (uint_to_fp RegI64:$a))]>;
-
-def CVT_f64_f32
- : InstPTX<(outs RegF64:$d), (ins RegF32:$a), "cvt.f64.f32\t$d, $a",
- [(set RegF64:$d, (fextend RegF32:$a))]>;
-
-///===- Control Flow Instructions -----------------------------------------===//
-
-let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
- def BRAd
- : InstPTX<(outs), (ins brtarget:$d), "bra\t$d", [(br bb:$d)]>;
-}
-
-let isBranch = 1, isTerminator = 1 in {
- // FIXME: The pattern part is blank because I cannot (or do not yet know
- // how to) use the first operand of PredicateOperand (a RegPred register) here
- def BRAdp
- : InstPTX<(outs), (ins brtarget:$d), "bra\t$d",
- [/*(brcond pred:$_p, bb:$d)*/]>;
-}
-
-let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
- def EXIT : InstPTX<(outs), (ins), "exit", [(PTXexit)]>;
- def RET : InstPTX<(outs), (ins), "ret", [(PTXret)]>;
-}
-
-///===- Spill Instructions ------------------------------------------------===//
-// Special instructions used for stack spilling
-def STACKSTOREI16 : InstPTX<(outs), (ins i32imm:$d, RegI16:$a),
- "mov.u16\ts$d, $a", []>;
-def STACKSTOREI32 : InstPTX<(outs), (ins i32imm:$d, RegI32:$a),
- "mov.u32\ts$d, $a", []>;
-def STACKSTOREI64 : InstPTX<(outs), (ins i32imm:$d, RegI64:$a),
- "mov.u64\ts$d, $a", []>;
-def STACKSTOREF32 : InstPTX<(outs), (ins i32imm:$d, RegF32:$a),
- "mov.f32\ts$d, $a", []>;
-def STACKSTOREF64 : InstPTX<(outs), (ins i32imm:$d, RegF64:$a),
- "mov.f64\ts$d, $a", []>;
-
-def STACKLOADI16 : InstPTX<(outs), (ins RegI16:$d, i32imm:$a),
- "mov.u16\t$d, s$a", []>;
-def STACKLOADI32 : InstPTX<(outs), (ins RegI32:$d, i32imm:$a),
- "mov.u32\t$d, s$a", []>;
-def STACKLOADI64 : InstPTX<(outs), (ins RegI64:$d, i32imm:$a),
- "mov.u64\t$d, s$a", []>;
-def STACKLOADF32 : InstPTX<(outs), (ins RegF32:$d, i32imm:$a),
- "mov.f32\t$d, s$a", []>;
-def STACKLOADF64 : InstPTX<(outs), (ins RegF64:$d, i32imm:$a),
- "mov.f64\t$d, s$a", []>;
///===- Intrinsic Instructions --------------------------------------------===//
-
include "PTXIntrinsicInstrInfo.td"
+
+///===- Load/Store Instructions -------------------------------------------===//
+include "PTXInstrLoadStore.td"
+
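The bitconvert MOVs and the cvt definitions above model two different kinds of
conversion: mov.b32/mov.b64 copy bits between an integer and a floating-point
register unchanged, while cvt performs a rounding value conversion. A minimal
C++ sketch of that distinction, with illustrative helper names:

    #include <cstdint>
    #include <cstring>

    // What the MOVi32f32 pattern models: reinterpret the 32 bits of a float.
    static uint32_t bitsOfFloat(float F) {
      uint32_t Bits;
      std::memcpy(&Bits, &F, sizeof Bits); // raw bit copy, like mov.b32
      return Bits;
    }

    // What CVTu32f32 models instead: a value conversion that rounds.
    static uint32_t valueOfFloat(float F) {
      return static_cast<uint32_t>(F);     // like cvt<rnd>.u32.f32
    }

Accordingly, the instruction-selection patterns above pair bitconvert with the
MOV forms and fp_to_uint/uint_to_fp with the CVT forms.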
diff --git a/lib/Target/PTX/PTXInstrLoadStore.td b/lib/Target/PTX/PTXInstrLoadStore.td
new file mode 100644
index 0000000..9b4f56c
--- /dev/null
+++ b/lib/Target/PTX/PTXInstrLoadStore.td
@@ -0,0 +1,278 @@
+//===- PTXInstrLoadStore.td - PTX Load/Store Instruction Defs -*- tblgen-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PTX load/store instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Addressing Predicates
+// We have to differentiate between 32- and 64-bit pointer types
+def Use32BitAddresses : Predicate<"!getSubtarget().is64Bit()">;
+def Use64BitAddresses : Predicate<"getSubtarget().is64Bit()">;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments for Loads/Stores
+//===----------------------------------------------------------------------===//
+
+def load_global : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ const Value *Src;
+ const PointerType *PT;
+ if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
+ (PT = dyn_cast<PointerType>(Src->getType())))
+ return PT->getAddressSpace() == PTXStateSpace::Global;
+ return false;
+}]>;
+
+def load_constant : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ const Value *Src;
+ const PointerType *PT;
+ if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
+ (PT = dyn_cast<PointerType>(Src->getType())))
+ return PT->getAddressSpace() == PTXStateSpace::Constant;
+ return false;
+}]>;
+
+def load_shared : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ const Value *Src;
+ const PointerType *PT;
+ if ((Src = cast<LoadSDNode>(N)->getSrcValue()) &&
+ (PT = dyn_cast<PointerType>(Src->getType())))
+ return PT->getAddressSpace() == PTXStateSpace::Shared;
+ return false;
+}]>;
+
+def store_global
+ : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
+ const Value *Src;
+ const PointerType *PT;
+ if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
+ (PT = dyn_cast<PointerType>(Src->getType())))
+ return PT->getAddressSpace() == PTXStateSpace::Global;
+ return false;
+}]>;
+
+def store_shared
+ : PatFrag<(ops node:$d, node:$ptr), (store node:$d, node:$ptr), [{
+ const Value *Src;
+ const PointerType *PT;
+ if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
+ (PT = dyn_cast<PointerType>(Src->getType())))
+ return PT->getAddressSpace() == PTXStateSpace::Shared;
+ return false;
+}]>;
+
+// Addressing modes.
+def ADDRrr32 : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRrr64 : ComplexPattern<i64, 2, "SelectADDRrr", [], []>;
+def ADDRri32 : ComplexPattern<i32, 2, "SelectADDRri", [], []>;
+def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri", [], []>;
+def ADDRii32 : ComplexPattern<i32, 2, "SelectADDRii", [], []>;
+def ADDRii64 : ComplexPattern<i64, 2, "SelectADDRii", [], []>;
+def ADDRlocal32 : ComplexPattern<i32, 2, "SelectADDRlocal", [], []>;
+def ADDRlocal64 : ComplexPattern<i64, 2, "SelectADDRlocal", [], []>;
+
+// Address operands
+def MEMri32 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops RegI32, i32imm);
+}
+def MEMri64 : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops RegI64, i64imm);
+}
+def LOCALri32 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+def LOCALri64 : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops i64imm, i64imm);
+}
+def MEMii32 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops i32imm, i32imm);
+}
+def MEMii64 : Operand<i64> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops i64imm, i64imm);
+}
+// The operand here does not correspond to an actual address, so we
+// can use i32 in 64-bit address modes.
+def MEMpi : Operand<i32> {
+ let PrintMethod = "printParamOperand";
+ let MIOperandInfo = (ops i32imm);
+}
+def MEMret : Operand<i32> {
+ let PrintMethod = "printReturnOperand";
+ let MIOperandInfo = (ops i32imm);
+}
+
+
+// Load/store .param space
+def PTXloadparam
+ : SDNode<"PTXISD::LOAD_PARAM", SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+def PTXstoreparam
+ : SDNode<"PTXISD::STORE_PARAM", SDTypeProfile<0, 2, [SDTCisVT<0, i32>]>,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+
+def PTXreadparam
+ : SDNode<"PTXISD::READ_PARAM", SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+def PTXwriteparam
+ : SDNode<"PTXISD::WRITE_PARAM", SDTypeProfile<0, 1, []>,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue]>;
+
+
+
+//===----------------------------------------------------------------------===//
+// Classes for loads/stores
+//===----------------------------------------------------------------------===//
+multiclass PTX_LD<string opstr, string typestr,
+ RegisterClass RC, PatFrag pat_load> {
+ def rr32 : InstPTX<(outs RC:$d),
+ (ins MEMri32:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (pat_load ADDRrr32:$a))]>,
+ Requires<[Use32BitAddresses]>;
+ def rr64 : InstPTX<(outs RC:$d),
+ (ins MEMri64:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (pat_load ADDRrr64:$a))]>,
+ Requires<[Use64BitAddresses]>;
+ def ri32 : InstPTX<(outs RC:$d),
+ (ins MEMri32:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (pat_load ADDRri32:$a))]>,
+ Requires<[Use32BitAddresses]>;
+ def ri64 : InstPTX<(outs RC:$d),
+ (ins MEMri64:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (pat_load ADDRri64:$a))]>,
+ Requires<[Use64BitAddresses]>;
+ def ii32 : InstPTX<(outs RC:$d),
+ (ins MEMii32:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (pat_load ADDRii32:$a))]>,
+ Requires<[Use32BitAddresses]>;
+ def ii64 : InstPTX<(outs RC:$d),
+ (ins MEMii64:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (pat_load ADDRii64:$a))]>,
+ Requires<[Use64BitAddresses]>;
+}
+
+multiclass PTX_ST<string opstr, string typestr, RegisterClass RC,
+ PatFrag pat_store> {
+ def rr32 : InstPTX<(outs),
+ (ins RC:$d, MEMri32:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+ [(pat_store RC:$d, ADDRrr32:$a)]>,
+ Requires<[Use32BitAddresses]>;
+ def rr64 : InstPTX<(outs),
+ (ins RC:$d, MEMri64:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+ [(pat_store RC:$d, ADDRrr64:$a)]>,
+ Requires<[Use64BitAddresses]>;
+ def ri32 : InstPTX<(outs),
+ (ins RC:$d, MEMri32:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+ [(pat_store RC:$d, ADDRri32:$a)]>,
+ Requires<[Use32BitAddresses]>;
+ def ri64 : InstPTX<(outs),
+ (ins RC:$d, MEMri64:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+ [(pat_store RC:$d, ADDRri64:$a)]>,
+ Requires<[Use64BitAddresses]>;
+ def ii32 : InstPTX<(outs),
+ (ins RC:$d, MEMii32:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+ [(pat_store RC:$d, ADDRii32:$a)]>,
+ Requires<[Use32BitAddresses]>;
+ def ii64 : InstPTX<(outs),
+ (ins RC:$d, MEMii64:$a),
+ !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
+ [(pat_store RC:$d, ADDRii64:$a)]>,
+ Requires<[Use64BitAddresses]>;
+}
+
+multiclass PTX_LOCAL_LD_ST<string typestr, RegisterClass RC> {
+ def LDri32 : InstPTX<(outs RC:$d), (ins LOCALri32:$a),
+ !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (load_global ADDRlocal32:$a))]>;
+ def LDri64 : InstPTX<(outs RC:$d), (ins LOCALri64:$a),
+ !strconcat("ld.local", !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (load_global ADDRlocal64:$a))]>;
+ def STri32 : InstPTX<(outs), (ins RC:$d, LOCALri32:$a),
+ !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
+ [(store_global RC:$d, ADDRlocal32:$a)]>;
+ def STri64 : InstPTX<(outs), (ins RC:$d, LOCALri64:$a),
+ !strconcat("st.local", !strconcat(typestr, "\t[$a], $d")),
+ [(store_global RC:$d, ADDRlocal64:$a)]>;
+}
+
+multiclass PTX_PARAM_LD_ST<string typestr, RegisterClass RC> {
+ let hasSideEffects = 1 in {
+ def LDpi : InstPTX<(outs RC:$d), (ins i32imm:$a),
+ !strconcat("ld.param", !strconcat(typestr, "\t$d, [$a]")),
+ [(set RC:$d, (PTXloadparam texternalsym:$a))]>;
+ def STpi : InstPTX<(outs), (ins i32imm:$d, RC:$a),
+ !strconcat("st.param", !strconcat(typestr, "\t[$d], $a")),
+ [(PTXstoreparam texternalsym:$d, RC:$a)]>;
+ }
+}
+
+multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> {
+ defm u16 : PTX_LD<opstr, ".u16", RegI16, pat_load>;
+ defm u32 : PTX_LD<opstr, ".u32", RegI32, pat_load>;
+ defm u64 : PTX_LD<opstr, ".u64", RegI64, pat_load>;
+ defm f32 : PTX_LD<opstr, ".f32", RegF32, pat_load>;
+ defm f64 : PTX_LD<opstr, ".f64", RegF64, pat_load>;
+}
+
+multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
+ defm u16 : PTX_ST<opstr, ".u16", RegI16, pat_store>;
+ defm u32 : PTX_ST<opstr, ".u32", RegI32, pat_store>;
+ defm u64 : PTX_ST<opstr, ".u64", RegI64, pat_store>;
+ defm f32 : PTX_ST<opstr, ".f32", RegF32, pat_store>;
+ defm f64 : PTX_ST<opstr, ".f64", RegF64, pat_store>;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Instruction definitions for loads/stores
+//===----------------------------------------------------------------------===//
+
+// Global/shared stores
+defm STg : PTX_ST_ALL<"st.global", store_global>;
+defm STs : PTX_ST_ALL<"st.shared", store_shared>;
+
+// Global/shared/constant loads
+defm LDg : PTX_LD_ALL<"ld.global", load_global>;
+defm LDc : PTX_LD_ALL<"ld.const", load_constant>;
+defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
+
+// Param loads/stores
+defm PARAMPRED : PTX_PARAM_LD_ST<".pred", RegPred>;
+defm PARAMU16 : PTX_PARAM_LD_ST<".u16", RegI16>;
+defm PARAMU32 : PTX_PARAM_LD_ST<".u32", RegI32>;
+defm PARAMU64 : PTX_PARAM_LD_ST<".u64", RegI64>;
+defm PARAMF32 : PTX_PARAM_LD_ST<".f32", RegF32>;
+defm PARAMF64 : PTX_PARAM_LD_ST<".f64", RegF64>;
+
+// Local loads/stores
+defm LOCALPRED : PTX_LOCAL_LD_ST<".pred", RegPred>;
+defm LOCALU16 : PTX_LOCAL_LD_ST<".u16", RegI16>;
+defm LOCALU32 : PTX_LOCAL_LD_ST<".u32", RegI32>;
+defm LOCALU64 : PTX_LOCAL_LD_ST<".u64", RegI64>;
+defm LOCALF32 : PTX_LOCAL_LD_ST<".f32", RegF32>;
+defm LOCALF64 : PTX_LOCAL_LD_ST<".f64", RegF64>;
+
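Each of the load_*/store_* fragments above repeats the same check: recover the
IR value behind the memory node and dispatch on its pointer's address space.
Factored into a standalone predicate it looks like this (a sketch; the helper
name is illustrative, and the usual LLVM headers for SelectionDAG nodes and
casting are assumed):

    static bool loadIsFromAddressSpace(const LoadSDNode *N, unsigned AddrSpace) {
      if (const Value *Src = N->getSrcValue())
        if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
          return PT->getAddressSpace() == AddrSpace;
      return false; // no IR pointer attached, or not a pointer type
    }

The PatFrag bodies inline exactly this logic with PTXStateSpace::Global,
PTXStateSpace::Constant, and PTXStateSpace::Shared as the address-space
constant.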
diff --git a/lib/Target/PTX/PTXIntrinsicInstrInfo.td b/lib/Target/PTX/PTXIntrinsicInstrInfo.td
index 8d97909..9de1cb6 100644
--- a/lib/Target/PTX/PTXIntrinsicInstrInfo.td
+++ b/lib/Target/PTX/PTXIntrinsicInstrInfo.td
@@ -25,37 +25,63 @@ class PTX_READ_SPECIAL_REGISTER_R32<string regname, Intrinsic intop>
// TODO Add read vector-version of special registers
-//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid", int_ptx_read_tid_r64>;
-def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x", int_ptx_read_tid_x>;
-def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y", int_ptx_read_tid_y>;
-def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z", int_ptx_read_tid_z>;
-def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w", int_ptx_read_tid_w>;
+//def PTX_READ_TID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"tid",
+// int_ptx_read_tid_r64>;
+def PTX_READ_TID_X : PTX_READ_SPECIAL_REGISTER_R32<"tid.x",
+ int_ptx_read_tid_x>;
+def PTX_READ_TID_Y : PTX_READ_SPECIAL_REGISTER_R32<"tid.y",
+ int_ptx_read_tid_y>;
+def PTX_READ_TID_Z : PTX_READ_SPECIAL_REGISTER_R32<"tid.z",
+ int_ptx_read_tid_z>;
+def PTX_READ_TID_W : PTX_READ_SPECIAL_REGISTER_R32<"tid.w",
+ int_ptx_read_tid_w>;
-//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid", int_ptx_read_ntid_r64>;
-def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x", int_ptx_read_ntid_x>;
-def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y", int_ptx_read_ntid_y>;
-def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z", int_ptx_read_ntid_z>;
-def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w", int_ptx_read_ntid_w>;
+//def PTX_READ_NTID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ntid",
+// int_ptx_read_ntid_r64>;
+def PTX_READ_NTID_X : PTX_READ_SPECIAL_REGISTER_R32<"ntid.x",
+ int_ptx_read_ntid_x>;
+def PTX_READ_NTID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ntid.y",
+ int_ptx_read_ntid_y>;
+def PTX_READ_NTID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ntid.z",
+ int_ptx_read_ntid_z>;
+def PTX_READ_NTID_W : PTX_READ_SPECIAL_REGISTER_R32<"ntid.w",
+ int_ptx_read_ntid_w>;
-def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid", int_ptx_read_laneid>;
-def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid", int_ptx_read_warpid>;
-def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid", int_ptx_read_nwarpid>;
+def PTX_READ_LANEID : PTX_READ_SPECIAL_REGISTER_R32<"laneid",
+ int_ptx_read_laneid>;
+def PTX_READ_WARPID : PTX_READ_SPECIAL_REGISTER_R32<"warpid",
+ int_ptx_read_warpid>;
+def PTX_READ_NWARPID : PTX_READ_SPECIAL_REGISTER_R32<"nwarpid",
+ int_ptx_read_nwarpid>;
-//def PTX_READ_CTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>;
-def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x", int_ptx_read_ctaid_x>;
-def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y", int_ptx_read_ctaid_y>;
-def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z", int_ptx_read_ctaid_z>;
-def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w", int_ptx_read_ctaid_w>;
+//def PTX_READ_CTAID_R64 :
+//PTX_READ_SPECIAL_REGISTER_R64<"ctaid", int_ptx_read_ctaid_r64>;
+def PTX_READ_CTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.x",
+ int_ptx_read_ctaid_x>;
+def PTX_READ_CTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.y",
+ int_ptx_read_ctaid_y>;
+def PTX_READ_CTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.z",
+ int_ptx_read_ctaid_z>;
+def PTX_READ_CTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"ctaid.w",
+ int_ptx_read_ctaid_w>;
-//def PTX_READ_NCTAID_R64 : PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>;
-def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x", int_ptx_read_nctaid_x>;
-def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y", int_ptx_read_nctaid_y>;
-def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z", int_ptx_read_nctaid_z>;
-def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w", int_ptx_read_nctaid_w>;
+//def PTX_READ_NCTAID_R64 :
+//PTX_READ_SPECIAL_REGISTER_R64<"nctaid", int_ptx_read_nctaid_r64>;
+def PTX_READ_NCTAID_X : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.x",
+ int_ptx_read_nctaid_x>;
+def PTX_READ_NCTAID_Y : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.y",
+ int_ptx_read_nctaid_y>;
+def PTX_READ_NCTAID_Z : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.z",
+ int_ptx_read_nctaid_z>;
+def PTX_READ_NCTAID_W : PTX_READ_SPECIAL_REGISTER_R32<"nctaid.w",
+ int_ptx_read_nctaid_w>;
-def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid", int_ptx_read_smid>;
-def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid", int_ptx_read_nsmid>;
-def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid", int_ptx_read_gridid>;
+def PTX_READ_SMID : PTX_READ_SPECIAL_REGISTER_R32<"smid",
+ int_ptx_read_smid>;
+def PTX_READ_NSMID : PTX_READ_SPECIAL_REGISTER_R32<"nsmid",
+ int_ptx_read_nsmid>;
+def PTX_READ_GRIDID : PTX_READ_SPECIAL_REGISTER_R32<"gridid",
+ int_ptx_read_gridid>;
def PTX_READ_LANEMASK_EQ
: PTX_READ_SPECIAL_REGISTER_R32<"lanemask_eq", int_ptx_read_lanemask_eq>;
diff --git a/lib/Target/PTX/PTXMCAsmStreamer.cpp b/lib/Target/PTX/PTXMCAsmStreamer.cpp
index b13a3da..468ce93 100644
--- a/lib/Target/PTX/PTXMCAsmStreamer.cpp
+++ b/lib/Target/PTX/PTXMCAsmStreamer.cpp
@@ -100,7 +100,7 @@ public:
/// @{
virtual void ChangeSection(const MCSection *Section);
- virtual void InitSections() {}
+ virtual void InitSections() { /* PTX does not use sections */ }
virtual void EmitLabel(MCSymbol *Symbol);
@@ -132,7 +132,9 @@ public:
///
/// @param Symbol - The common symbol to emit.
/// @param Size - The size of the common symbol.
- virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size);
+ /// @param ByteAlignment - The alignment of the common symbol in bytes.
+ virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
unsigned Size = 0, unsigned ByteAlignment = 0);
@@ -233,7 +235,7 @@ void PTXMCAsmStreamer::ChangeSection(const MCSection *Section) {
void PTXMCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
- //assert(getCurrentSection() && "Cannot emit before setting section!");
+ assert(getCurrentSection() && "Cannot emit before setting section!");
OS << *Symbol << MAI.getLabelSuffix();
EmitEOL();
@@ -283,7 +285,8 @@ void PTXMCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
void PTXMCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {}
-void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size) {}
+void PTXMCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {}
void PTXMCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
unsigned Size, unsigned ByteAlignment) {}
@@ -510,7 +513,7 @@ void PTXMCAsmStreamer::EmitInstruction(const MCInst &Inst) {
// If we have an AsmPrinter, use that to print, otherwise print the MCInst.
if (InstPrinter)
- InstPrinter->printInst(&Inst, OS);
+ InstPrinter->printInst(&Inst, OS, "");
else
Inst.print(OS, &MAI);
EmitEOL();
@@ -533,7 +536,7 @@ namespace llvm {
formatted_raw_ostream &OS,
bool isVerboseAsm, bool useLoc, bool useCFI,
MCInstPrinter *IP,
- MCCodeEmitter *CE, TargetAsmBackend *TAB,
+ MCCodeEmitter *CE, MCAsmBackend *MAB,
bool ShowInst) {
return new PTXMCAsmStreamer(Context, OS, isVerboseAsm, useLoc,
IP, CE, ShowInst);
diff --git a/lib/Target/PTX/PTXMCInstLower.cpp b/lib/Target/PTX/PTXMCInstLower.cpp
new file mode 100644
index 0000000..142e639
--- /dev/null
+++ b/lib/Target/PTX/PTXMCInstLower.cpp
@@ -0,0 +1,32 @@
+//===-- PTXMCInstLower.cpp - Convert PTX MachineInstr to an MCInst --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower PTX MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTX.h"
+#include "PTXAsmPrinter.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/Mangler.h"
+
+void llvm::LowerPTXMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
+ PTXAsmPrinter &AP) {
+ OutMI.setOpcode(MI->getOpcode());
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    OutMI.addOperand(AP.lowerOperand(MO));
+ }
+}
+
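A natural consumer of this helper is the asm printer's per-instruction hook; a
sketch of the expected call site (the exact body of
PTXAsmPrinter::EmitInstruction is not part of this patch, so this is an
assumption about its shape):

    void PTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
      MCInst TmpInst;
      LowerPTXMachineInstrToMCInst(MI, TmpInst, *this);
      OutStreamer.EmitInstruction(TmpInst); // print via the MC layer
    }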
diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp
index 6fe9e6c..b33a273 100644
--- a/lib/Target/PTX/PTXMFInfoExtract.cpp
+++ b/lib/Target/PTX/PTXMFInfoExtract.cpp
@@ -52,36 +52,12 @@ bool PTXMFInfoExtract::runOnMachineFunction(MachineFunction &MF) {
PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- DEBUG(dbgs() << "******** PTX FUNCTION LOCAL VAR REG DEF ********\n");
-
- DEBUG(dbgs()
- << "PTX::NoRegister == " << PTX::NoRegister << "\n"
- << "PTX::NUM_TARGET_REGS == " << PTX::NUM_TARGET_REGS << "\n");
-
- DEBUG(for (unsigned reg = PTX::NoRegister + 1;
- reg < PTX::NUM_TARGET_REGS; ++reg)
- if (MRI.isPhysRegUsed(reg))
- dbgs() << "Used Reg: " << reg << "\n";);
-
- // FIXME: This is a slow linear scanning
- for (unsigned reg = PTX::NoRegister + 1; reg < PTX::NUM_TARGET_REGS; ++reg)
- if (MRI.isPhysRegUsed(reg) &&
- !MFI->isRetReg(reg) &&
- (MFI->isKernel() || !MFI->isArgReg(reg)))
- MFI->addLocalVarReg(reg);
-
- // Notify MachineFunctionInfo that I've done adding local var reg
- MFI->doneAddLocalVar();
-
- DEBUG(for (PTXMachineFunctionInfo::reg_iterator
- i = MFI->argRegBegin(), e = MFI->argRegEnd();
- i != e; ++i)
- dbgs() << "Arg Reg: " << *i << "\n";);
-
- DEBUG(for (PTXMachineFunctionInfo::reg_iterator
- i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd();
- i != e; ++i)
- dbgs() << "Local Var Reg: " << *i << "\n";);
+ // Generate list of all virtual registers used in this function
+ for (unsigned i = 0; i < MRI.getNumVirtRegs(); ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ const TargetRegisterClass *TRC = MRI.getRegClass(Reg);
+ MFI->addVirtualRegister(TRC, Reg);
+ }
return false;
}
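The rewritten pass iterates virtual registers by dense index. Virtual register
numbers live in a reserved range apart from the physical registers, so
TargetRegisterInfo provides static helpers to map between the index space and
the register-number space:

    #include "llvm/Target/TargetRegisterInfo.h"
    using namespace llvm;

    static void demoVirtRegIndexing() {
      // Round-trip between a dense index and a virtual register number.
      unsigned Reg = TargetRegisterInfo::index2VirtReg(0);   // first virtual reg
      unsigned Idx = TargetRegisterInfo::virtReg2Index(Reg); // == 0
      (void)Idx;
    }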
diff --git a/lib/Target/PTX/PTXMachineFunctionInfo.h b/lib/Target/PTX/PTXMachineFunctionInfo.h
index 9d65f5b..3b985f7 100644
--- a/lib/Target/PTX/PTXMachineFunctionInfo.h
+++ b/lib/Target/PTX/PTXMachineFunctionInfo.h
@@ -15,75 +15,148 @@
#define PTX_MACHINE_FUNCTION_INFO_H
#include "PTX.h"
+#include "PTXParamManager.h"
+#include "PTXRegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
+
/// PTXMachineFunctionInfo - This class is derived from MachineFunction and
/// contains private PTX target-specific information for each MachineFunction.
///
class PTXMachineFunctionInfo : public MachineFunctionInfo {
private:
- bool is_kernel;
- std::vector<unsigned> reg_arg, reg_local_var;
- std::vector<unsigned> reg_ret;
- bool _isDoneAddArg;
+ bool IsKernel;
+ DenseSet<unsigned> RegArgs;
+ DenseSet<unsigned> RegRets;
+
+ typedef std::vector<unsigned> RegisterList;
+ typedef DenseMap<const TargetRegisterClass*, RegisterList> RegisterMap;
+ typedef DenseMap<unsigned, std::string> RegisterNameMap;
+ typedef DenseMap<int, std::string> FrameMap;
+
+ RegisterMap UsedRegs;
+ RegisterNameMap RegNames;
+ FrameMap FrameSymbols;
+
+ PTXParamManager ParamManager;
public:
+ typedef DenseSet<unsigned>::const_iterator reg_iterator;
+
PTXMachineFunctionInfo(MachineFunction &MF)
- : is_kernel(false), reg_ret(PTX::NoRegister), _isDoneAddArg(false) {
- reg_arg.reserve(8);
- reg_local_var.reserve(32);
+ : IsKernel(false) {
+ UsedRegs[PTX::RegPredRegisterClass] = RegisterList();
+ UsedRegs[PTX::RegI16RegisterClass] = RegisterList();
+ UsedRegs[PTX::RegI32RegisterClass] = RegisterList();
+ UsedRegs[PTX::RegI64RegisterClass] = RegisterList();
+ UsedRegs[PTX::RegF32RegisterClass] = RegisterList();
+ UsedRegs[PTX::RegF64RegisterClass] = RegisterList();
}
- void setKernel(bool _is_kernel=true) { is_kernel = _is_kernel; }
-
- void addArgReg(unsigned reg) { reg_arg.push_back(reg); }
- void addLocalVarReg(unsigned reg) { reg_local_var.push_back(reg); }
- void addRetReg(unsigned reg) {
- if (!isRetReg(reg)) {
- reg_ret.push_back(reg);
+ /// getParamManager - Returns the PTXParamManager instance for this function.
+ PTXParamManager& getParamManager() { return ParamManager; }
+ const PTXParamManager& getParamManager() const { return ParamManager; }
+
+ /// setKernel/isKernel - Gets/sets a flag that indicates if this function is
+ /// a PTX kernel function.
+ void setKernel(bool _IsKernel=true) { IsKernel = _IsKernel; }
+ bool isKernel() const { return IsKernel; }
+
+ /// argreg_begin/argreg_end - Returns iterators to the set of registers
+ /// containing function arguments.
+ reg_iterator argreg_begin() const { return RegArgs.begin(); }
+ reg_iterator argreg_end() const { return RegArgs.end(); }
+
+ /// retreg_begin/retreg_end - Returns iterators to the set of registers
+ /// containing the function return values.
+ reg_iterator retreg_begin() const { return RegRets.begin(); }
+ reg_iterator retreg_end() const { return RegRets.end(); }
+
+ /// addRetReg - Adds a register to the set of return-value registers.
+ void addRetReg(unsigned Reg) {
+ if (!RegRets.count(Reg)) {
+ RegRets.insert(Reg);
+ std::string name;
+ name = "%ret";
+ name += utostr(RegRets.size() - 1);
+ RegNames[Reg] = name;
}
}
- void doneAddArg(void) {
- _isDoneAddArg = true;
+ /// addArgReg - Adds a register to the set of function argument registers.
+ void addArgReg(unsigned Reg) {
+ RegArgs.insert(Reg);
+ std::string name;
+ name = "%param";
+ name += utostr(RegArgs.size() - 1);
+ RegNames[Reg] = name;
}
- void doneAddLocalVar(void) {}
-
- bool isKernel() const { return is_kernel; }
- typedef std::vector<unsigned>::const_iterator reg_iterator;
- typedef std::vector<unsigned>::const_reverse_iterator reg_reverse_iterator;
- typedef std::vector<unsigned>::const_iterator ret_iterator;
-
- bool argRegEmpty() const { return reg_arg.empty(); }
- int getNumArg() const { return reg_arg.size(); }
- reg_iterator argRegBegin() const { return reg_arg.begin(); }
- reg_iterator argRegEnd() const { return reg_arg.end(); }
- reg_reverse_iterator argRegReverseBegin() const { return reg_arg.rbegin(); }
- reg_reverse_iterator argRegReverseEnd() const { return reg_arg.rend(); }
-
- bool localVarRegEmpty() const { return reg_local_var.empty(); }
- reg_iterator localVarRegBegin() const { return reg_local_var.begin(); }
- reg_iterator localVarRegEnd() const { return reg_local_var.end(); }
-
- bool retRegEmpty() const { return reg_ret.empty(); }
- int getNumRet() const { return reg_ret.size(); }
- ret_iterator retRegBegin() const { return reg_ret.begin(); }
- ret_iterator retRegEnd() const { return reg_ret.end(); }
+ /// addVirtualRegister - Adds a virtual register to the set of all used
+ /// registers in the function.
+ void addVirtualRegister(const TargetRegisterClass *TRC, unsigned Reg) {
+ std::string name;
+
+ // Do not count registers that are argument/return registers.
+ if (!RegRets.count(Reg) && !RegArgs.count(Reg)) {
+ UsedRegs[TRC].push_back(Reg);
+ if (TRC == PTX::RegPredRegisterClass)
+ name = "%p";
+ else if (TRC == PTX::RegI16RegisterClass)
+ name = "%rh";
+ else if (TRC == PTX::RegI32RegisterClass)
+ name = "%r";
+ else if (TRC == PTX::RegI64RegisterClass)
+ name = "%rd";
+ else if (TRC == PTX::RegF32RegisterClass)
+ name = "%f";
+ else if (TRC == PTX::RegF64RegisterClass)
+ name = "%fd";
+ else
+ llvm_unreachable("Invalid register class");
+
+ name += utostr(UsedRegs[TRC].size() - 1);
+ RegNames[Reg] = name;
+ }
+ }
- bool isArgReg(unsigned reg) const {
- return std::find(reg_arg.begin(), reg_arg.end(), reg) != reg_arg.end();
+ /// getRegisterName - Returns the name of the specified virtual register. This
+ /// name is used during PTX emission.
+ const char *getRegisterName(unsigned Reg) const {
+ if (RegNames.count(Reg))
+ return RegNames.find(Reg)->second.c_str();
+ else if (Reg == PTX::NoRegister)
+ return "%noreg";
+ else
+ llvm_unreachable("Register not in register name map");
}
- bool isRetReg(unsigned reg) const {
- return std::find(reg_ret.begin(), reg_ret.end(), reg) != reg_ret.end();
+ /// getNumRegistersForClass - Returns the number of virtual registers that are
+ /// used for the specified register class.
+ unsigned getNumRegistersForClass(const TargetRegisterClass *TRC) const {
+ return UsedRegs.lookup(TRC).size();
}
- bool isLocalVarReg(unsigned reg) const {
- return std::find(reg_local_var.begin(), reg_local_var.end(), reg)
- != reg_local_var.end();
+ /// getFrameSymbol - Returns the symbol name for the given FrameIndex.
+ const char* getFrameSymbol(int FrameIndex) {
+ if (FrameSymbols.count(FrameIndex)) {
+ return FrameSymbols.lookup(FrameIndex).c_str();
+ } else {
+ std::string Name = "__local";
+ Name += utostr(FrameIndex);
+ // The whole point of caching this name is to ensure the pointer we pass
+ // to any getExternalSymbol() calls will remain valid for the lifetime of
+ // the back-end instance. This is to work around an issue in SelectionDAG
+ // where symbol names are expected to be life-long strings.
+ FrameSymbols[FrameIndex] = Name;
+ return FrameSymbols[FrameIndex].c_str();
+ }
}
}; // class PTXMachineFunctionInfo
} // namespace llvm
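addVirtualRegister hard-codes the PTX register-naming convention: each class
gets a prefix plus a running counter. The mapping, restated as a standalone
helper (the name is illustrative; the logic is identical to the code above):

    static const char *regClassPrefix(const TargetRegisterClass *TRC) {
      if (TRC == PTX::RegPredRegisterClass) return "%p";  // predicate
      if (TRC == PTX::RegI16RegisterClass)  return "%rh"; // 16-bit integer
      if (TRC == PTX::RegI32RegisterClass)  return "%r";  // 32-bit integer
      if (TRC == PTX::RegI64RegisterClass)  return "%rd"; // 64-bit integer
      if (TRC == PTX::RegF32RegisterClass)  return "%f";  // 32-bit float
      if (TRC == PTX::RegF64RegisterClass)  return "%fd"; // 64-bit float
      llvm_unreachable("Invalid register class");
    }

So the third RegI32 virtual register added, for example, is named %r2.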
diff --git a/lib/Target/PTX/PTXParamManager.cpp b/lib/Target/PTX/PTXParamManager.cpp
new file mode 100644
index 0000000..7753787
--- /dev/null
+++ b/lib/Target/PTX/PTXParamManager.cpp
@@ -0,0 +1,73 @@
+//===- PTXParamManager.cpp - Manager for .param variables -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTXParamManager class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PTX.h"
+#include "PTXParamManager.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+
+PTXParamManager::PTXParamManager() {
+}
+
+unsigned PTXParamManager::addArgumentParam(unsigned Size) {
+ PTXParam Param;
+ Param.Type = PTX_PARAM_TYPE_ARGUMENT;
+ Param.Size = Size;
+
+ std::string Name;
+ Name = "__param_";
+ Name += utostr(ArgumentParams.size()+1);
+ Param.Name = Name;
+
+ unsigned Index = AllParams.size();
+ AllParams[Index] = Param;
+ ArgumentParams.push_back(Index);
+
+ return Index;
+}
+
+unsigned PTXParamManager::addReturnParam(unsigned Size) {
+ PTXParam Param;
+ Param.Type = PTX_PARAM_TYPE_RETURN;
+ Param.Size = Size;
+
+ std::string Name;
+ Name = "__ret_";
+ Name += utostr(ReturnParams.size()+1);
+ Param.Name = Name;
+
+ unsigned Index = AllParams.size();
+ AllParams[Index] = Param;
+ ReturnParams.push_back(Index);
+
+ return Index;
+}
+
+unsigned PTXParamManager::addLocalParam(unsigned Size) {
+ PTXParam Param;
+ Param.Type = PTX_PARAM_TYPE_LOCAL;
+ Param.Size = Size;
+
+ std::string Name;
+ Name = "__localparam_";
+ Name += utostr(LocalParams.size()+1);
+ Param.Name = Name;
+
+ unsigned Index = AllParams.size();
+ AllParams[Index] = Param;
+ LocalParams.push_back(Index);
+
+ return Index;
+}
+
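Taken together, the three entry points register a function's .param variables
and hand back stable indices. A usage sketch for a kernel with one 32-bit
argument and a 32-bit return value (the assertions restate the naming and size
rules in the code above):

    #include <cassert>
    #include "PTXParamManager.h"
    using namespace llvm;

    static void demoParams() {
      PTXParamManager PM;
      unsigned Arg = PM.addArgumentParam(32); // named "__param_1"
      unsigned Ret = PM.addReturnParam(32);   // named "__ret_1"
      assert(PM.getParamName(Arg) == "__param_1");
      assert(PM.getParamSize(Ret) == 32);     // sizes are tracked in bits
    }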
diff --git a/lib/Target/PTX/PTXParamManager.h b/lib/Target/PTX/PTXParamManager.h
new file mode 100644
index 0000000..9fd2de5
--- /dev/null
+++ b/lib/Target/PTX/PTXParamManager.h
@@ -0,0 +1,86 @@
+//===- PTXParamManager.h - Manager for .param variables ----------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PTXParamManager class, which manages all defined .param
+// variables for a particular function.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTX_PARAM_MANAGER_H
+#define PTX_PARAM_MANAGER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+/// PTXParamManager - This class manages all .param variables defined for a
+/// particular function.
+class PTXParamManager {
+private:
+
+ /// PTXParamType - Type of a .param variable
+ enum PTXParamType {
+ PTX_PARAM_TYPE_ARGUMENT,
+ PTX_PARAM_TYPE_RETURN,
+ PTX_PARAM_TYPE_LOCAL
+ };
+
+ /// PTXParam - Definition of a PTX .param variable
+ struct PTXParam {
+ PTXParamType Type;
+ unsigned Size;
+ std::string Name;
+ };
+
+ DenseMap<unsigned, PTXParam> AllParams;
+ SmallVector<unsigned, 4> ArgumentParams;
+ SmallVector<unsigned, 4> ReturnParams;
+ SmallVector<unsigned, 4> LocalParams;
+
+public:
+
+ typedef SmallVector<unsigned, 4>::const_iterator param_iterator;
+
+ PTXParamManager();
+
+ param_iterator arg_begin() const { return ArgumentParams.begin(); }
+ param_iterator arg_end() const { return ArgumentParams.end(); }
+ param_iterator ret_begin() const { return ReturnParams.begin(); }
+ param_iterator ret_end() const { return ReturnParams.end(); }
+ param_iterator local_begin() const { return LocalParams.begin(); }
+ param_iterator local_end() const { return LocalParams.end(); }
+
+  /// addArgumentParam - Adds a new .param used as an argument and returns
+  /// its index.
+  unsigned addArgumentParam(unsigned Size);
+
+  /// addReturnParam - Adds a new .param used as a return value and returns
+  /// its index.
+  unsigned addReturnParam(unsigned Size);
+
+  /// addLocalParam - Adds a new .param used as a local .param variable and
+  /// returns its index.
+  unsigned addLocalParam(unsigned Size);
+
+ /// getParamName - Returns the name of the parameter as a string.
+ const std::string &getParamName(unsigned Param) const {
+ assert(AllParams.count(Param) == 1 && "Param has not been defined!");
+ return AllParams.find(Param)->second.Name;
+ }
+
+ /// getParamSize - Returns the size of the parameter in bits.
+ unsigned getParamSize(unsigned Param) const {
+ assert(AllParams.count(Param) == 1 && "Param has not been defined!");
+ return AllParams.find(Param)->second.Size;
+ }
+
+};
+
+}
+
+#endif
+
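As a rough usage sketch of the interface above: each addXParam() call returns a stable index into AllParams, and the iterators walk those indices in declaration order. The sizes and the dumpParams() helper below are made up for illustration; the sketch assumes the surrounding LLVM headers are available.

    #include "PTXParamManager.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void dumpParams() {
      PTXParamManager Params;
      Params.addArgumentParam(32);   // becomes "__param_1"
      Params.addArgumentParam(64);   // becomes "__param_2"
      Params.addReturnParam(32);     // becomes "__ret_1"

      // Prints __param_1 and __param_2 with their sizes.
      for (PTXParamManager::param_iterator I = Params.arg_begin(),
           E = Params.arg_end(); I != E; ++I)
        errs() << Params.getParamName(*I) << ": "
               << Params.getParamSize(*I) << "\n";
    }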
diff --git a/lib/Target/PTX/PTXRegAlloc.cpp b/lib/Target/PTX/PTXRegAlloc.cpp
new file mode 100644
index 0000000..2d2d5c3
--- /dev/null
+++ b/lib/Target/PTX/PTXRegAlloc.cpp
@@ -0,0 +1,58 @@
+//===-- PTXRegAlloc.cpp - PTX Register Allocator --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a register allocator for PTX code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-reg-alloc"
+
+#include "PTX.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+
+using namespace llvm;
+
+namespace {
+ // Special register allocator for PTX.
+ class PTXRegAlloc : public MachineFunctionPass {
+ public:
+ static char ID;
+ PTXRegAlloc() : MachineFunctionPass(ID) {
+ initializePHIEliminationPass(*PassRegistry::getPassRegistry());
+ initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual const char* getPassName() const {
+ return "PTX Register Allocator";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequiredID(PHIEliminationID);
+ AU.addRequiredID(TwoAddressInstructionPassID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ // We do not actually do anything (at least not yet).
+ return false;
+ }
+ };
+
+ char PTXRegAlloc::ID = 0;
+
+ static RegisterRegAlloc
+ ptxRegAlloc("ptx", "PTX register allocator", createPTXRegisterAllocator);
+}
+
+FunctionPass *llvm::createPTXRegisterAllocator() {
+ return new PTXRegAlloc();
+}
+
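The RegisterRegAlloc object above uses LLVM's static-constructor registration idiom: constructing a file-scope global registers the factory under a name ("ptx" here) so the allocator can later be selected by that name, e.g. through llc's -regalloc option. A minimal sketch of the underlying idea with simplified stand-in types (not LLVM's actual implementation):

    #include <map>
    #include <string>

    class FunctionPassStub {};                       // stand-in for FunctionPass
    typedef FunctionPassStub *(*RegAllocFactory)();

    static std::map<std::string, RegAllocFactory> &allocRegistry() {
      static std::map<std::string, RegAllocFactory> Registry;
      return Registry;                               // constructed on first use
    }

    struct RegisterAllocSketch {
      RegisterAllocSketch(const char *Name, RegAllocFactory F) {
        allocRegistry()[Name] = F;                   // runs before main()
      }
    };

    FunctionPassStub *createStubAllocator() { return new FunctionPassStub(); }

    // File-scope object: registration happens as a side effect of construction.
    static RegisterAllocSketch X("ptx", createStubAllocator);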
diff --git a/lib/Target/PTX/PTXRegisterInfo.cpp b/lib/Target/PTX/PTXRegisterInfo.cpp
index cb56ea9..c806266 100644
--- a/lib/Target/PTX/PTXRegisterInfo.cpp
+++ b/lib/Target/PTX/PTXRegisterInfo.cpp
@@ -14,6 +14,9 @@
#include "PTX.h"
#include "PTXRegisterInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -23,15 +26,23 @@
using namespace llvm;
PTXRegisterInfo::PTXRegisterInfo(PTXTargetMachine &TM,
- const TargetInstrInfo &TII)
- : PTXGenRegisterInfo() {
+ const TargetInstrInfo &tii)
+ // PTX does not have a return address register.
+ : PTXGenRegisterInfo(0), TII(tii) {
}
void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj,
RegScavenger *RS) const {
unsigned Index;
- MachineInstr& MI = *II;
+ MachineInstr &MI = *II;
+ //MachineBasicBlock &MBB = *MI.getParent();
+ //DebugLoc dl = MI.getDebugLoc();
+ //MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+
+ //unsigned Reg = MRI.createVirtualRegister(PTX::RegF32RegisterClass);
+
+ llvm_unreachable("FrameIndex should have been previously eliminated!");
Index = 0;
while (!MI.getOperand(Index).isFI()) {
@@ -46,6 +57,18 @@ void PTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
DEBUG(dbgs() << "- SPAdj: " << SPAdj << "\n");
DEBUG(dbgs() << "- FrameIndex: " << FrameIndex << "\n");
+ //MachineInstr* MI2 = BuildMI(MBB, II, dl, TII.get(PTX::LOAD_LOCAL_F32))
+ //.addReg(Reg, RegState::Define).addImm(FrameIndex);
+ //if (MI2->findFirstPredOperandIdx() == -1) {
+ // MI2->addOperand(MachineOperand::CreateReg(PTX::NoRegister, /*IsDef=*/false));
+ // MI2->addOperand(MachineOperand::CreateImm(PTX::PRED_NORMAL));
+ //}
+ //MI2->dump();
+
+ //MachineOperand ESOp = MachineOperand::CreateES("__local__");
+
  // By this point, the frame index reflects stack slot re-use assignments.
+ //MI.getOperand(Index).ChangeToRegister(Reg, false);
MI.getOperand(Index).ChangeToImmediate(FrameIndex);
+ //MI.getOperand(Index) = ESOp;
}
diff --git a/lib/Target/PTX/PTXRegisterInfo.h b/lib/Target/PTX/PTXRegisterInfo.h
index 0b63cb6..55fafe4 100644
--- a/lib/Target/PTX/PTXRegisterInfo.h
+++ b/lib/Target/PTX/PTXRegisterInfo.h
@@ -25,8 +25,12 @@ class PTXTargetMachine;
class MachineFunction;
struct PTXRegisterInfo : public PTXGenRegisterInfo {
+private:
+ const TargetInstrInfo &TII;
+
+public:
PTXRegisterInfo(PTXTargetMachine &TM,
- const TargetInstrInfo &TII);
+ const TargetInstrInfo &tii);
virtual const unsigned
*getCalleeSavedRegs(const MachineFunction *MF = 0) const {
@@ -47,18 +51,6 @@ struct PTXRegisterInfo : public PTXGenRegisterInfo {
llvm_unreachable("PTX does not have a frame register");
return 0;
}
-
- virtual unsigned getRARegister() const {
- llvm_unreachable("PTX does not have a return address register");
- return 0;
- }
-
- virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const {
- return PTXGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
- }
- virtual int getLLVMRegNum(unsigned RegNum, bool isEH) const {
- return PTXGenRegisterInfo::getLLVMRegNumFull(RegNum, 0);
- }
}; // struct PTXRegisterInfo
} // namespace llvm
diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td
index 1313d24..6ed6d3f 100644
--- a/lib/Target/PTX/PTXRegisterInfo.td
+++ b/lib/Target/PTX/PTXRegisterInfo.td
@@ -20,536 +20,18 @@ class PTXReg<string n> : Register<n> {
// Registers
//===----------------------------------------------------------------------===//
-///===- Predicate Registers -----------------------------------------------===//
-
-def P0 : PTXReg<"p0">;
-def P1 : PTXReg<"p1">;
-def P2 : PTXReg<"p2">;
-def P3 : PTXReg<"p3">;
-def P4 : PTXReg<"p4">;
-def P5 : PTXReg<"p5">;
-def P6 : PTXReg<"p6">;
-def P7 : PTXReg<"p7">;
-def P8 : PTXReg<"p8">;
-def P9 : PTXReg<"p9">;
-def P10 : PTXReg<"p10">;
-def P11 : PTXReg<"p11">;
-def P12 : PTXReg<"p12">;
-def P13 : PTXReg<"p13">;
-def P14 : PTXReg<"p14">;
-def P15 : PTXReg<"p15">;
-def P16 : PTXReg<"p16">;
-def P17 : PTXReg<"p17">;
-def P18 : PTXReg<"p18">;
-def P19 : PTXReg<"p19">;
-def P20 : PTXReg<"p20">;
-def P21 : PTXReg<"p21">;
-def P22 : PTXReg<"p22">;
-def P23 : PTXReg<"p23">;
-def P24 : PTXReg<"p24">;
-def P25 : PTXReg<"p25">;
-def P26 : PTXReg<"p26">;
-def P27 : PTXReg<"p27">;
-def P28 : PTXReg<"p28">;
-def P29 : PTXReg<"p29">;
-def P30 : PTXReg<"p30">;
-def P31 : PTXReg<"p31">;
-def P32 : PTXReg<"p32">;
-def P33 : PTXReg<"p33">;
-def P34 : PTXReg<"p34">;
-def P35 : PTXReg<"p35">;
-def P36 : PTXReg<"p36">;
-def P37 : PTXReg<"p37">;
-def P38 : PTXReg<"p38">;
-def P39 : PTXReg<"p39">;
-def P40 : PTXReg<"p40">;
-def P41 : PTXReg<"p41">;
-def P42 : PTXReg<"p42">;
-def P43 : PTXReg<"p43">;
-def P44 : PTXReg<"p44">;
-def P45 : PTXReg<"p45">;
-def P46 : PTXReg<"p46">;
-def P47 : PTXReg<"p47">;
-def P48 : PTXReg<"p48">;
-def P49 : PTXReg<"p49">;
-def P50 : PTXReg<"p50">;
-def P51 : PTXReg<"p51">;
-def P52 : PTXReg<"p52">;
-def P53 : PTXReg<"p53">;
-def P54 : PTXReg<"p54">;
-def P55 : PTXReg<"p55">;
-def P56 : PTXReg<"p56">;
-def P57 : PTXReg<"p57">;
-def P58 : PTXReg<"p58">;
-def P59 : PTXReg<"p59">;
-def P60 : PTXReg<"p60">;
-def P61 : PTXReg<"p61">;
-def P62 : PTXReg<"p62">;
-def P63 : PTXReg<"p63">;
-def P64 : PTXReg<"p64">;
-def P65 : PTXReg<"p65">;
-def P66 : PTXReg<"p66">;
-def P67 : PTXReg<"p67">;
-def P68 : PTXReg<"p68">;
-def P69 : PTXReg<"p69">;
-def P70 : PTXReg<"p70">;
-def P71 : PTXReg<"p71">;
-def P72 : PTXReg<"p72">;
-def P73 : PTXReg<"p73">;
-def P74 : PTXReg<"p74">;
-def P75 : PTXReg<"p75">;
-def P76 : PTXReg<"p76">;
-def P77 : PTXReg<"p77">;
-def P78 : PTXReg<"p78">;
-def P79 : PTXReg<"p79">;
-def P80 : PTXReg<"p80">;
-def P81 : PTXReg<"p81">;
-def P82 : PTXReg<"p82">;
-def P83 : PTXReg<"p83">;
-def P84 : PTXReg<"p84">;
-def P85 : PTXReg<"p85">;
-def P86 : PTXReg<"p86">;
-def P87 : PTXReg<"p87">;
-def P88 : PTXReg<"p88">;
-def P89 : PTXReg<"p89">;
-def P90 : PTXReg<"p90">;
-def P91 : PTXReg<"p91">;
-def P92 : PTXReg<"p92">;
-def P93 : PTXReg<"p93">;
-def P94 : PTXReg<"p94">;
-def P95 : PTXReg<"p95">;
-def P96 : PTXReg<"p96">;
-def P97 : PTXReg<"p97">;
-def P98 : PTXReg<"p98">;
-def P99 : PTXReg<"p99">;
-def P100 : PTXReg<"p100">;
-def P101 : PTXReg<"p101">;
-def P102 : PTXReg<"p102">;
-def P103 : PTXReg<"p103">;
-def P104 : PTXReg<"p104">;
-def P105 : PTXReg<"p105">;
-def P106 : PTXReg<"p106">;
-def P107 : PTXReg<"p107">;
-def P108 : PTXReg<"p108">;
-def P109 : PTXReg<"p109">;
-def P110 : PTXReg<"p110">;
-def P111 : PTXReg<"p111">;
-def P112 : PTXReg<"p112">;
-def P113 : PTXReg<"p113">;
-def P114 : PTXReg<"p114">;
-def P115 : PTXReg<"p115">;
-def P116 : PTXReg<"p116">;
-def P117 : PTXReg<"p117">;
-def P118 : PTXReg<"p118">;
-def P119 : PTXReg<"p119">;
-def P120 : PTXReg<"p120">;
-def P121 : PTXReg<"p121">;
-def P122 : PTXReg<"p122">;
-def P123 : PTXReg<"p123">;
-def P124 : PTXReg<"p124">;
-def P125 : PTXReg<"p125">;
-def P126 : PTXReg<"p126">;
-def P127 : PTXReg<"p127">;
-
-///===- 16-Bit Registers --------------------------------------------------===//
-
-def RH0 : PTXReg<"rh0">;
-def RH1 : PTXReg<"rh1">;
-def RH2 : PTXReg<"rh2">;
-def RH3 : PTXReg<"rh3">;
-def RH4 : PTXReg<"rh4">;
-def RH5 : PTXReg<"rh5">;
-def RH6 : PTXReg<"rh6">;
-def RH7 : PTXReg<"rh7">;
-def RH8 : PTXReg<"rh8">;
-def RH9 : PTXReg<"rh9">;
-def RH10 : PTXReg<"rh10">;
-def RH11 : PTXReg<"rh11">;
-def RH12 : PTXReg<"rh12">;
-def RH13 : PTXReg<"rh13">;
-def RH14 : PTXReg<"rh14">;
-def RH15 : PTXReg<"rh15">;
-def RH16 : PTXReg<"rh16">;
-def RH17 : PTXReg<"rh17">;
-def RH18 : PTXReg<"rh18">;
-def RH19 : PTXReg<"rh19">;
-def RH20 : PTXReg<"rh20">;
-def RH21 : PTXReg<"rh21">;
-def RH22 : PTXReg<"rh22">;
-def RH23 : PTXReg<"rh23">;
-def RH24 : PTXReg<"rh24">;
-def RH25 : PTXReg<"rh25">;
-def RH26 : PTXReg<"rh26">;
-def RH27 : PTXReg<"rh27">;
-def RH28 : PTXReg<"rh28">;
-def RH29 : PTXReg<"rh29">;
-def RH30 : PTXReg<"rh30">;
-def RH31 : PTXReg<"rh31">;
-def RH32 : PTXReg<"rh32">;
-def RH33 : PTXReg<"rh33">;
-def RH34 : PTXReg<"rh34">;
-def RH35 : PTXReg<"rh35">;
-def RH36 : PTXReg<"rh36">;
-def RH37 : PTXReg<"rh37">;
-def RH38 : PTXReg<"rh38">;
-def RH39 : PTXReg<"rh39">;
-def RH40 : PTXReg<"rh40">;
-def RH41 : PTXReg<"rh41">;
-def RH42 : PTXReg<"rh42">;
-def RH43 : PTXReg<"rh43">;
-def RH44 : PTXReg<"rh44">;
-def RH45 : PTXReg<"rh45">;
-def RH46 : PTXReg<"rh46">;
-def RH47 : PTXReg<"rh47">;
-def RH48 : PTXReg<"rh48">;
-def RH49 : PTXReg<"rh49">;
-def RH50 : PTXReg<"rh50">;
-def RH51 : PTXReg<"rh51">;
-def RH52 : PTXReg<"rh52">;
-def RH53 : PTXReg<"rh53">;
-def RH54 : PTXReg<"rh54">;
-def RH55 : PTXReg<"rh55">;
-def RH56 : PTXReg<"rh56">;
-def RH57 : PTXReg<"rh57">;
-def RH58 : PTXReg<"rh58">;
-def RH59 : PTXReg<"rh59">;
-def RH60 : PTXReg<"rh60">;
-def RH61 : PTXReg<"rh61">;
-def RH62 : PTXReg<"rh62">;
-def RH63 : PTXReg<"rh63">;
-def RH64 : PTXReg<"rh64">;
-def RH65 : PTXReg<"rh65">;
-def RH66 : PTXReg<"rh66">;
-def RH67 : PTXReg<"rh67">;
-def RH68 : PTXReg<"rh68">;
-def RH69 : PTXReg<"rh69">;
-def RH70 : PTXReg<"rh70">;
-def RH71 : PTXReg<"rh71">;
-def RH72 : PTXReg<"rh72">;
-def RH73 : PTXReg<"rh73">;
-def RH74 : PTXReg<"rh74">;
-def RH75 : PTXReg<"rh75">;
-def RH76 : PTXReg<"rh76">;
-def RH77 : PTXReg<"rh77">;
-def RH78 : PTXReg<"rh78">;
-def RH79 : PTXReg<"rh79">;
-def RH80 : PTXReg<"rh80">;
-def RH81 : PTXReg<"rh81">;
-def RH82 : PTXReg<"rh82">;
-def RH83 : PTXReg<"rh83">;
-def RH84 : PTXReg<"rh84">;
-def RH85 : PTXReg<"rh85">;
-def RH86 : PTXReg<"rh86">;
-def RH87 : PTXReg<"rh87">;
-def RH88 : PTXReg<"rh88">;
-def RH89 : PTXReg<"rh89">;
-def RH90 : PTXReg<"rh90">;
-def RH91 : PTXReg<"rh91">;
-def RH92 : PTXReg<"rh92">;
-def RH93 : PTXReg<"rh93">;
-def RH94 : PTXReg<"rh94">;
-def RH95 : PTXReg<"rh95">;
-def RH96 : PTXReg<"rh96">;
-def RH97 : PTXReg<"rh97">;
-def RH98 : PTXReg<"rh98">;
-def RH99 : PTXReg<"rh99">;
-def RH100 : PTXReg<"rh100">;
-def RH101 : PTXReg<"rh101">;
-def RH102 : PTXReg<"rh102">;
-def RH103 : PTXReg<"rh103">;
-def RH104 : PTXReg<"rh104">;
-def RH105 : PTXReg<"rh105">;
-def RH106 : PTXReg<"rh106">;
-def RH107 : PTXReg<"rh107">;
-def RH108 : PTXReg<"rh108">;
-def RH109 : PTXReg<"rh109">;
-def RH110 : PTXReg<"rh110">;
-def RH111 : PTXReg<"rh111">;
-def RH112 : PTXReg<"rh112">;
-def RH113 : PTXReg<"rh113">;
-def RH114 : PTXReg<"rh114">;
-def RH115 : PTXReg<"rh115">;
-def RH116 : PTXReg<"rh116">;
-def RH117 : PTXReg<"rh117">;
-def RH118 : PTXReg<"rh118">;
-def RH119 : PTXReg<"rh119">;
-def RH120 : PTXReg<"rh120">;
-def RH121 : PTXReg<"rh121">;
-def RH122 : PTXReg<"rh122">;
-def RH123 : PTXReg<"rh123">;
-def RH124 : PTXReg<"rh124">;
-def RH125 : PTXReg<"rh125">;
-def RH126 : PTXReg<"rh126">;
-def RH127 : PTXReg<"rh127">;
-
-///===- 32-Bit Registers --------------------------------------------------===//
-
-def R0 : PTXReg<"r0">;
-def R1 : PTXReg<"r1">;
-def R2 : PTXReg<"r2">;
-def R3 : PTXReg<"r3">;
-def R4 : PTXReg<"r4">;
-def R5 : PTXReg<"r5">;
-def R6 : PTXReg<"r6">;
-def R7 : PTXReg<"r7">;
-def R8 : PTXReg<"r8">;
-def R9 : PTXReg<"r9">;
-def R10 : PTXReg<"r10">;
-def R11 : PTXReg<"r11">;
-def R12 : PTXReg<"r12">;
-def R13 : PTXReg<"r13">;
-def R14 : PTXReg<"r14">;
-def R15 : PTXReg<"r15">;
-def R16 : PTXReg<"r16">;
-def R17 : PTXReg<"r17">;
-def R18 : PTXReg<"r18">;
-def R19 : PTXReg<"r19">;
-def R20 : PTXReg<"r20">;
-def R21 : PTXReg<"r21">;
-def R22 : PTXReg<"r22">;
-def R23 : PTXReg<"r23">;
-def R24 : PTXReg<"r24">;
-def R25 : PTXReg<"r25">;
-def R26 : PTXReg<"r26">;
-def R27 : PTXReg<"r27">;
-def R28 : PTXReg<"r28">;
-def R29 : PTXReg<"r29">;
-def R30 : PTXReg<"r30">;
-def R31 : PTXReg<"r31">;
-def R32 : PTXReg<"r32">;
-def R33 : PTXReg<"r33">;
-def R34 : PTXReg<"r34">;
-def R35 : PTXReg<"r35">;
-def R36 : PTXReg<"r36">;
-def R37 : PTXReg<"r37">;
-def R38 : PTXReg<"r38">;
-def R39 : PTXReg<"r39">;
-def R40 : PTXReg<"r40">;
-def R41 : PTXReg<"r41">;
-def R42 : PTXReg<"r42">;
-def R43 : PTXReg<"r43">;
-def R44 : PTXReg<"r44">;
-def R45 : PTXReg<"r45">;
-def R46 : PTXReg<"r46">;
-def R47 : PTXReg<"r47">;
-def R48 : PTXReg<"r48">;
-def R49 : PTXReg<"r49">;
-def R50 : PTXReg<"r50">;
-def R51 : PTXReg<"r51">;
-def R52 : PTXReg<"r52">;
-def R53 : PTXReg<"r53">;
-def R54 : PTXReg<"r54">;
-def R55 : PTXReg<"r55">;
-def R56 : PTXReg<"r56">;
-def R57 : PTXReg<"r57">;
-def R58 : PTXReg<"r58">;
-def R59 : PTXReg<"r59">;
-def R60 : PTXReg<"r60">;
-def R61 : PTXReg<"r61">;
-def R62 : PTXReg<"r62">;
-def R63 : PTXReg<"r63">;
-def R64 : PTXReg<"r64">;
-def R65 : PTXReg<"r65">;
-def R66 : PTXReg<"r66">;
-def R67 : PTXReg<"r67">;
-def R68 : PTXReg<"r68">;
-def R69 : PTXReg<"r69">;
-def R70 : PTXReg<"r70">;
-def R71 : PTXReg<"r71">;
-def R72 : PTXReg<"r72">;
-def R73 : PTXReg<"r73">;
-def R74 : PTXReg<"r74">;
-def R75 : PTXReg<"r75">;
-def R76 : PTXReg<"r76">;
-def R77 : PTXReg<"r77">;
-def R78 : PTXReg<"r78">;
-def R79 : PTXReg<"r79">;
-def R80 : PTXReg<"r80">;
-def R81 : PTXReg<"r81">;
-def R82 : PTXReg<"r82">;
-def R83 : PTXReg<"r83">;
-def R84 : PTXReg<"r84">;
-def R85 : PTXReg<"r85">;
-def R86 : PTXReg<"r86">;
-def R87 : PTXReg<"r87">;
-def R88 : PTXReg<"r88">;
-def R89 : PTXReg<"r89">;
-def R90 : PTXReg<"r90">;
-def R91 : PTXReg<"r91">;
-def R92 : PTXReg<"r92">;
-def R93 : PTXReg<"r93">;
-def R94 : PTXReg<"r94">;
-def R95 : PTXReg<"r95">;
-def R96 : PTXReg<"r96">;
-def R97 : PTXReg<"r97">;
-def R98 : PTXReg<"r98">;
-def R99 : PTXReg<"r99">;
-def R100 : PTXReg<"r100">;
-def R101 : PTXReg<"r101">;
-def R102 : PTXReg<"r102">;
-def R103 : PTXReg<"r103">;
-def R104 : PTXReg<"r104">;
-def R105 : PTXReg<"r105">;
-def R106 : PTXReg<"r106">;
-def R107 : PTXReg<"r107">;
-def R108 : PTXReg<"r108">;
-def R109 : PTXReg<"r109">;
-def R110 : PTXReg<"r110">;
-def R111 : PTXReg<"r111">;
-def R112 : PTXReg<"r112">;
-def R113 : PTXReg<"r113">;
-def R114 : PTXReg<"r114">;
-def R115 : PTXReg<"r115">;
-def R116 : PTXReg<"r116">;
-def R117 : PTXReg<"r117">;
-def R118 : PTXReg<"r118">;
-def R119 : PTXReg<"r119">;
-def R120 : PTXReg<"r120">;
-def R121 : PTXReg<"r121">;
-def R122 : PTXReg<"r122">;
-def R123 : PTXReg<"r123">;
-def R124 : PTXReg<"r124">;
-def R125 : PTXReg<"r125">;
-def R126 : PTXReg<"r126">;
-def R127 : PTXReg<"r127">;
-
-///===- 64-Bit Registers --------------------------------------------------===//
-
-def RD0 : PTXReg<"rd0">;
-def RD1 : PTXReg<"rd1">;
-def RD2 : PTXReg<"rd2">;
-def RD3 : PTXReg<"rd3">;
-def RD4 : PTXReg<"rd4">;
-def RD5 : PTXReg<"rd5">;
-def RD6 : PTXReg<"rd6">;
-def RD7 : PTXReg<"rd7">;
-def RD8 : PTXReg<"rd8">;
-def RD9 : PTXReg<"rd9">;
-def RD10 : PTXReg<"rd10">;
-def RD11 : PTXReg<"rd11">;
-def RD12 : PTXReg<"rd12">;
-def RD13 : PTXReg<"rd13">;
-def RD14 : PTXReg<"rd14">;
-def RD15 : PTXReg<"rd15">;
-def RD16 : PTXReg<"rd16">;
-def RD17 : PTXReg<"rd17">;
-def RD18 : PTXReg<"rd18">;
-def RD19 : PTXReg<"rd19">;
-def RD20 : PTXReg<"rd20">;
-def RD21 : PTXReg<"rd21">;
-def RD22 : PTXReg<"rd22">;
-def RD23 : PTXReg<"rd23">;
-def RD24 : PTXReg<"rd24">;
-def RD25 : PTXReg<"rd25">;
-def RD26 : PTXReg<"rd26">;
-def RD27 : PTXReg<"rd27">;
-def RD28 : PTXReg<"rd28">;
-def RD29 : PTXReg<"rd29">;
-def RD30 : PTXReg<"rd30">;
-def RD31 : PTXReg<"rd31">;
-def RD32 : PTXReg<"rd32">;
-def RD33 : PTXReg<"rd33">;
-def RD34 : PTXReg<"rd34">;
-def RD35 : PTXReg<"rd35">;
-def RD36 : PTXReg<"rd36">;
-def RD37 : PTXReg<"rd37">;
-def RD38 : PTXReg<"rd38">;
-def RD39 : PTXReg<"rd39">;
-def RD40 : PTXReg<"rd40">;
-def RD41 : PTXReg<"rd41">;
-def RD42 : PTXReg<"rd42">;
-def RD43 : PTXReg<"rd43">;
-def RD44 : PTXReg<"rd44">;
-def RD45 : PTXReg<"rd45">;
-def RD46 : PTXReg<"rd46">;
-def RD47 : PTXReg<"rd47">;
-def RD48 : PTXReg<"rd48">;
-def RD49 : PTXReg<"rd49">;
-def RD50 : PTXReg<"rd50">;
-def RD51 : PTXReg<"rd51">;
-def RD52 : PTXReg<"rd52">;
-def RD53 : PTXReg<"rd53">;
-def RD54 : PTXReg<"rd54">;
-def RD55 : PTXReg<"rd55">;
-def RD56 : PTXReg<"rd56">;
-def RD57 : PTXReg<"rd57">;
-def RD58 : PTXReg<"rd58">;
-def RD59 : PTXReg<"rd59">;
-def RD60 : PTXReg<"rd60">;
-def RD61 : PTXReg<"rd61">;
-def RD62 : PTXReg<"rd62">;
-def RD63 : PTXReg<"rd63">;
-def RD64 : PTXReg<"rd64">;
-def RD65 : PTXReg<"rd65">;
-def RD66 : PTXReg<"rd66">;
-def RD67 : PTXReg<"rd67">;
-def RD68 : PTXReg<"rd68">;
-def RD69 : PTXReg<"rd69">;
-def RD70 : PTXReg<"rd70">;
-def RD71 : PTXReg<"rd71">;
-def RD72 : PTXReg<"rd72">;
-def RD73 : PTXReg<"rd73">;
-def RD74 : PTXReg<"rd74">;
-def RD75 : PTXReg<"rd75">;
-def RD76 : PTXReg<"rd76">;
-def RD77 : PTXReg<"rd77">;
-def RD78 : PTXReg<"rd78">;
-def RD79 : PTXReg<"rd79">;
-def RD80 : PTXReg<"rd80">;
-def RD81 : PTXReg<"rd81">;
-def RD82 : PTXReg<"rd82">;
-def RD83 : PTXReg<"rd83">;
-def RD84 : PTXReg<"rd84">;
-def RD85 : PTXReg<"rd85">;
-def RD86 : PTXReg<"rd86">;
-def RD87 : PTXReg<"rd87">;
-def RD88 : PTXReg<"rd88">;
-def RD89 : PTXReg<"rd89">;
-def RD90 : PTXReg<"rd90">;
-def RD91 : PTXReg<"rd91">;
-def RD92 : PTXReg<"rd92">;
-def RD93 : PTXReg<"rd93">;
-def RD94 : PTXReg<"rd94">;
-def RD95 : PTXReg<"rd95">;
-def RD96 : PTXReg<"rd96">;
-def RD97 : PTXReg<"rd97">;
-def RD98 : PTXReg<"rd98">;
-def RD99 : PTXReg<"rd99">;
-def RD100 : PTXReg<"rd100">;
-def RD101 : PTXReg<"rd101">;
-def RD102 : PTXReg<"rd102">;
-def RD103 : PTXReg<"rd103">;
-def RD104 : PTXReg<"rd104">;
-def RD105 : PTXReg<"rd105">;
-def RD106 : PTXReg<"rd106">;
-def RD107 : PTXReg<"rd107">;
-def RD108 : PTXReg<"rd108">;
-def RD109 : PTXReg<"rd109">;
-def RD110 : PTXReg<"rd110">;
-def RD111 : PTXReg<"rd111">;
-def RD112 : PTXReg<"rd112">;
-def RD113 : PTXReg<"rd113">;
-def RD114 : PTXReg<"rd114">;
-def RD115 : PTXReg<"rd115">;
-def RD116 : PTXReg<"rd116">;
-def RD117 : PTXReg<"rd117">;
-def RD118 : PTXReg<"rd118">;
-def RD119 : PTXReg<"rd119">;
-def RD120 : PTXReg<"rd120">;
-def RD121 : PTXReg<"rd121">;
-def RD122 : PTXReg<"rd122">;
-def RD123 : PTXReg<"rd123">;
-def RD124 : PTXReg<"rd124">;
-def RD125 : PTXReg<"rd125">;
-def RD126 : PTXReg<"rd126">;
-def RD127 : PTXReg<"rd127">;
+// The generated register info code emits warnings for empty register classes
+// (e.g. zero-length arrays), so we define a dummy register here just to
+// prevent those warnings.
+def DUMMY_REG : PTXReg<"R0">;
//===----------------------------------------------------------------------===//
// Register classes
//===----------------------------------------------------------------------===//
-def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%u", 0, 127)>;
-def RegI16 : RegisterClass<"PTX", [i16], 16, (sequence "RH%u", 0, 127)>;
-def RegI32 : RegisterClass<"PTX", [i32], 32, (sequence "R%u", 0, 127)>;
-def RegI64 : RegisterClass<"PTX", [i64], 64, (sequence "RD%u", 0, 127)>;
-def RegF32 : RegisterClass<"PTX", [f32], 32, (sequence "R%u", 0, 127)>;
-def RegF64 : RegisterClass<"PTX", [f64], 64, (sequence "RD%u", 0, 127)>;
+def RegPred : RegisterClass<"PTX", [i1], 8, (add DUMMY_REG)>;
+def RegI16 : RegisterClass<"PTX", [i16], 16, (add DUMMY_REG)>;
+def RegI32 : RegisterClass<"PTX", [i32], 32, (add DUMMY_REG)>;
+def RegI64 : RegisterClass<"PTX", [i64], 64, (add DUMMY_REG)>;
+def RegF32 : RegisterClass<"PTX", [f32], 32, (add DUMMY_REG)>;
+def RegF64 : RegisterClass<"PTX", [f64], 64, (add DUMMY_REG)>;
+
diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.cpp b/lib/Target/PTX/PTXSelectionDAGInfo.cpp
new file mode 100644
index 0000000..50ef14a
--- /dev/null
+++ b/lib/Target/PTX/PTXSelectionDAGInfo.cpp
@@ -0,0 +1,149 @@
+//===-- PTXSelectionDAGInfo.cpp - PTX SelectionDAG Info -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PTXSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ptx-selectiondag-info"
+#include "PTXTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+using namespace llvm;
+
+PTXSelectionDAGInfo::PTXSelectionDAGInfo(const TargetMachine &TM)
+ : TargetSelectionDAGInfo(TM),
+ Subtarget(&TM.getSubtarget<PTXSubtarget>()) {
+}
+
+PTXSelectionDAGInfo::~PTXSelectionDAGInfo() {
+}
+
+SDValue
+PTXSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const {
+ // Do repeated 4-byte loads and stores. To be improved.
+ // This requires 4-byte alignment.
+ if ((Align & 3) != 0)
+ return SDValue();
+ // This requires the copy size to be a constant, preferably
+ // within a subtarget-specific limit.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (!ConstantSize)
+ return SDValue();
+ uint64_t SizeVal = ConstantSize->getZExtValue();
+ // Always inline memcpys. In PTX, we do not have a C library that provides
+ // a memcpy function.
+ //if (!AlwaysInline)
+ // return SDValue();
+
+ unsigned BytesLeft = SizeVal & 3;
+ unsigned NumMemOps = SizeVal >> 2;
+ unsigned EmittedNumMemOps = 0;
+ EVT VT = MVT::i32;
+ unsigned VTSize = 4;
+ unsigned i = 0;
+ const unsigned MAX_LOADS_IN_LDM = 6;
+ SDValue TFOps[MAX_LOADS_IN_LDM];
+ SDValue Loads[MAX_LOADS_IN_LDM];
+ uint64_t SrcOff = 0, DstOff = 0;
+ EVT PointerType = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores. (This structure is inherited from the ARM
+  // implementation, where the loads and stores later combine into ldm/stm.)
+ while (EmittedNumMemOps < NumMemOps) {
+ for (i = 0;
+ i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+ Loads[i] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, PointerType, Src,
+ DAG.getConstant(SrcOff, PointerType)),
+ SrcPtrInfo.getWithOffset(SrcOff), isVolatile,
+ false, 0);
+ TFOps[i] = Loads[i].getValue(1);
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+ for (i = 0;
+ i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+ TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+ DAG.getNode(ISD::ADD, dl, PointerType, Dst,
+ DAG.getConstant(DstOff, PointerType)),
+ DstPtrInfo.getWithOffset(DstOff),
+ isVolatile, false, 0);
+ DstOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+ EmittedNumMemOps += i;
+ }
+
+ if (BytesLeft == 0)
+ return Chain;
+
+ // Issue loads / stores for the trailing (1 - 3) bytes.
+ unsigned BytesLeftSave = BytesLeft;
+ i = 0;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ Loads[i] = DAG.getLoad(VT, dl, Chain,
+ DAG.getNode(ISD::ADD, dl, PointerType, Src,
+ DAG.getConstant(SrcOff, PointerType)),
+ SrcPtrInfo.getWithOffset(SrcOff), false, false, 0);
+ TFOps[i] = Loads[i].getValue(1);
+ ++i;
+ SrcOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+ i = 0;
+ BytesLeft = BytesLeftSave;
+ while (BytesLeft) {
+ if (BytesLeft >= 2) {
+ VT = MVT::i16;
+ VTSize = 2;
+ } else {
+ VT = MVT::i8;
+ VTSize = 1;
+ }
+
+ TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+ DAG.getNode(ISD::ADD, dl, PointerType, Dst,
+ DAG.getConstant(DstOff, PointerType)),
+ DstPtrInfo.getWithOffset(DstOff), false, false, 0);
+ ++i;
+ DstOff += VTSize;
+ BytesLeft -= VTSize;
+ }
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+}
+
+SDValue PTXSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain, SDValue Dst,
+ SDValue Src, SDValue Size,
+ unsigned Align, bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const {
+ llvm_unreachable("memset lowering not implemented for PTX yet");
+}
+
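To make the splitting arithmetic in EmitTargetCodeForMemcpy concrete, here is a standalone sketch (plain C++, no LLVM dependencies, sample size chosen for illustration) of how an 11-byte, 4-byte-aligned copy is decomposed: two full i32 load/store pairs, then an i16 and an i8 for the trailing 3 bytes.

    #include <cstdio>

    int main() {
      unsigned SizeVal   = 11;           // hypothetical constant copy size
      unsigned NumMemOps = SizeVal >> 2; // 2 full 4-byte load/store pairs
      unsigned BytesLeft = SizeVal & 3;  // 3 trailing bytes

      std::printf("4-byte ops: %u\n", NumMemOps);
      while (BytesLeft) {
        unsigned VTSize = (BytesLeft >= 2) ? 2 : 1; // i16 first, then i8
        std::printf("trailing op: %u byte(s)\n", VTSize);
        BytesLeft -= VTSize;
      }
      return 0;
    }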
diff --git a/lib/Target/PTX/PTXSelectionDAGInfo.h b/lib/Target/PTX/PTXSelectionDAGInfo.h
new file mode 100644
index 0000000..e0c7167
--- /dev/null
+++ b/lib/Target/PTX/PTXSelectionDAGInfo.h
@@ -0,0 +1,53 @@
+//===-- PTXSelectionDAGInfo.h - PTX SelectionDAG Info -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PTX subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PTXSELECTIONDAGINFO_H
+#define PTXSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+/// PTXSelectionDAGInfo - TargetSelectionDAGInfo sub-class for the PTX target.
+/// At the moment, this is mostly just a copy of ARMSelectionDAGInfo.
+class PTXSelectionDAGInfo : public TargetSelectionDAGInfo {
+ /// Subtarget - Keep a pointer to the PTXSubtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const PTXSubtarget *Subtarget;
+
+public:
+ explicit PTXSelectionDAGInfo(const TargetMachine &TM);
+ ~PTXSelectionDAGInfo();
+
+ virtual
+ SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Dst, SDValue Src,
+ SDValue Size, unsigned Align,
+ bool isVolatile, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const;
+
+ virtual
+ SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+ SDValue Chain,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3, unsigned Align,
+ bool isVolatile,
+ MachinePointerInfo DstPtrInfo) const;
+};
+
+}
+
+#endif
+
diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp
index 8ec646e..1eb57d2 100644
--- a/lib/Target/PTX/PTXSubtarget.cpp
+++ b/lib/Target/PTX/PTXSubtarget.cpp
@@ -14,7 +14,7 @@
#include "PTXSubtarget.h"
#include "PTX.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h
index 0921f1f..b946d7c 100644
--- a/lib/Target/PTX/PTXSubtarget.h
+++ b/lib/Target/PTX/PTXSubtarget.h
@@ -114,7 +114,16 @@ class StringRef;
(PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE);
}
- void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+ bool callsAreHandled() const {
+ return (PTXTarget >= PTX_SM_2_0 && PTXTarget < PTX_LAST_SM) ||
+ (PTXTarget >= PTX_COMPUTE_2_0 && PTXTarget < PTX_LAST_COMPUTE);
+ }
+
+ bool emitPtrAttribute() const {
+ return PTXVersion >= PTX_VERSION_2_2;
+ }
+
+ void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
}; // class PTXSubtarget
} // namespace llvm
diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp
index ab926e0..449a3d9 100644
--- a/lib/Target/PTX/PTXTargetMachine.cpp
+++ b/lib/Target/PTX/PTXTargetMachine.cpp
@@ -14,8 +14,32 @@
#include "PTX.h"
#include "PTXTargetMachine.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Support/Debug.h"
+
using namespace llvm;
@@ -25,7 +49,7 @@ namespace llvm {
bool useCFI,
MCInstPrinter *InstPrint,
MCCodeEmitter *CE,
- TargetAsmBackend *TAB,
+ MCAsmBackend *MAB,
bool ShowInst);
}
@@ -43,34 +67,47 @@ namespace {
"e-p:32:32-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
const char* DataLayout64 =
"e-p:64:64-i64:32:32-f64:32:32-v128:32:128-v64:32:64-n32:64";
+
+ // Copied from LLVMTargetMachine.cpp
+ void printNoVerify(PassManagerBase &PM, const char *Banner) {
+ if (PrintMachineCode)
+ PM.add(createMachineFunctionPrinterPass(dbgs(), Banner));
+ }
+
+ void printAndVerify(PassManagerBase &PM,
+ const char *Banner) {
+ if (PrintMachineCode)
+ PM.add(createMachineFunctionPrinterPass(dbgs(), Banner));
+
+ //if (VerifyMachineCode)
+ // PM.add(createMachineVerifierPass(Banner));
+ }
}
// DataLayout and FrameLowering are filled with dummy data
PTXTargetMachine::PTXTargetMachine(const Target &T,
- const std::string &TT,
- const std::string &CPU,
- const std::string &FS,
+ StringRef TT, StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
bool is64Bit)
- : LLVMTargetMachine(T, TT, CPU, FS),
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
DataLayout(is64Bit ? DataLayout64 : DataLayout32),
Subtarget(TT, CPU, FS, is64Bit),
FrameLowering(Subtarget),
InstrInfo(*this),
+ TSInfo(*this),
TLInfo(*this) {
}
-PTX32TargetMachine::PTX32TargetMachine(const Target &T,
- const std::string& TT,
- const std::string& CPU,
- const std::string& FS)
- : PTXTargetMachine(T, TT, CPU, FS, false) {
+PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : PTXTargetMachine(T, TT, CPU, FS, RM, CM, false) {
}
-PTX64TargetMachine::PTX64TargetMachine(const Target &T,
- const std::string& TT,
- const std::string& CPU,
- const std::string& FS)
- : PTXTargetMachine(T, TT, CPU, FS, true) {
+PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : PTXTargetMachine(T, TT, CPU, FS, RM, CM, true) {
}
bool PTXTargetMachine::addInstSelector(PassManagerBase &PM,
@@ -82,6 +119,255 @@ bool PTXTargetMachine::addInstSelector(PassManagerBase &PM,
bool PTXTargetMachine::addPostRegAlloc(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
  // PTXMFInfoExtract must run after register allocation!
+ //PM.add(createPTXMFInfoExtract(*this, OptLevel));
+ return false;
+}
+
+bool PTXTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+ formatted_raw_ostream &Out,
+ CodeGenFileType FileType,
+ CodeGenOpt::Level OptLevel,
+ bool DisableVerify) {
+ // This is mostly based on LLVMTargetMachine::addPassesToEmitFile
+
+ // Add common CodeGen passes.
+ MCContext *Context = 0;
+ if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Context))
+ return true;
+ assert(Context != 0 && "Failed to get MCContext");
+
+ if (hasMCSaveTempLabels())
+ Context->setAllowTemporaryLabels(false);
+
+ const MCAsmInfo &MAI = *getMCAsmInfo();
+ const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
+ OwningPtr<MCStreamer> AsmStreamer;
+
+ switch (FileType) {
+ default: return true;
+ case CGFT_AssemblyFile: {
+ MCInstPrinter *InstPrinter =
+ getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI, STI);
+
+ // Create a code emitter if asked to show the encoding.
+ MCCodeEmitter *MCE = 0;
+ MCAsmBackend *MAB = 0;
+
+ MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
+ true, /* verbose asm */
+ hasMCUseLoc(),
+ hasMCUseCFI(),
+ InstPrinter,
+ MCE, MAB,
+ false /* show MC encoding */);
+ AsmStreamer.reset(S);
+ break;
+ }
+ case CGFT_ObjectFile: {
+ llvm_unreachable("Object file emission is not supported with PTX");
+ }
+ case CGFT_Null:
+    // The Null output is intended for performance analysis and testing, not
+    // for real users.
+ AsmStreamer.reset(createNullStreamer(*Context));
+ break;
+ }
+
+ // MC Logging
+ //AsmStreamer.reset(createLoggingStreamer(AsmStreamer.take(), errs()));
+
+ // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+ FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+ if (Printer == 0)
+ return true;
+
+ // If successful, createAsmPrinter took ownership of AsmStreamer.
+ AsmStreamer.take();
+
+ PM.add(Printer);
+
+ PM.add(createGCInfoDeleter());
+ return false;
+}
+
+bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM,
+ CodeGenOpt::Level OptLevel,
+ bool DisableVerify,
+ MCContext *&OutContext) {
+ // Add standard LLVM codegen passes.
+ // This is derived from LLVMTargetMachine::addCommonCodeGenPasses, with some
+ // modifications for the PTX target.
+
+ // Standard LLVM-Level Passes.
+
+ // Basic AliasAnalysis support.
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ PM.add(createTypeBasedAliasAnalysisPass());
+ PM.add(createBasicAliasAnalysisPass());
+
+ // Before running any passes, run the verifier to determine if the input
+ // coming from the front-end and/or optimizer is valid.
+ if (!DisableVerify)
+ PM.add(createVerifierPass());
+
+ // Run loop strength reduction before anything else.
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createLoopStrengthReducePass(getTargetLowering()));
+ //PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs()));
+ }
+
+ PM.add(createGCLoweringPass());
+
+ // Make sure that no unreachable blocks are instruction selected.
+ PM.add(createUnreachableBlockEliminationPass());
+
+ PM.add(createLowerInvokePass(getTargetLowering()));
+ // The lower invoke pass may create unreachable code. Remove it.
+ PM.add(createUnreachableBlockEliminationPass());
+
+ if (OptLevel != CodeGenOpt::None)
+ PM.add(createCodeGenPreparePass(getTargetLowering()));
+
+ PM.add(createStackProtectorPass(getTargetLowering()));
+
+ addPreISel(PM, OptLevel);
+
+ //PM.add(createPrintFunctionPass("\n\n"
+ // "*** Final LLVM Code input to ISel ***\n",
+ // &dbgs()));
+
+ // All passes which modify the LLVM IR are now complete; run the verifier
+ // to ensure that the IR is valid.
+ if (!DisableVerify)
+ PM.add(createVerifierPass());
+
+ // Standard Lower-Level Passes.
+
+ // Install a MachineModuleInfo class, which is an immutable pass that holds
+ // all the per-module stuff we're generating, including MCContext.
+ MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo(),
+ *getRegisterInfo(),
+ &getTargetLowering()->getObjFileLowering());
+ PM.add(MMI);
+ OutContext = &MMI->getContext(); // Return the MCContext specifically by-ref.
+
+ // Set up a MachineFunction for the rest of CodeGen to work on.
+ PM.add(new MachineFunctionAnalysis(*this, OptLevel));
+
+ // Ask the target for an isel.
+ if (addInstSelector(PM, OptLevel))
+ return true;
+
+ // Print the instruction selected machine code...
+ printAndVerify(PM, "After Instruction Selection");
+
+ // Expand pseudo-instructions emitted by ISel.
+ PM.add(createExpandISelPseudosPass());
+
+ // Pre-ra tail duplication.
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createTailDuplicatePass(true));
+ printAndVerify(PM, "After Pre-RegAlloc TailDuplicate");
+ }
+
+ // Optimize PHIs before DCE: removing dead PHI cycles may make more
+ // instructions dead.
+ if (OptLevel != CodeGenOpt::None)
+ PM.add(createOptimizePHIsPass());
+
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ PM.add(createLocalStackSlotAllocationPass());
+
+ if (OptLevel != CodeGenOpt::None) {
+ // With optimization, dead code should already be eliminated. However
+ // there is one known exception: lowered code for arguments that are only
+ // used by tail calls, where the tail calls reuse the incoming stack
+ // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+ PM.add(createDeadMachineInstructionElimPass());
+ printAndVerify(PM, "After codegen DCE pass");
+
+ PM.add(createMachineLICMPass());
+ PM.add(createMachineCSEPass());
+ PM.add(createMachineSinkingPass());
+ printAndVerify(PM, "After Machine LICM, CSE and Sinking passes");
+
+ PM.add(createPeepholeOptimizerPass());
+ printAndVerify(PM, "After codegen peephole optimization pass");
+ }
+
+ // Run pre-ra passes.
+ if (addPreRegAlloc(PM, OptLevel))
+ printAndVerify(PM, "After PreRegAlloc passes");
+
+ // Perform register allocation.
+ PM.add(createPTXRegisterAllocator());
+ printAndVerify(PM, "After Register Allocation");
+
+ // Perform stack slot coloring and post-ra machine LICM.
+ if (OptLevel != CodeGenOpt::None) {
+ // FIXME: Re-enable coloring with register when it's capable of adding
+ // kill markers.
+ PM.add(createStackSlotColoringPass(false));
+
+ // FIXME: Post-RA LICM has asserts that fire on virtual registers.
+ // Run post-ra machine LICM to hoist reloads / remats.
+ //if (!DisablePostRAMachineLICM)
+ // PM.add(createMachineLICMPass(false));
+
+ printAndVerify(PM, "After StackSlotColoring and postra Machine LICM");
+ }
+
+ // Run post-ra passes.
+ if (addPostRegAlloc(PM, OptLevel))
+ printAndVerify(PM, "After PostRegAlloc passes");
+
+ PM.add(createExpandPostRAPseudosPass());
+ printAndVerify(PM, "After ExpandPostRAPseudos");
+
+ // Insert prolog/epilog code. Eliminate abstract frame index references...
+ PM.add(createPrologEpilogCodeInserter());
+ printAndVerify(PM, "After PrologEpilogCodeInserter");
+
+ // Run pre-sched2 passes.
+ if (addPreSched2(PM, OptLevel))
+ printAndVerify(PM, "After PreSched2 passes");
+
+ // Second pass scheduler.
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createPostRAScheduler(OptLevel));
+ printAndVerify(PM, "After PostRAScheduler");
+ }
+
+ // Branch folding must be run after regalloc and prolog/epilog insertion.
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createBranchFoldingPass(getEnableTailMergeDefault()));
+ printNoVerify(PM, "After BranchFolding");
+ }
+
+ // Tail duplication.
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createTailDuplicatePass(false));
+ printNoVerify(PM, "After TailDuplicate");
+ }
+
+ PM.add(createGCMachineCodeAnalysisPass());
+
+ //if (PrintGCInfo)
+ // PM.add(createGCInfoPrinter(dbgs()));
+
+ if (OptLevel != CodeGenOpt::None) {
+ PM.add(createCodePlacementOptPass());
+ printNoVerify(PM, "After CodePlacementOpt");
+ }
+
+ if (addPreEmitPass(PM, OptLevel))
+ printNoVerify(PM, "After PreEmit passes");
+
PM.add(createPTXMFInfoExtract(*this, OptLevel));
+ PM.add(createPTXFPRoundingModePass(*this, OptLevel));
+
return false;
}
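Since PTXTargetMachine now takes relocation and code model arguments, a front-end would normally reach it through the generic TargetRegistry factory rather than constructing it directly. A rough driver sketch, assuming the LLVM 3.0-era createTargetMachine() signature; the "ptx32--" triple and "generic" CPU strings are illustrative assumptions, not values taken from this patch:

    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/Target/TargetMachine.h"
    #include <string>

    llvm::TargetMachine *createPTX32TargetMachine() {
      // Assumes the PTX target has been initialized and linked in.
      std::string Error;
      const llvm::Target *T =
          llvm::TargetRegistry::lookupTarget("ptx32--", Error);
      if (!T)
        return 0; // target not available; Error holds the reason
      return T->createTargetMachine("ptx32--", "generic", "",
                                    llvm::Reloc::Default,
                                    llvm::CodeModel::Default);
    }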
diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h
index ae42153..5b7c82b 100644
--- a/lib/Target/PTX/PTXTargetMachine.h
+++ b/lib/Target/PTX/PTXTargetMachine.h
@@ -17,6 +17,7 @@
#include "PTXISelLowering.h"
#include "PTXInstrInfo.h"
#include "PTXFrameLowering.h"
+#include "PTXSelectionDAGInfo.h"
#include "PTXSubtarget.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -25,15 +26,17 @@
namespace llvm {
class PTXTargetMachine : public LLVMTargetMachine {
private:
- const TargetData DataLayout;
- PTXSubtarget Subtarget; // has to be initialized before FrameLowering
- PTXFrameLowering FrameLowering;
- PTXInstrInfo InstrInfo;
- PTXTargetLowering TLInfo;
+ const TargetData DataLayout;
+ PTXSubtarget Subtarget; // has to be initialized before FrameLowering
+ PTXFrameLowering FrameLowering;
+ PTXInstrInfo InstrInfo;
+ PTXSelectionDAGInfo TSInfo;
+ PTXTargetLowering TLInfo;
public:
- PTXTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS,
+ PTXTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
bool is64Bit);
virtual const TargetData *getTargetData() const { return &DataLayout; }
@@ -49,27 +52,62 @@ class PTXTargetMachine : public LLVMTargetMachine {
virtual const PTXTargetLowering *getTargetLowering() const {
return &TLInfo; }
+ virtual const PTXSelectionDAGInfo* getSelectionDAGInfo() const {
+ return &TSInfo;
+ }
+
virtual const PTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
virtual bool addInstSelector(PassManagerBase &PM,
CodeGenOpt::Level OptLevel);
virtual bool addPostRegAlloc(PassManagerBase &PM,
CodeGenOpt::Level OptLevel);
+
+ // We override this method to supply our own set of codegen passes.
+ virtual bool addPassesToEmitFile(PassManagerBase &,
+ formatted_raw_ostream &,
+ CodeGenFileType,
+ CodeGenOpt::Level,
+ bool = true);
+
+ // Emission of machine code through JITCodeEmitter is not supported.
+ virtual bool addPassesToEmitMachineCode(PassManagerBase &,
+ JITCodeEmitter &,
+ CodeGenOpt::Level,
+ bool = true) {
+ return true;
+ }
+
+ // Emission of machine code through MCJIT is not supported.
+ virtual bool addPassesToEmitMC(PassManagerBase &,
+ MCContext *&,
+ raw_ostream &,
+ CodeGenOpt::Level,
+ bool = true) {
+ return true;
+ }
+
+ private:
+
+ bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level,
+ bool DisableVerify, MCContext *&OutCtx);
}; // class PTXTargetMachine
class PTX32TargetMachine : public PTXTargetMachine {
public:
- PTX32TargetMachine(const Target &T, const std::string &TT,
- const std::string& CPU, const std::string& FS);
+ PTX32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
}; // class PTX32TargetMachine
class PTX64TargetMachine : public PTXTargetMachine {
public:
- PTX64TargetMachine(const Target &T, const std::string &TT,
- const std::string& CPU, const std::string& FS);
+ PTX64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
}; // class PTX64TargetMachine
} // namespace llvm
diff --git a/lib/Target/PTX/TargetInfo/CMakeLists.txt b/lib/Target/PTX/TargetInfo/CMakeLists.txt
index 4b09cf5..2366e45 100644
--- a/lib/Target/PTX/TargetInfo/CMakeLists.txt
+++ b/lib/Target/PTX/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMPTXInfo
PTXTargetInfo.cpp
)
-add_dependencies(LLVMPTXInfo PTXCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMPTXInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMPTXInfo PTXCommonTableGen)
diff --git a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
index 9df6c75..09a2735 100644
--- a/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
+++ b/lib/Target/PTX/TargetInfo/PTXTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "PTX.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/PTX/generate-register-td.py b/lib/Target/PTX/generate-register-td.py
deleted file mode 100755
index 1528690..0000000
--- a/lib/Target/PTX/generate-register-td.py
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/usr/bin/env python
-##===- generate-register-td.py --------------------------------*-python-*--===##
-##
-## The LLVM Compiler Infrastructure
-##
-## This file is distributed under the University of Illinois Open Source
-## License. See LICENSE.TXT for details.
-##
-##===----------------------------------------------------------------------===##
-##
-## This file describes the PTX register file generator.
-##
-##===----------------------------------------------------------------------===##
-
-from sys import argv, exit, stdout
-
-
-if len(argv) != 5:
- print('Usage: generate-register-td.py <num_preds> <num_16> <num_32> <num_64>')
- exit(1)
-
-try:
- num_pred = int(argv[1])
- num_16bit = int(argv[2])
- num_32bit = int(argv[3])
- num_64bit = int(argv[4])
-except:
- print('ERROR: Invalid integer parameter')
- exit(1)
-
-## Print the register definition file
-td_file = open('PTXRegisterInfo.td', 'w')
-
-td_file.write('''
-//===- PTXRegisterInfo.td - PTX Register defs ----------------*- tblgen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Declarations that describe the PTX register file
-//===----------------------------------------------------------------------===//
-
-class PTXReg<string n> : Register<n> {
- let Namespace = "PTX";
-}
-
-//===----------------------------------------------------------------------===//
-// Registers
-//===----------------------------------------------------------------------===//
-''')
-
-
-# Print predicate registers
-td_file.write('\n///===- Predicate Registers -----------------------------------------------===//\n\n')
-for r in range(0, num_pred):
- td_file.write('def P%d : PTXReg<"p%d">;\n' % (r, r))
-
-# Print 16-bit registers
-td_file.write('\n///===- 16-Bit Registers --------------------------------------------------===//\n\n')
-for r in range(0, num_16bit):
- td_file.write('def RH%d : PTXReg<"rh%d">;\n' % (r, r))
-
-# Print 32-bit registers
-td_file.write('\n///===- 32-Bit Registers --------------------------------------------------===//\n\n')
-for r in range(0, num_32bit):
- td_file.write('def R%d : PTXReg<"r%d">;\n' % (r, r))
-
-# Print 64-bit registers
-td_file.write('\n///===- 64-Bit Registers --------------------------------------------------===//\n\n')
-for r in range(0, num_64bit):
- td_file.write('def RD%d : PTXReg<"rd%d">;\n' % (r, r))
-
-
-td_file.write('''
-//===----------------------------------------------------------------------===//
-// Register classes
-//===----------------------------------------------------------------------===//
-''')
-
-
-# Print register classes
-
-td_file.write('def RegPred : RegisterClass<"PTX", [i1], 8, (sequence "P%%u", 0, %d)>;\n' % (num_pred-1))
-td_file.write('def RegI16 : RegisterClass<"PTX", [i16], 16, (sequence "RH%%u", 0, %d)>;\n' % (num_16bit-1))
-td_file.write('def RegI32 : RegisterClass<"PTX", [i32], 32, (sequence "R%%u", 0, %d)>;\n' % (num_32bit-1))
-td_file.write('def RegI64 : RegisterClass<"PTX", [i64], 64, (sequence "RD%%u", 0, %d)>;\n' % (num_64bit-1))
-td_file.write('def RegF32 : RegisterClass<"PTX", [f32], 32, (sequence "R%%u", 0, %d)>;\n' % (num_32bit-1))
-td_file.write('def RegF64 : RegisterClass<"PTX", [f64], 64, (sequence "RD%%u", 0, %d)>;\n' % (num_64bit-1))
-
-
-td_file.close()
-
-## Now write the PTXCallingConv.td file
-td_file = open('PTXCallingConv.td', 'w')
-
-# Reserve 10% of the available registers for return values, and the other 90%
-# for parameters
-num_ret_pred = int(0.1 * num_pred)
-num_ret_16bit = int(0.1 * num_16bit)
-num_ret_32bit = int(0.1 * num_32bit)
-num_ret_64bit = int(0.1 * num_64bit)
-num_param_pred = num_pred - num_ret_pred
-num_param_16bit = num_16bit - num_ret_16bit
-num_param_32bit = num_32bit - num_ret_32bit
-num_param_64bit = num_64bit - num_ret_64bit
-
-param_regs_pred = [('P%d' % (i+num_ret_pred)) for i in range(0, num_param_pred)]
-ret_regs_pred = ['P%d' % i for i in range(0, num_ret_pred)]
-param_regs_16bit = [('RH%d' % (i+num_ret_16bit)) for i in range(0, num_param_16bit)]
-ret_regs_16bit = ['RH%d' % i for i in range(0, num_ret_16bit)]
-param_regs_32bit = [('R%d' % (i+num_ret_32bit)) for i in range(0, num_param_32bit)]
-ret_regs_32bit = ['R%d' % i for i in range(0, num_ret_32bit)]
-param_regs_64bit = [('RD%d' % (i+num_ret_64bit)) for i in range(0, num_param_64bit)]
-ret_regs_64bit = ['RD%d' % i for i in range(0, num_ret_64bit)]
-
-param_list_pred = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_pred)
-ret_list_pred = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_pred)
-param_list_16bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_16bit)
-ret_list_16bit = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_16bit)
-param_list_32bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_32bit)
-ret_list_32bit = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_32bit)
-param_list_64bit = reduce(lambda x, y: '%s, %s' % (x, y), param_regs_64bit)
-ret_list_64bit = reduce(lambda x, y: '%s, %s' % (x, y), ret_regs_64bit)
-
-td_file.write('''
-//===--- PTXCallingConv.td - Calling Conventions -----------*- tablegen -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This describes the calling conventions for the PTX architecture.
-//
-//===----------------------------------------------------------------------===//
-
-// PTX Formal Parameter Calling Convention
-def CC_PTX : CallingConv<[
- CCIfType<[i1], CCAssignToReg<[%s]>>,
- CCIfType<[i16], CCAssignToReg<[%s]>>,
- CCIfType<[i32,f32], CCAssignToReg<[%s]>>,
- CCIfType<[i64,f64], CCAssignToReg<[%s]>>
-]>;
-
-// PTX Return Value Calling Convention
-def RetCC_PTX : CallingConv<[
- CCIfType<[i1], CCAssignToReg<[%s]>>,
- CCIfType<[i16], CCAssignToReg<[%s]>>,
- CCIfType<[i32,f32], CCAssignToReg<[%s]>>,
- CCIfType<[i64,f64], CCAssignToReg<[%s]>>
-]>;
-''' % (param_list_pred, param_list_16bit, param_list_32bit, param_list_64bit,
- ret_list_pred, ret_list_16bit, ret_list_32bit, ret_list_64bit))
-
-
-td_file.close()
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index d1dda37..73b4aba 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -1,16 +1,16 @@
set(LLVM_TARGET_DEFINITIONS PPC.td)
-tablegen(PPCGenAsmWriter.inc -gen-asm-writer)
-tablegen(PPCGenCodeEmitter.inc -gen-emitter)
-tablegen(PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
-tablegen(PPCGenRegisterInfo.inc -gen-register-info)
-tablegen(PPCGenInstrInfo.inc -gen-instr-info)
-tablegen(PPCGenDAGISel.inc -gen-dag-isel)
-tablegen(PPCGenCallingConv.inc -gen-callingconv)
-tablegen(PPCGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(PPCGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(PPCGenCodeEmitter.inc -gen-emitter)
+llvm_tablegen(PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+llvm_tablegen(PPCGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(PPCGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(PPCGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(PPCGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(PPCGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(PowerPCCommonTableGen)
add_llvm_target(PowerPCCodeGen
- PPCAsmBackend.cpp
PPCAsmPrinter.cpp
PPCBranchSelector.cpp
PPCCodeEmitter.cpp
@@ -20,15 +20,27 @@ add_llvm_target(PowerPCCodeGen
PPCISelLowering.cpp
PPCFrameLowering.cpp
PPCJITInfo.cpp
- PPCMCCodeEmitter.cpp
PPCMCInstLower.cpp
- PPCPredicates.cpp
PPCRegisterInfo.cpp
PPCSubtarget.cpp
PPCTargetMachine.cpp
PPCSelectionDAGInfo.cpp
)
+add_llvm_library_dependencies(LLVMPowerPCCodeGen
+ LLVMAnalysis
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMPowerPCAsmPrinter
+ LLVMPowerPCDesc
+ LLVMPowerPCInfo
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(InstPrinter)
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
index 389ea77..1d857e2 100644
--- a/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
+++ b/lib/Target/PowerPC/InstPrinter/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
add_llvm_library(LLVMPowerPCAsmPrinter
PPCInstPrinter.cpp
)
-add_dependencies(LLVMPowerPCAsmPrinter PowerPCCodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMPowerPCAsmPrinter
+ LLVMMC
+ LLVMSupport
+ )
+
+add_dependencies(LLVMPowerPCAsmPrinter PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 1a9bd76..b6a0835 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -13,7 +13,8 @@
#define DEBUG_TYPE "asm-printer"
#include "PPCInstPrinter.h"
-#include "PPCPredicates.h"
+#include "MCTargetDesc/PPCBaseInfo.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/raw_ostream.h"
@@ -30,7 +31,8 @@ void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
OS << getRegisterName(RegNo);
}
-void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
+void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+ StringRef Annot) {
// Check for slwi/srwi mnemonics.
if (MI->getOpcode() == PPC::RLWINM) {
unsigned char SH = MI->getOperand(2).getImm();
@@ -49,6 +51,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
O << ", ";
printOperand(MI, 1, O);
O << ", " << (unsigned int)SH;
+
+ printAnnotation(O, Annot);
return;
}
}
@@ -59,6 +63,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
printOperand(MI, 0, O);
O << ", ";
printOperand(MI, 1, O);
+ printAnnotation(O, Annot);
return;
}
@@ -72,11 +77,13 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O) {
O << ", ";
printOperand(MI, 1, O);
O << ", " << (unsigned int)SH;
+ printAnnotation(O, Annot);
return;
}
}
printInstruction(MI, O);
+ printAnnotation(O, Annot);
}
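
Note on the hunks above: printInst changes because the LLVM 3.0 MCInstPrinter interface grew a third parameter. The streamer can attach an annotation (typically a debug comment) to each instruction, and the printer is responsible for emitting it, which is why every early return in the slwi/srwi/mr special cases gains a printAnnotation call. A minimal sketch of the contract (MyInstPrinter and tryPrintAlias are hypothetical stand-ins, not from this patch):

    // Every path that finishes printing an instruction must also emit the
    // annotation, or comments attached by the streamer are silently dropped.
    void MyInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
                                  StringRef Annot) {
      if (tryPrintAlias(MI, O)) {   // hypothetical mnemonic-alias fast path
        printAnnotation(O, Annot);
        return;
      }
      printInstruction(MI, O);      // TableGen-generated printer
      printAnnotation(O, Annot);
    }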
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index d022a44..4ed4b76 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -32,7 +32,7 @@ public:
}
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
- virtual void printInst(const MCInst *MI, raw_ostream &O);
+ virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
virtual StringRef getOpcodeName(unsigned Opcode) const;
static const char *getInstructionName(unsigned Opcode);
diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index a1b8166..c4041db 100644
--- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -1,4 +1,16 @@
add_llvm_library(LLVMPowerPCDesc
+ PPCAsmBackend.cpp
PPCMCTargetDesc.cpp
PPCMCAsmInfo.cpp
+ PPCMCCodeEmitter.cpp
+ PPCPredicates.cpp
)
+
+add_llvm_library_dependencies(LLVMPowerPCDesc
+ LLVMMC
+ LLVMPowerPCAsmPrinter
+ LLVMPowerPCInfo
+ LLVMSupport
+ )
+
+add_dependencies(LLVMPowerPCDesc PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 4b8cbb7..9f2fd6d 100644
--- a/lib/Target/PowerPC/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -7,17 +7,43 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmBackend.h"
-#include "PPC.h"
-#include "PPCFixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Object/MachOFormat.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
+static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ return Value;
+ case PPC::fixup_ppc_brcond14:
+ return Value & 0x3ffc;
+ case PPC::fixup_ppc_br24:
+ return Value & 0x3fffffc;
+#if 0
+ case PPC::fixup_ppc_hi16:
+ return (Value >> 16) & 0xffff;
+#endif
+ case PPC::fixup_ppc_ha16:
+ return ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff;
+ case PPC::fixup_ppc_lo16:
+ return Value & 0xffff;
+ }
+}
+
namespace {
class PPCMachObjectWriter : public MCMachObjectTargetWriter {
public:
@@ -31,10 +57,17 @@ public:
MCValue Target, uint64_t &FixedValue) {}
};
-class PPCAsmBackend : public TargetAsmBackend {
+class PPCELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ PPCELFObjectWriter(bool Is64Bit, Triple::OSType OSType, uint16_t EMachine,
+ bool HasRelocationAddend, bool isLittleEndian)
+ : MCELFObjectTargetWriter(Is64Bit, OSType, EMachine, HasRelocationAddend) {}
+};
+
+class PPCAsmBackend : public MCAsmBackend {
const Target &TheTarget;
public:
- PPCAsmBackend(const Target &T) : TargetAsmBackend(), TheTarget(T) {}
+ PPCAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {}
unsigned getNumFixupKinds() const { return PPC::NumTargetFixupKinds; }
@@ -49,7 +82,7 @@ public:
};
if (Kind < FirstTargetFixupKind)
- return TargetAsmBackend::getFixupKindInfo(Kind);
+ return MCAsmBackend::getFixupKindInfo(Kind);
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
@@ -109,15 +142,50 @@ namespace {
return false;
}
};
+
+ class ELFPPCAsmBackend : public PPCAsmBackend {
+ Triple::OSType OSType;
+ public:
+ ELFPPCAsmBackend(const Target &T, Triple::OSType OSType) :
+ PPCAsmBackend(T), OSType(OSType) { }
+
+ void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+ uint64_t Value) const {
+ Value = adjustFixupValue(Fixup.getKind(), Value);
+ if (!Value) return; // Doesn't change encoding.
+
+ unsigned Offset = Fixup.getOffset();
+
+ // For each byte of the fragment that the fixup touches, mask in the bits from
+ // the fixup value. The Value has been "split up" into the appropriate
+ // bitfields above.
+ for (unsigned i = 0; i != 4; ++i)
+ Data[Offset + i] |= uint8_t((Value >> ((4 - i - 1)*8)) & 0xff);
+ }
+
+ MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+ bool is64 = getPointerSize() == 8;
+ return createELFObjectWriter(new PPCELFObjectWriter(
+ /*Is64Bit=*/is64,
+ OSType,
+ is64 ? ELF::EM_PPC64 : ELF::EM_PPC,
+ /*addend*/ true, /*isLittleEndian*/ false),
+ OS, /*IsLittleEndian=*/false);
+ }
+
+ virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
+ return false;
+ }
+ };
+
} // end anonymous namespace
-TargetAsmBackend *llvm::createPPCAsmBackend(const Target &T,
- const std::string &TT) {
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT) {
if (Triple(TT).isOSDarwin())
return new DarwinPPCAsmBackend(T);
- return 0;
+ return new ELFPPCAsmBackend(T, Triple(TT).getOS());
}
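
The interesting arithmetic above is the fixup_ppc_ha16 case: consumers of the low half (addi and load/store displacements) sign-extend it, so the high half must be rounded up whenever bit 15 of the value is set, or the pair would recombine 0x10000 too low. A standalone worked check (the example value is mine, not from the patch):

    #include <cassert>
    #include <cstdint>

    // ha16/lo16 must satisfy (ha16 << 16) + (int16_t)lo16 == Value, because
    // the instruction consuming lo16 sign-extends it.
    int main() {
      uint32_t Value = 0x12348000;
      uint32_t lo16 = Value & 0xffff;                               // 0x8000
      uint32_t ha16 =
          ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff;    // 0x1235
      assert((int32_t)(ha16 << 16) + (int16_t)lo16 == (int32_t)Value);
      return 0;
    }

The ApplyFixup loop in the new ELF backend then ORs the adjusted value into the instruction word most-significant byte first, consistent with the /*isLittleEndian*/ false it passes to the object writer.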
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h
new file mode 100644
index 0000000..369bbdc
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCBaseInfo.h
@@ -0,0 +1,70 @@
+//===-- PPCBaseInfo.h - Top level definitions for PPC -------- --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the PPC target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc..
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCBASEINFO_H
+#define PPCBASEINFO_H
+
+#include "PPCMCTargetDesc.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+
+/// getPPCRegisterNumbering - Given the enum value for some register, e.g.
+/// PPC::F14, return the number that it corresponds to (e.g. 14).
+inline static unsigned getPPCRegisterNumbering(unsigned RegEnum) {
+ using namespace PPC;
+ switch (RegEnum) {
+ case 0: return 0;
+ case R0 : case X0 : case F0 : case V0 : case CR0: case CR0LT: return 0;
+ case R1 : case X1 : case F1 : case V1 : case CR1: case CR0GT: return 1;
+ case R2 : case X2 : case F2 : case V2 : case CR2: case CR0EQ: return 2;
+ case R3 : case X3 : case F3 : case V3 : case CR3: case CR0UN: return 3;
+ case R4 : case X4 : case F4 : case V4 : case CR4: case CR1LT: return 4;
+ case R5 : case X5 : case F5 : case V5 : case CR5: case CR1GT: return 5;
+ case R6 : case X6 : case F6 : case V6 : case CR6: case CR1EQ: return 6;
+ case R7 : case X7 : case F7 : case V7 : case CR7: case CR1UN: return 7;
+ case R8 : case X8 : case F8 : case V8 : case CR2LT: return 8;
+ case R9 : case X9 : case F9 : case V9 : case CR2GT: return 9;
+ case R10: case X10: case F10: case V10: case CR2EQ: return 10;
+ case R11: case X11: case F11: case V11: case CR2UN: return 11;
+ case R12: case X12: case F12: case V12: case CR3LT: return 12;
+ case R13: case X13: case F13: case V13: case CR3GT: return 13;
+ case R14: case X14: case F14: case V14: case CR3EQ: return 14;
+ case R15: case X15: case F15: case V15: case CR3UN: return 15;
+ case R16: case X16: case F16: case V16: case CR4LT: return 16;
+ case R17: case X17: case F17: case V17: case CR4GT: return 17;
+ case R18: case X18: case F18: case V18: case CR4EQ: return 18;
+ case R19: case X19: case F19: case V19: case CR4UN: return 19;
+ case R20: case X20: case F20: case V20: case CR5LT: return 20;
+ case R21: case X21: case F21: case V21: case CR5GT: return 21;
+ case R22: case X22: case F22: case V22: case CR5EQ: return 22;
+ case R23: case X23: case F23: case V23: case CR5UN: return 23;
+ case R24: case X24: case F24: case V24: case CR6LT: return 24;
+ case R25: case X25: case F25: case V25: case CR6GT: return 25;
+ case R26: case X26: case F26: case V26: case CR6EQ: return 26;
+ case R27: case X27: case F27: case V27: case CR6UN: return 27;
+ case R28: case X28: case F28: case V28: case CR7LT: return 28;
+ case R29: case X29: case F29: case V29: case CR7GT: return 29;
+ case R30: case X30: case F30: case V30: case CR7EQ: return 30;
+ case R31: case X31: case F31: case V31: case CR7UN: return 31;
+ default:
+ llvm_unreachable("Unhandled reg in PPCRegisterInfo::getRegisterNumbering!");
+ }
+}
+
+} // end namespace llvm;
+
+#endif
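
getPPCRegisterNumbering is the old PPCRegisterInfo::getRegisterNumbering (deleted later in this patch) hoisted into a header that both the MC layer and CodeGen can include; the MCTargetDesc split forbids PPCMCCodeEmitter from depending on CodeGen-layer headers such as PPCRegisterInfo.h. Usage is unchanged, e.g.:

    // Enum value -> hardware register number, callable from MC code with no
    // CodeGen dependency. Values follow the table above.
    unsigned N = getPPCRegisterNumbering(PPC::F14);   // 14
    unsigned M = getPPCRegisterNumbering(PPC::CR7EQ); // 30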
diff --git a/lib/Target/PowerPC/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index b3c889e..b3c889e 100644
--- a/lib/Target/PowerPC/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index b6dca83..e9424d8 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -31,6 +31,10 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
}
PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
+ if (is64Bit)
+ PointerSize = 8;
+ IsLittleEndian = false;
+
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
@@ -56,7 +60,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
ZeroDirective = "\t.space\t";
Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
- HasLCOMMDirective = true;
+ LCOMMDirectiveType = LCOMM::NoAlignment;
AssemblerDialect = 0; // Old-Style mnemonics.
}
diff --git a/lib/Target/PowerPC/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index cf73d86..262f97c3 100644
--- a/lib/Target/PowerPC/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -12,9 +12,8 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mccodeemitter"
-#include "PPC.h"
-#include "PPCRegisterInfo.h"
-#include "PPCFixupKinds.h"
+#include "MCTargetDesc/PPCBaseInfo.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCInst.h"
#include "llvm/ADT/Statistic.h"
@@ -170,7 +169,7 @@ get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
const MCOperand &MO = MI.getOperand(OpNo);
assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
(MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
- return 0x80 >> PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+ return 0x80 >> getPPCRegisterNumbering(MO.getReg());
}
@@ -182,7 +181,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
// The GPR operand should come through here though.
assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) ||
MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
- return PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+ return getPPCRegisterNumbering(MO.getReg());
}
assert(MO.isImm() &&
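
Both hunks in this file only retarget the helper call; the encodings themselves are unchanged. For mtcrf/mfocrf the CR field operand is encoded one-hot in the 8-bit FXM field with CR0 at the most significant position, hence the 0x80 >> n:

    // FXM one-hot encoding: CR0 -> 0x80, CR3 -> 0x10, CR7 -> 0x01.
    unsigned crFieldMask(unsigned N) { return 0x80u >> N; }  // N = 0..7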
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 02b887f..d5c8a9e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -13,10 +13,14 @@
#include "PPCMCTargetDesc.h"
#include "PPCMCAsmInfo.h"
+#include "InstPrinter/PPCInstPrinter.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "PPCGenInstrInfo.inc"
@@ -35,11 +39,16 @@ static MCInstrInfo *createPPCMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializePowerPCMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo);
-}
+static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) {
+ Triple TheTriple(TT);
+ bool isPPC64 = (TheTriple.getArch() == Triple::ppc64);
+ unsigned Flavour = isPPC64 ? 0 : 1;
+ unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR;
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitPPCMCRegisterInfo(X, RA, Flavour, Flavour);
+ return X;
+}
static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS) {
@@ -48,23 +57,95 @@ static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializePowerPCMCSubtargetInfo() {
+static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
+ Triple TheTriple(TT);
+ bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
+
+ MCAsmInfo *MAI;
+ if (TheTriple.isOSDarwin())
+ MAI = new PPCMCAsmInfoDarwin(isPPC64);
+ else
+ MAI = new PPCLinuxMCAsmInfo(isPPC64);
+
+ // Initial state of the frame pointer is R1.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(PPC::R1, 0);
+ MAI->addInitialFrameState(0, Dst, Src);
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+
+ if (RM == Reloc::Default) {
+ Triple T(TT);
+ if (T.isOSDarwin())
+ RM = Reloc::DynamicNoPIC;
+ else
+ RM = Reloc::Static;
+ }
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
+}
+
+// This is duplicated code. Refactor this.
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &MAB,
+ raw_ostream &OS,
+ MCCodeEmitter *Emitter,
+ bool RelaxAll,
+ bool NoExecStack) {
+ if (Triple(TT).isOSDarwin())
+ return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
+
+ return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) {
+ return new PPCInstPrinter(MAI, SyntaxVariant);
+}
+
+extern "C" void LLVMInitializePowerPCTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn C(ThePPC32Target, createPPCMCAsmInfo);
+ RegisterMCAsmInfoFn D(ThePPC64Target, createPPCMCAsmInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(ThePPC32Target, createPPCMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(ThePPC64Target, createPPCMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(ThePPC32Target, createPPCMCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(ThePPC64Target, createPPCMCRegisterInfo);
+
+ // Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(ThePPC32Target,
createPPCMCSubtargetInfo);
TargetRegistry::RegisterMCSubtargetInfo(ThePPC64Target,
createPPCMCSubtargetInfo);
-}
-static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
- Triple TheTriple(TT);
- bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
- if (TheTriple.isOSDarwin())
- return new PPCMCAsmInfoDarwin(isPPC64);
- return new PPCLinuxMCAsmInfo(isPPC64);
+ // Register the MC Code Emitter
+ TargetRegistry::RegisterMCCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter);
-}
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(ThePPC32Target, createPPCAsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(ThePPC64Target, createPPCAsmBackend);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterMCObjectStreamer(ThePPC32Target, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer);
-extern "C" void LLVMInitializePowerPCMCAsmInfo() {
- RegisterMCAsmInfoFn C(ThePPC32Target, createMCAsmInfo);
- RegisterMCAsmInfoFn D(ThePPC64Target, createMCAsmInfo);
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
}
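
This hunk folds the three old per-component initializers (MCInstrInfo, MCSubtargetInfo, MCAsmInfo) plus the registrations previously scattered across PPCTargetMachine.cpp and PPCAsmPrinter.cpp into the single LLVMInitializePowerPCTargetMC entry point the 3.0 registry expects. The shape, sketched for a hypothetical target Foo (all createFoo* names are placeholders):

    extern "C" void LLVMInitializeFooTargetMC() {
      RegisterMCAsmInfoFn X(TheFooTarget, createFooMCAsmInfo);
      TargetRegistry::RegisterMCCodeGenInfo(TheFooTarget, createFooMCCodeGenInfo);
      TargetRegistry::RegisterMCInstrInfo(TheFooTarget, createFooMCInstrInfo);
      TargetRegistry::RegisterMCRegInfo(TheFooTarget, createFooMCRegisterInfo);
      TargetRegistry::RegisterMCSubtargetInfo(TheFooTarget,
                                              createFooMCSubtargetInfo);
      TargetRegistry::RegisterMCCodeEmitter(TheFooTarget, createFooMCCodeEmitter);
      TargetRegistry::RegisterMCAsmBackend(TheFooTarget, createFooAsmBackend);
      TargetRegistry::RegisterMCObjectStreamer(TheFooTarget, createFooMCStreamer);
      TargetRegistry::RegisterMCInstPrinter(TheFooTarget, createFooMCInstPrinter);
    }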
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index cee2350..e5bf2a9 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -15,6 +15,10 @@
#define PPCMCTARGETDESC_H
namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
class MCSubtargetInfo;
class Target;
class StringRef;
@@ -22,6 +26,12 @@ class StringRef;
extern Target ThePPC32Target;
extern Target ThePPC64Target;
+MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+
+MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT);
+
} // End llvm namespace
// Defines symbolic names for PowerPC registers. This defines a mapping from
diff --git a/lib/Target/PowerPC/PPCPredicates.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
index 12bb0a1..12bb0a1 100644
--- a/lib/Target/PowerPC/PPCPredicates.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
diff --git a/lib/Target/PowerPC/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index b2c8315..f872e86 100644
--- a/lib/Target/PowerPC/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -14,8 +14,6 @@
#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
-#include "PPC.h"
-
namespace llvm {
namespace PPC {
/// Predicate - These are "(BI << 5) | BO" for various predicates.
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index 7191dd1..5dc1863 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -15,6 +15,7 @@
#ifndef LLVM_TARGET_POWERPC_H
#define LLVM_TARGET_POWERPC_H
+#include "MCTargetDesc/PPCBaseInfo.h"
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include <string>
@@ -30,22 +31,12 @@ namespace llvm {
class MachineInstr;
class AsmPrinter;
class MCInst;
- class MCCodeEmitter;
- class MCContext;
- class MCInstrInfo;
- class MCSubtargetInfo;
class TargetMachine;
- class TargetAsmBackend;
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
JITCodeEmitter &MCE);
- MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
- TargetAsmBackend *createPPCAsmBackend(const Target &, const std::string &);
-
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP, bool isDarwin);
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index aabf494..2d5d302 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -43,9 +43,9 @@ def FeatureAltivec : SubtargetFeature<"altivec","HasAltivec", "true",
def FeatureGPUL : SubtargetFeature<"gpul","IsGigaProcessor", "true",
"Enable GPUL instructions">;
def FeatureFSqrt : SubtargetFeature<"fsqrt","HasFSQRT", "true",
- "Enable the fsqrt instruction">;
+ "Enable the fsqrt instruction">;
def FeatureSTFIWX : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
- "Enable the stfiwx instruction">;
+ "Enable the stfiwx instruction">;
//===----------------------------------------------------------------------===//
// Register File Description
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 9de2200..9528459 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -18,9 +18,9 @@
#define DEBUG_TYPE "asmprinter"
#include "PPC.h"
-#include "PPCPredicates.h"
#include "PPCTargetMachine.h"
#include "PPCSubtarget.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
@@ -43,11 +43,11 @@
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSet.h"
@@ -679,18 +679,8 @@ static AsmPrinter *createPPCAsmPrinterPass(TargetMachine &tm,
return new PPCLinuxAsmPrinter(tm, Streamer);
}
-static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
- unsigned SyntaxVariant,
- const MCAsmInfo &MAI) {
- return new PPCInstPrinter(MAI, SyntaxVariant);
-}
-
-
// Force static initialization.
extern "C" void LLVMInitializePowerPCAsmPrinter() {
TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
-
- TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
}
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index e161d23..475edf3 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -19,7 +19,7 @@
#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
-#include "PPCPredicates.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ADT/Statistic.h"
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index 42232a0..4a1f182 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -140,7 +140,7 @@ unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI,
const MachineOperand &MO = MI.getOperand(OpNo);
assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
(MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
- return 0x80 >> PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+ return 0x80 >> getPPCRegisterNumbering(MO.getReg());
}
MachineRelocation PPCCodeEmitter::GetRelocation(const MachineOperand &MO,
@@ -250,7 +250,7 @@ unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
// The GPR operand should come through here though.
assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) ||
MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
- return PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+ return getPPCRegisterNumbering(MO.getReg());
}
assert(MO.isImm() &&
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index 375e000..7dead10 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -109,14 +109,14 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
for (MachineRegisterInfo::livein_iterator
I = MF->getRegInfo().livein_begin(),
E = MF->getRegInfo().livein_end(); I != E; ++I) {
- unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(I->first);
+ unsigned RegNo = getPPCRegisterNumbering(I->first);
if (VRRegNo[RegNo] == I->first) // If this really is a vector reg.
UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked.
}
for (MachineRegisterInfo::liveout_iterator
I = MF->getRegInfo().liveout_begin(),
E = MF->getRegInfo().liveout_end(); I != E; ++I) {
- unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(*I);
+ unsigned RegNo = getPPCRegisterNumbering(*I);
if (VRRegNo[RegNo] == *I) // If this really is a vector reg.
UsedRegMask &= ~(1 << (31-RegNo)); // Doesn't need to be marked.
}
@@ -712,13 +712,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-void PPCFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
- // Initial state of the frame pointer is R1.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(PPC::R1, 0);
- Moves.push_back(MachineMove(0, Dst, Src));
-}
-
static bool spillsCR(const MachineFunction &MF) {
const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
return FuncInfo->isCRSpilled();
@@ -885,7 +878,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
}
- LowerBound -= (31 - PPCRegisterInfo::getRegisterNumbering(MinFPR) + 1) * 8;
+ LowerBound -= (31 - getPPCRegisterNumbering(MinFPR) + 1) * 8;
}
// Check whether the frame pointer register is allocated. If so, make sure it
@@ -919,8 +912,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
}
unsigned MinReg =
- std::min<unsigned>(PPCRegisterInfo::getRegisterNumbering(MinGPR),
- PPCRegisterInfo::getRegisterNumbering(MinG8R));
+ std::min<unsigned>(getPPCRegisterNumbering(MinGPR),
+ getPPCRegisterNumbering(MinG8R));
if (Subtarget.isPPC64()) {
LowerBound -= (31 - MinReg + 1) * 8;
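
The two HandleVRSaveUpdate hunks are pure renames; the mask arithmetic stays as is. VRSAVE uses PowerPC bit numbering (bit 0 is the most significant bit), with bit n tracking vector register Vn, so marking Vn live or dead manipulates C-level bit (31 - n):

    #include <cassert>
    #include <cstdint>

    // V0 corresponds to the MSB of the 32-bit VRSAVE mask.
    int main() {
      uint32_t UsedRegMask = 0xffffffffu;
      unsigned RegNo = 0;                    // e.g. getPPCRegisterNumbering(PPC::V0)
      UsedRegMask &= ~(1u << (31 - RegNo));  // clear V0's bit
      assert(UsedRegMask == 0x7fffffffu);
      return 0;
    }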
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 0c18de1..20faa71 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -40,7 +40,6 @@ public:
bool hasFP(const MachineFunction &MF) const;
bool needsFP(const MachineFunction &MF) const;
- void getInitialFrameState(std::vector<MachineMove> &Moves) const;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 2176c02..6f204cc 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -14,8 +14,8 @@
#define DEBUG_TYPE "ppc-codegen"
#include "PPC.h"
-#include "PPCPredicates.h"
#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 9741a39..d6b8a9e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -14,8 +14,8 @@
#include "PPCISelLowering.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
-#include "PPCPredicates.h"
#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
@@ -211,7 +211,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// TRAMPOLINE is custom lowered.
- setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -365,7 +366,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
}
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
+
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
setStackPointerRegisterToSaveRestore(PPC::X1);
@@ -401,12 +406,14 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
if (PPCSubTarget.isDarwin())
setPrefFunctionAlignment(4);
+ setInsertFencesForAtomic(true);
+
computeRegisterProperties();
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.
-unsigned PPCTargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
const TargetMachine &TM = getTargetMachine();
// Darwin passes everything on 4 byte boundary.
if (TM.getSubtarget<PPCSubtarget>().isDarwin())
@@ -463,7 +470,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
}
}
-MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const {
+EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
return MVT::i32;
}
@@ -1368,8 +1375,13 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(), false, false, 0);
}
-SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ return Op.getOperand(0);
+}
+
+SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
@@ -1378,7 +1390,7 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = (PtrVT == MVT::i64);
- const Type *IntPtrTy =
+ Type *IntPtrTy =
DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType(
*DAG.getContext());
@@ -1398,16 +1410,13 @@ SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op,
// Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
std::pair<SDValue, SDValue> CallResult =
- LowerCallTo(Chain, Op.getValueType().getTypeForEVT(*DAG.getContext()),
+ LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
false, false, false, false, 0, CallingConv::C, false,
/*isReturnValueUsed=*/true,
DAG.getExternalSymbol("__trampoline_setup", PtrVT),
Args, DAG, dl);
- SDValue Ops[] =
- { CallResult.first, CallResult.second };
-
- return DAG.getMergeValues(Ops, 2, dl);
+ return CallResult.second;
}
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
@@ -2550,7 +2559,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
unsigned OpFlags = 0;
if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (!PPCSubTarget.getTargetTriple().isMacOSX() ||
+ (PPCSubTarget.getTargetTriple().isMacOSX() &&
PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
(G->getGlobal()->isDeclaration() ||
G->getGlobal()->isWeakForLinker())) {
@@ -2574,7 +2583,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
unsigned char OpFlags = 0;
if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
- (!PPCSubTarget.getTargetTriple().isMacOSX() ||
+ (PPCSubTarget.getTargetTriple().isMacOSX() &&
PPCSubTarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
// PC-relative references to external symbols should go through $stub,
// unless we're building with the leopard linker or later, which
@@ -2941,6 +2950,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
SmallVector<SDValue, 8> MemOpChains;
+ bool seenFloatArg = false;
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, j = 0, e = ArgLocs.size();
i != e;
@@ -2985,6 +2995,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
}
if (VA.isRegLoc()) {
+ seenFloatArg |= VA.getLocVT().isFloatingPoint();
// Put argument in a physical register.
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
@@ -3011,9 +3022,11 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Set CR6 to true if this is a vararg call.
+ // Set CR6 to true if this is a vararg call with floating args passed in
+ // registers.
if (isVarArg) {
- SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0);
+ SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET,
+ dl, MVT::i32), 0);
RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
}
@@ -3403,6 +3416,17 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
Ins, InVals);
}
+bool
+PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_PPC);
+}
+
SDValue
PPCTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -4490,7 +4514,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::GlobalTLSAddress: llvm_unreachable("TLS not implemented for PPC");
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG, PPCSubTarget);
@@ -5504,7 +5529,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
// but allow it at the lowest weight.
if (CallOperandVal == NULL)
return CW_Default;
- const Type *type = CallOperandVal->getType();
+ Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
@@ -5634,7 +5659,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const {
+ Type *Ty) const {
// FIXME: PPC does not allow r+i addressing modes for vectors!
// PPC allows a sign-extended 16-bit immediate field.
@@ -5670,7 +5695,7 @@ bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
-bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,const Type *Ty) const{
+bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{
// PPC allows a sign-extended 16-bit immediate field.
return (V > -(1 << 16) && V < (1 << 16)-1);
}
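
Several independent changes land in this file. First, ISD::TRAMPOLINE is split into INIT_TRAMPOLINE (runs __trampoline_setup and produces only a chain) and ADJUST_TRAMPOLINE (yields the callable address; on PPC that is the trampoline block itself, so the lowering simply returns its operand). Second, LowerCall_SVR4 now sets CR bit 6 honestly: CR1EQ is bit 6 of the condition register, which the SVR4 ABI uses to tell a vararg callee whether floating-point arguments arrived in FPRs and its FPR save area is live; the old code executed CRSET unconditionally. Third, the new CanLowerReturn override lets common code demote return values that RetCC_PPC cannot assign to registers into indirect sret-style returns instead of failing in LowerReturn. The Darwin hunks also tighten the $stub condition so that only pre-10.5 Mac OS X triples take the stub path; previously any non-MacOSX triple matched. A condensed, illustrative fragment of the CR6 logic (not compilable on its own):

    // seenFloatArg accumulates over the argument-assignment loop; a vararg
    // call then materializes CR1EQ (= CR bit 6) as 1 or 0 accordingly.
    seenFloatArg |= VA.getLocVT().isFloatingPoint();   // per register arg
    // ... after the loop, for vararg calls:
    SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET,
                                     dl, MVT::i32), 0);
    RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));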
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 986b4e7..430e45e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -246,7 +246,7 @@ namespace llvm {
virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
/// getSetCCResultType - Return the ISD::SETCC ValueType
- virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ virtual EVT getSetCCResultType(EVT VT) const;
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
@@ -323,7 +323,7 @@ namespace llvm {
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
/// alignment, not its logarithm.
- unsigned getByValTypeAlignment(const Type *Ty) const;
+ unsigned getByValTypeAlignment(Type *Ty) const;
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
@@ -334,12 +334,12 @@ namespace llvm {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+ virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
/// isLegalAddressImmediate - Return true if the integer value can be used
/// as the offset of the target addressing mode for load / store of the
/// given type.
- virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+ virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const;
/// isLegalAddressImmediate - Return true if the GlobalValue can be used as
/// the offset of the target addressing mode.
@@ -390,7 +390,8 @@ namespace llvm {
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
const PPCSubtarget &Subtarget) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG,
@@ -444,6 +445,12 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
+ virtual bool
+ CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const;
+
virtual SDValue
LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 143444f..2bc109c 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -15,18 +15,18 @@
#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCMachineFunctionInfo.h"
-#include "PPCPredicates.h"
#include "PPCTargetMachine.h"
#include "PPCHazardRecognizers.h"
+#include "MCTargetDesc/PPCPredicates.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/STLExtras.h"
@@ -334,7 +334,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr*> &NewMIs) const{
DebugLoc DL;
- if (RC == PPC::GPRCRegisterClass) {
+ if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) {
if (SrcReg != PPC::LR) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
.addReg(SrcReg,
@@ -350,7 +350,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
getKillRegState(isKill)),
FrameIdx));
}
- } else if (RC == PPC::G8RCRegisterClass) {
+ } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) {
if (SrcReg != PPC::LR8) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
.addReg(SrcReg,
@@ -366,17 +366,17 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
getKillRegState(isKill)),
FrameIdx));
}
- } else if (RC == PPC::F8RCRegisterClass) {
+ } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
.addReg(SrcReg,
getKillRegState(isKill)),
FrameIdx));
- } else if (RC == PPC::F4RCRegisterClass) {
+ } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
.addReg(SrcReg,
getKillRegState(isKill)),
FrameIdx));
- } else if (RC == PPC::CRRCRegisterClass) {
+ } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) {
if ((EnablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
(EnablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
// FIXME (64-bit): Enable
@@ -402,7 +402,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
// If the saved register wasn't CR0, shift the bits left so that they are
// in CR0's slot.
if (SrcReg != PPC::CR0) {
- unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(SrcReg)*4;
+ unsigned ShiftBits = getPPCRegisterNumbering(SrcReg)*4;
// rlwinm scratch, scratch, ShiftBits, 0, 31.
NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg)
.addReg(ScratchReg).addImm(ShiftBits)
@@ -414,7 +414,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
getKillRegState(isKill)),
FrameIdx));
}
- } else if (RC == PPC::CRBITRCRegisterClass) {
+ } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) {
// FIXME: We use CRi here because there is no mtcrf on a bit. Since the
// backend currently only uses CR1EQ as an individual bit, this should
// not cause any bug. If we need other uses of CR bits, the following
@@ -448,7 +448,7 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx,
PPC::CRRCRegisterClass, NewMIs);
- } else if (RC == PPC::VRRCRegisterClass) {
+ } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) {
// We don't have indexed addressing for vector loads. Emit:
// R0 = ADDI FI#
// STVX VAL, 0, R0
@@ -499,7 +499,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr*> &NewMIs)const{
- if (RC == PPC::GPRCRegisterClass) {
+ if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) {
if (DestReg != PPC::LR) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
DestReg), FrameIdx));
@@ -508,7 +508,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
PPC::R11), FrameIdx));
NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR)).addReg(PPC::R11));
}
- } else if (RC == PPC::G8RCRegisterClass) {
+ } else if (PPC::G8RCRegisterClass->hasSubClassEq(RC)) {
if (DestReg != PPC::LR8) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
FrameIdx));
@@ -517,13 +517,13 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
PPC::R11), FrameIdx));
NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::R11));
}
- } else if (RC == PPC::F8RCRegisterClass) {
+ } else if (PPC::F8RCRegisterClass->hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
FrameIdx));
- } else if (RC == PPC::F4RCRegisterClass) {
+ } else if (PPC::F4RCRegisterClass->hasSubClassEq(RC)) {
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
FrameIdx));
- } else if (RC == PPC::CRRCRegisterClass) {
+ } else if (PPC::CRRCRegisterClass->hasSubClassEq(RC)) {
  // FIXME: We need a scratch reg here. The trouble with using R0 is that

// it's possible for the stack frame to be so big the save location is
// out of range of immediate offsets, necessitating another register.
@@ -537,7 +537,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
// If the reloaded register isn't CR0, shift the bits right so that they are
// in the right CR's slot.
if (DestReg != PPC::CR0) {
- unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(DestReg)*4;
+ unsigned ShiftBits = getPPCRegisterNumbering(DestReg)*4;
// rlwinm r11, r11, 32-ShiftBits, 0, 31.
NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), ScratchReg)
.addReg(ScratchReg).addImm(32-ShiftBits).addImm(0)
@@ -546,7 +546,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTCRF), DestReg)
.addReg(ScratchReg));
- } else if (RC == PPC::CRBITRCRegisterClass) {
+ } else if (PPC::CRBITRCRegisterClass->hasSubClassEq(RC)) {
unsigned Reg = 0;
if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT ||
@@ -577,7 +577,7 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx,
PPC::CRRCRegisterClass, NewMIs);
- } else if (RC == PPC::VRRCRegisterClass) {
+ } else if (PPC::VRRCRegisterClass->hasSubClassEq(RC)) {
// We don't have indexed addressing for vector loads. Emit:
// R0 = ADDI FI#
// Dest = LVX 0, R0
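
The RC == X comparisons throughout this file break once register classes grow sub-classes: a virtual register constrained to a sub-class of GPRC would compare unequal and fall through every branch. hasSubClassEq(RC) accepts the class itself or any sub-class, so the spill/reload code keeps working. Sketch (the sub-class scenario is hypothetical, for illustration):

    // Pointer equality misses sub-classes: if RC is some sub-class of GPRC,
    // `RC == PPC::GPRCRegisterClass` is false, yet a GPR spill is still the
    // right thing to emit.
    if (PPC::GPRCRegisterClass->hasSubClassEq(RC)) {
      // emit the STW/STD spill sequence as above
    }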
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 773578c..f248b5b 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1053,6 +1053,10 @@ def CRSET : XLForm_1_ext<19, 289, (outs CRBITRC:$dst), (ins),
"creqv $dst, $dst, $dst", BrCR,
[]>;
+def CRUNSET: XLForm_1_ext<19, 193, (outs CRBITRC:$dst), (ins),
+ "crxor $dst, $dst, $dst", BrCR,
+ []>;
+
// XFX-Form instructions. Instructions that deal with SPRs.
//
let Uses = [CTR] in {
@@ -1472,5 +1476,7 @@ def : Pat<(membarrier (i32 imm /*ll*/),
(i32 imm /*device*/)),
(SYNC)>;
+def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
+
include "PPCInstrAltivec.td"
include "PPCInstr64Bit.td"
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 9c2428b..2e90b7a 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -28,7 +28,6 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -68,52 +67,12 @@ PPCRegisterInfo::requiresRegisterScavenging(const MachineFunction &) const {
(EnablePPC64RS && Subtarget.isPPC64()));
}
-/// getRegisterNumbering - Given the enum value for some register, e.g.
-/// PPC::F14, return the number that it corresponds to (e.g. 14).
-unsigned PPCRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
- using namespace PPC;
- switch (RegEnum) {
- case 0: return 0;
- case R0 : case X0 : case F0 : case V0 : case CR0: case CR0LT: return 0;
- case R1 : case X1 : case F1 : case V1 : case CR1: case CR0GT: return 1;
- case R2 : case X2 : case F2 : case V2 : case CR2: case CR0EQ: return 2;
- case R3 : case X3 : case F3 : case V3 : case CR3: case CR0UN: return 3;
- case R4 : case X4 : case F4 : case V4 : case CR4: case CR1LT: return 4;
- case R5 : case X5 : case F5 : case V5 : case CR5: case CR1GT: return 5;
- case R6 : case X6 : case F6 : case V6 : case CR6: case CR1EQ: return 6;
- case R7 : case X7 : case F7 : case V7 : case CR7: case CR1UN: return 7;
- case R8 : case X8 : case F8 : case V8 : case CR2LT: return 8;
- case R9 : case X9 : case F9 : case V9 : case CR2GT: return 9;
- case R10: case X10: case F10: case V10: case CR2EQ: return 10;
- case R11: case X11: case F11: case V11: case CR2UN: return 11;
- case R12: case X12: case F12: case V12: case CR3LT: return 12;
- case R13: case X13: case F13: case V13: case CR3GT: return 13;
- case R14: case X14: case F14: case V14: case CR3EQ: return 14;
- case R15: case X15: case F15: case V15: case CR3UN: return 15;
- case R16: case X16: case F16: case V16: case CR4LT: return 16;
- case R17: case X17: case F17: case V17: case CR4GT: return 17;
- case R18: case X18: case F18: case V18: case CR4EQ: return 18;
- case R19: case X19: case F19: case V19: case CR4UN: return 19;
- case R20: case X20: case F20: case V20: case CR5LT: return 20;
- case R21: case X21: case F21: case V21: case CR5GT: return 21;
- case R22: case X22: case F22: case V22: case CR5EQ: return 22;
- case R23: case X23: case F23: case V23: case CR5UN: return 23;
- case R24: case X24: case F24: case V24: case CR6LT: return 24;
- case R25: case X25: case F25: case V25: case CR6GT: return 25;
- case R26: case X26: case F26: case V26: case CR6EQ: return 26;
- case R27: case X27: case F27: case V27: case CR6UN: return 27;
- case R28: case X28: case F28: case V28: case CR7LT: return 28;
- case R29: case X29: case F29: case V29: case CR7GT: return 29;
- case R30: case X30: case F30: case V30: case CR7EQ: return 30;
- case R31: case X31: case F31: case V31: case CR7UN: return 31;
- default:
- llvm_unreachable("Unhandled reg in PPCRegisterInfo::getRegisterNumbering!");
- }
-}
-
PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
const TargetInstrInfo &tii)
- : PPCGenRegisterInfo(), Subtarget(ST), TII(tii) {
+ : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
+ ST.isPPC64() ? 0 : 1,
+ ST.isPPC64() ? 0 : 1),
+ Subtarget(ST), TII(tii) {
ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX;
ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX;
ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX;
@@ -519,7 +478,7 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
// rlwinm rA, rA, ShiftBits, 0, 31.
BuildMI(MBB, II, dl, TII.get(PPC::RLWINM), Reg)
.addReg(Reg, RegState::Kill)
- .addImm(PPCRegisterInfo::getRegisterNumbering(SrcReg) * 4)
+ .addImm(getPPCRegisterNumbering(SrcReg) * 4)
.addImm(0)
.addImm(31);
@@ -668,10 +627,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false);
}
-unsigned PPCRegisterInfo::getRARegister() const {
- return !Subtarget.isPPC64() ? PPC::LR : PPC::LR8;
-}
-
unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -688,27 +643,3 @@ unsigned PPCRegisterInfo::getEHExceptionRegister() const {
unsigned PPCRegisterInfo::getEHHandlerRegister() const {
return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
}
-
-/// DWARFFlavour - Flavour of dwarf regnumbers
-///
-namespace DWARFFlavour {
- enum {
- PPC64 = 0, PPC32 = 1
- };
-}
-
-int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- // FIXME: Most probably dwarf numbers differs for Linux and Darwin
- unsigned Flavour = Subtarget.isPPC64() ?
- DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
-
- return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, Flavour);
-}
-
-int PPCRegisterInfo::getLLVMRegNum(unsigned RegNum, bool isEH) const {
- // FIXME: Most probably dwarf numbers differs for Linux and Darwin
- unsigned Flavour = Subtarget.isPPC64() ?
- DWARFFlavour::PPC64 : DWARFFlavour::PPC32;
-
- return PPCGenRegisterInfo::getLLVMRegNumFull(RegNum, Flavour);
-}
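
Two of the deletions here are relocations, not removals: getRegisterNumbering now lives in MCTargetDesc/PPCBaseInfo.h (added earlier in this patch), and the return-address register plus the DWARF/EH register-number flavours move into the TableGen-generated PPCGenRegisterInfo base, which is why getRARegister, getDwarfRegNum and getLLVMRegNum disappear. Matching the InitPPCMCRegisterInfo call in PPCMCTargetDesc.cpp above, the constructor arguments encode exactly the deleted DWARFFlavour enum:

    // Same mapping the removed code used: DWARFFlavour::PPC64 = 0,
    // DWARFFlavour::PPC32 = 1; the first argument is the RA register.
    unsigned RA    = ST.isPPC64() ? PPC::LR8 : PPC::LR;
    unsigned Dwarf = ST.isPPC64() ? 0 : 1;
    unsigned EH    = ST.isPPC64() ? 0 : 1;
    // PPCGenRegisterInfo(RA, Dwarf, EH)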
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 33fe5eb..1cc7213 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -33,10 +33,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
public:
PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
- /// getRegisterNumbering - Given the enum value for some register, e.g.
- /// PPC::F14, return the number that it corresponds to (e.g. 14).
- static unsigned getRegisterNumbering(unsigned RegEnum);
-
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const;
@@ -62,15 +58,11 @@ public:
int SPAdj, RegScavenger *RS = NULL) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
-
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index 5ea9b0f..cf194de 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -15,7 +15,7 @@
#include "PPC.h"
#include "llvm/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#include <cstdlib>
#define GET_SUBTARGETINFO_TARGET_DESC
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index e0ea5ad..f5744b8 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -16,76 +16,43 @@
#include "llvm/PassManager.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-// This is duplicated code. Refactor this.
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
- MCContext &Ctx, TargetAsmBackend &TAB,
- raw_ostream &OS,
- MCCodeEmitter *Emitter,
- bool RelaxAll,
- bool NoExecStack) {
- if (Triple(TT).isOSDarwin())
- return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
-
- return NULL;
-}
-
extern "C" void LLVMInitializePowerPCTarget() {
// Register the targets
RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
-
- // Register the MC Code Emitter
- TargetRegistry::RegisterCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter);
- TargetRegistry::RegisterCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter);
-
-
- // Register the asm backend.
- TargetRegistry::RegisterAsmBackend(ThePPC32Target, createPPCAsmBackend);
- TargetRegistry::RegisterAsmBackend(ThePPC64Target, createPPCAsmBackend);
-
- // Register the object streamer.
- TargetRegistry::RegisterObjectStreamer(ThePPC32Target, createMCStreamer);
- TargetRegistry::RegisterObjectStreamer(ThePPC64Target, createMCStreamer);
}
-
-PPCTargetMachine::PPCTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS, bool is64Bit)
- : LLVMTargetMachine(T, TT, CPU, FS),
+PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
+ bool is64Bit)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS, is64Bit),
DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
FrameLowering(Subtarget), JITInfo(*this, is64Bit),
TLInfo(*this), TSInfo(*this),
InstrItins(Subtarget.getInstrItineraryData()) {
-
- if (getRelocationModel() == Reloc::Default) {
- if (Subtarget.isDarwin())
- setRelocationModel(Reloc::DynamicNoPIC);
- else
- setRelocationModel(Reloc::Static);
- }
}
/// Override this for PowerPC. Tail merging happily breaks up instruction issue
/// groups, which typically degrades performance.
bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; }
-PPC32TargetMachine::PPC32TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : PPCTargetMachine(T, TT, CPU, FS, false) {
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : PPCTargetMachine(T, TT, CPU, FS, RM, CM, false) {
}
-PPC64TargetMachine::PPC64TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : PPCTargetMachine(T, TT, CPU, FS, true) {
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : PPCTargetMachine(T, TT, CPU, FS, RM, CM, true) {
}
@@ -110,19 +77,11 @@ bool PPCTargetMachine::addPreEmitPass(PassManagerBase &PM,
bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
JITCodeEmitter &JCE) {
- // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
// FIXME: This should be moved to TargetJITInfo!!
- if (Subtarget.isPPC64()) {
- // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many
- // instructions to materialize arbitrary global variable + function +
- // constant pool addresses.
- setRelocationModel(Reloc::PIC_);
+ if (Subtarget.isPPC64())
// Temporary workaround for the inability of PPC64 JIT to handle jump
// tables.
DisableJumpTables = true;
- } else {
- setRelocationModel(Reloc::Static);
- }
// Inform the subtarget that we are in JIT mode. FIXME: does this break macho
// writing?
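
With relocation and code models now passed into the constructor and stored in MCCodeGenInfo, the setRelocationModel() mutators are gone: the Darwin-vs-ELF defaulting that used to live in this constructor moved to createPPCMCCodeGenInfo() earlier in the patch, and the JIT can no longer flip the model after the fact. Callers choose the model at creation time; a sketch, assuming the 3.0-era createTargetMachine signature:

    // Reloc::Default is resolved per-triple by createPPCMCCodeGenInfo()
    // rather than patched after construction.
    TargetMachine *TM =
        TheTarget->createTargetMachine(TripleName, "generic", "",
                                       Reloc::Default, CodeModel::Default);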
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index baf07e3..d06f084 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -40,9 +40,9 @@ class PPCTargetMachine : public LLVMTargetMachine {
InstrItineraryData InstrItins;
public:
- PPCTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS,
- bool is64Bit);
+ PPCTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM, bool is64Bit);
virtual const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const PPCFrameLowering *getFrameLowering() const {
@@ -77,16 +77,18 @@ public:
///
class PPC32TargetMachine : public PPCTargetMachine {
public:
- PPC32TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ PPC32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
};
/// PPC64TargetMachine - PowerPC 64-bit target machine.
///
class PPC64TargetMachine : public PPCTargetMachine {
public:
- PPC64TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ PPC64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
};
} // end namespace llvm
diff --git a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
index 058d599..f63111f 100644
--- a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
+++ b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMPowerPCInfo
PowerPCTargetInfo.cpp
)
-add_dependencies(LLVMPowerPCInfo PowerPCCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMPowerPCInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMPowerPCInfo PowerPCCommonTableGen)
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index ad607d0..5dc8568 100644
--- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "PPC.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::ThePPC32Target, llvm::ThePPC64Target;
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 4cc9534..1f69ffb 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -2348,3 +2348,13 @@ which can do this in a single operation (instruction or libcall). It is
probably best to do this in the code generator.
//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return (x & y) == 0 || x == 0; }
+should fold to (x & y) == 0.
+
+//===---------------------------------------------------------------------===//
+
+unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
+should fold to x > y.
+
+//===---------------------------------------------------------------------===//
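
Both proposed folds are easy to sanity-check by brute force over a small bit width; a minimal standalone checker (hypothetical, not part of the tree) could look like:

  #include <cstdio>

  // Exhaustively verify the two proposed folds over all 8-bit values.
  // If either rewrite ever disagrees with the original, report it.
  int main() {
    for (unsigned x = 0; x < 256; ++x)
      for (unsigned y = 0; y < 256; ++y) {
        bool orig1 = ((x & y) == 0) || (x == 0);  // candidate: (x & y) == 0
        bool fold1 = ((x & y) == 0);
        bool orig2 = (x > y) && (x != 0);         // candidate: x > y
        bool fold2 = (x > y);
        if (orig1 != fold1 || orig2 != fold2) {
          std::printf("counterexample: x=%u y=%u\n", x, y);
          return 1;
        }
      }
    std::printf("both folds hold for all 8-bit inputs\n");
    return 0;
  }
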
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index c77ded4..5b87849 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -1,11 +1,12 @@
set(LLVM_TARGET_DEFINITIONS Sparc.td)
-tablegen(SparcGenRegisterInfo.inc -gen-register-info)
-tablegen(SparcGenInstrInfo.inc -gen-instr-info)
-tablegen(SparcGenAsmWriter.inc -gen-asm-writer)
-tablegen(SparcGenDAGISel.inc -gen-dag-isel)
-tablegen(SparcGenSubtargetInfo.inc -gen-subtarget)
-tablegen(SparcGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(SparcGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(SparcGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(SparcGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(SparcGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(SparcGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(SparcGenCallingConv.inc -gen-callingconv)
+add_public_tablegen_target(SparcCommonTableGen)
add_llvm_target(SparcCodeGen
DelaySlotFiller.cpp
@@ -21,5 +22,17 @@ add_llvm_target(SparcCodeGen
SparcSelectionDAGInfo.cpp
)
+add_llvm_library_dependencies(LLVMSparcCodeGen
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSparcDesc
+ LLVMSparcInfo
+ LLVMSupport
+ LLVMTarget
+ )
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
index 1e8c029..d3bdf0b 100644
--- a/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Sparc/MCTargetDesc/CMakeLists.txt
@@ -2,3 +2,11 @@ add_llvm_library(LLVMSparcDesc
SparcMCTargetDesc.cpp
SparcMCAsmInfo.cpp
)
+
+add_llvm_library_dependencies(LLVMSparcDesc
+ LLVMMC
+ LLVMSparcInfo
+ LLVMSupport
+ )
+
+add_dependencies(LLVMSparcDesc SparcCommonTableGen)
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index cb92a2b..cb2a7df 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -13,10 +13,11 @@
#include "SparcMCTargetDesc.h"
#include "SparcMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "SparcGenInstrInfo.inc"
@@ -35,8 +36,10 @@ static MCInstrInfo *createSparcMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeSparcMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo);
+static MCRegisterInfo *createSparcMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitSparcMCRegisterInfo(X, SP::I7);
+ return X;
}
static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -46,12 +49,31 @@ static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializeSparcMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget,
- createSparcMCSubtargetInfo);
+static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
}
-extern "C" void LLVMInitializeSparcMCAsmInfo() {
+extern "C" void LLVMInitializeSparcTargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfo<SparcELFMCAsmInfo> X(TheSparcTarget);
RegisterMCAsmInfo<SparcELFMCAsmInfo> Y(TheSparcV9Target);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheSparcTarget,
+ createSparcMCCodeGenInfo);
+ TargetRegistry::RegisterMCCodeGenInfo(TheSparcV9Target,
+ createSparcMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheSparcTarget, createSparcMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheSparcTarget, createSparcMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheSparcTarget,
+ createSparcMCSubtargetInfo);
}
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index edde842..345e1bc 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -22,9 +22,9 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 6f30d3f..d70b163 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -631,8 +631,8 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
assert(CalleeFn->hasStructRetAttr() &&
"Callee does not have the StructRet attribute.");
- const PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
- const Type *ElementTy = Ty->getElementType();
+ PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
+ Type *ElementTy = Ty->getElementType();
return getTargetData()->getTypeAllocSize(ElementTy);
}
@@ -748,8 +748,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
- // SPARC has no intrinsics for these particular operations.
+ // FIXME: There are instructions available for ATOMIC_FENCE
+ // on SparcV8 and later.
setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 4e3ddf8..7a6bf50 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -17,8 +17,8 @@
#include "SparcSubtarget.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 0acdd2c..8c16251 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Type.h"
@@ -31,7 +30,7 @@ using namespace llvm;
SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
const TargetInstrInfo &tii)
- : SparcGenRegisterInfo(), Subtarget(st), TII(tii) {
+ : SparcGenRegisterInfo(SP::I7), Subtarget(st), TII(tii) {
}
const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
@@ -113,10 +112,6 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
void SparcRegisterInfo::
processFunctionBeforeFrameFinalized(MachineFunction &MF) const {}
-unsigned SparcRegisterInfo::getRARegister() const {
- return SP::I7;
-}
-
unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return SP::I6;
}
@@ -130,11 +125,3 @@ unsigned SparcRegisterInfo::getEHHandlerRegister() const {
llvm_unreachable("What is the exception handler register");
return 0;
}
-
-int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- return SparcGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int SparcRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
- return SparcGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
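
The same refactoring recurs across targets in this import: the return-address register moves out of a virtual getRARegister() hook and into the TableGen'd base-class constructor (SparcGenRegisterInfo(SP::I7) above, SystemZGenRegisterInfo(0) further down). A self-contained sketch of the shape, with stand-ins in place of the generated code:

  #include <cstdio>

  // Stand-ins for the TableGen'd base class and register enum; the real
  // names (SparcGenRegisterInfo, SP::I7) come from generated code.
  namespace SP { enum { I7 = 31 }; }

  struct GenRegisterInfo {
    explicit GenRegisterInfo(unsigned RA) : RAReg(RA) {}
    unsigned getRARegister() const { return RAReg; }  // no virtual hook needed
  private:
    unsigned RAReg;
  };

  struct SparcLikeRegisterInfo : GenRegisterInfo {
    SparcLikeRegisterInfo() : GenRegisterInfo(SP::I7) {}  // RA baked in here
  };

  int main() {
    std::printf("RA register id: %u\n",
                SparcLikeRegisterInfo().getRARegister());
    return 0;
  }
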
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index ec9e63a..f845667 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -46,15 +46,11 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
-
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index de647e8..6c501cf 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -13,7 +13,7 @@
#include "SparcSubtarget.h"
#include "Sparc.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index cbe6d87..3d7b4a4 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -13,7 +13,7 @@
#include "Sparc.h"
#include "SparcTargetMachine.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
extern "C" void LLVMInitializeSparcTarget() {
@@ -24,10 +24,11 @@ extern "C" void LLVMInitializeSparcTarget() {
/// SparcTargetMachine ctor - Create an ILP32 architecture model
///
-SparcTargetMachine::SparcTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS, bool is64bit)
- : LLVMTargetMachine(T, TT, CPU, FS),
+SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
+ bool is64bit)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS, is64bit),
DataLayout(Subtarget.getDataLayout()),
TLInfo(*this), TSInfo(*this), InstrInfo(Subtarget),
@@ -51,15 +52,15 @@ bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM,
}
SparcV8TargetMachine::SparcV8TargetMachine(const Target &T,
- const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : SparcTargetMachine(T, TT, CPU, FS, false) {
+ StringRef TT, StringRef CPU,
+ StringRef FS, Reloc::Model RM,
+ CodeModel::Model CM)
+ : SparcTargetMachine(T, TT, CPU, FS, RM, CM, false) {
}
SparcV9TargetMachine::SparcV9TargetMachine(const Target &T,
- const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : SparcTargetMachine(T, TT, CPU, FS, true) {
+ StringRef TT, StringRef CPU,
+ StringRef FS, Reloc::Model RM,
+ CodeModel::Model CM)
+ : SparcTargetMachine(T, TT, CPU, FS, RM, CM, true) {
}
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 799fc49..3c907dd 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -33,9 +33,9 @@ class SparcTargetMachine : public LLVMTargetMachine {
SparcInstrInfo InstrInfo;
SparcFrameLowering FrameLowering;
public:
- SparcTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS,
- bool is64bit);
+ SparcTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM, bool is64bit);
virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const TargetFrameLowering *getFrameLowering() const {
@@ -62,16 +62,18 @@ public:
///
class SparcV8TargetMachine : public SparcTargetMachine {
public:
- SparcV8TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ SparcV8TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
};
/// SparcV9TargetMachine - Sparc 64-bit target machine
///
class SparcV9TargetMachine : public SparcTargetMachine {
public:
- SparcV9TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ SparcV9TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
};
} // end namespace llvm
diff --git a/lib/Target/Sparc/TargetInfo/CMakeLists.txt b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
index 870b56a..a076023 100644
--- a/lib/Target/Sparc/TargetInfo/CMakeLists.txt
+++ b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMSparcInfo
SparcTargetInfo.cpp
)
-add_dependencies(LLVMSparcInfo SparcCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMSparcInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMSparcInfo SparcCommonTableGen)
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index 5c06f07..c9d5b7b 100644
--- a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "Sparc.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheSparcTarget;
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index f4bdbd8..7c09c0e 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -1,11 +1,12 @@
set(LLVM_TARGET_DEFINITIONS SystemZ.td)
-tablegen(SystemZGenRegisterInfo.inc -gen-register-info)
-tablegen(SystemZGenInstrInfo.inc -gen-instr-info)
-tablegen(SystemZGenAsmWriter.inc -gen-asm-writer)
-tablegen(SystemZGenDAGISel.inc -gen-dag-isel)
-tablegen(SystemZGenCallingConv.inc -gen-callingconv)
-tablegen(SystemZGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(SystemZGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(SystemZGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(SystemZGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(SystemZGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(SystemZGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(SystemZGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(SystemZCommonTableGen)
add_llvm_target(SystemZCodeGen
SystemZAsmPrinter.cpp
@@ -19,5 +20,17 @@ add_llvm_target(SystemZCodeGen
SystemZSelectionDAGInfo.cpp
)
+add_llvm_library_dependencies(LLVMSystemZCodeGen
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMSystemZDesc
+ LLVMSystemZInfo
+ LLVMTarget
+ )
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
index 2ac9016..822df09 100644
--- a/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/SystemZ/MCTargetDesc/CMakeLists.txt
@@ -3,5 +3,12 @@ add_llvm_library(LLVMSystemZDesc
SystemZMCAsmInfo.cpp
)
+add_llvm_library_dependencies(LLVMSystemZDesc
+ LLVMMC
+ LLVMSystemZInfo
+ )
+
+add_dependencies(LLVMSystemZDesc SystemZCommonTableGen)
+
# Hack: we need to include 'main' target directory to grab private headers
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 5a826a6..23fb1e0 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -13,10 +13,11 @@
#include "SystemZMCTargetDesc.h"
#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "SystemZGenInstrInfo.inc"
@@ -35,9 +36,10 @@ static MCInstrInfo *createSystemZMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeSystemZMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget,
- createSystemZMCInstrInfo);
+static MCRegisterInfo *createSystemZMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitSystemZMCRegisterInfo(X, 0);
+ return X;
}
static MCSubtargetInfo *createSystemZMCSubtargetInfo(StringRef TT,
@@ -48,11 +50,32 @@ static MCSubtargetInfo *createSystemZMCSubtargetInfo(StringRef TT,
return X;
}
-extern "C" void LLVMInitializeSystemZMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheSystemZTarget,
- createSystemZMCSubtargetInfo);
+static MCCodeGenInfo *createSystemZMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ if (RM == Reloc::Default)
+ RM = Reloc::Static;
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
}
-extern "C" void LLVMInitializeSystemZMCAsmInfo() {
+extern "C" void LLVMInitializeSystemZTargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfo<SystemZMCAsmInfo> X(TheSystemZTarget);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheSystemZTarget,
+ createSystemZMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget,
+ createSystemZMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheSystemZTarget,
+ createSystemZMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheSystemZTarget,
+ createSystemZMCSubtargetInfo);
}
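
Worth noting in the hunk above: resolving Reloc::Default now happens when the MCCodeGenInfo is built, rather than in the TargetMachine constructor (the SystemZTargetMachine hunk further down deletes the old setRelocationModel call). The resolution itself is just a guarded default, sketched here with a stand-in enum:

  #include <cstdio>

  // Stand-ins for LLVM's Reloc::Model values.
  enum RelocModel { Default, Static, PIC_, DynamicNoPIC };

  // Mirrors createSystemZMCCodeGenInfo's defaulting: the "no preference"
  // value is mapped to Static before the code-gen info is initialized.
  RelocModel resolveSystemZReloc(RelocModel RM) {
    if (RM == Default)
      RM = Static;
    return RM;
  }

  int main() {
    std::printf("resolved model: %d\n", resolveSystemZReloc(Default)); // 1 == Static
    return 0;
  }
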
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index fd4d8b7..43dcdfc 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -28,10 +28,8 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 871c297..48ca99f 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -81,6 +81,7 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
setSchedulingPreference(Sched::RegPressure);
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 99e2730..5f3dd80 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -21,8 +21,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_CTOR
#include "SystemZGenInstrInfo.inc"
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 11a39fc..580d65b 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -478,7 +478,7 @@ def MOV64rmm : RSYI<0x04EB,
"lmg\t{$from, $to, $dst}",
[]>;
-let isReMaterializable = 1, isAsCheapAsAMove = 1,
+let isReMaterializable = 1, neverHasSideEffects = 1, isAsCheapAsAMove = 1,
Constraints = "$src = $dst" in {
def MOV64Pr0_even : Pseudo<(outs GR64P:$dst), (ins GR64P:$src),
"lhi\t${dst:subreg_even}, 0",
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 59692e8..b1050d4 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm,
const SystemZInstrInfo &tii)
- : SystemZGenRegisterInfo(), TM(tm), TII(tii) {
+ : SystemZGenRegisterInfo(0), TM(tm), TII(tii) {
}
const unsigned*
@@ -126,11 +126,6 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(i+1).ChangeToImmediate(Offset);
}
-unsigned SystemZRegisterInfo::getRARegister() const {
- assert(0 && "What is the return address register");
- return 0;
-}
-
unsigned
SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
assert(0 && "What is the frame register");
@@ -146,13 +141,3 @@ unsigned SystemZRegisterInfo::getEHHandlerRegister() const {
assert(0 && "What is the exception handler register");
return 0;
}
-
-int SystemZRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- assert(0 && "What is the dwarf register number");
- return -1;
-}
-
-int SystemZRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
- assert(0 && "What is the dwarf register number");
- return -1;
-}
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 2e262e1..03935b2 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -48,15 +48,11 @@ struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
int SPAdj, RegScavenger *RS = NULL) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
-
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
};
} // end namespace llvm
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index b3ed066..0845510 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -15,7 +15,7 @@
#include "SystemZ.h"
#include "llvm/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 48298cc..e390f06 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -10,7 +10,7 @@
#include "SystemZTargetMachine.h"
#include "SystemZ.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
extern "C" void LLVMInitializeSystemZTarget() {
@@ -21,18 +21,15 @@ extern "C" void LLVMInitializeSystemZTarget() {
/// SystemZTargetMachine ctor - Create an ILP64 architecture model
///
SystemZTargetMachine::SystemZTargetMachine(const Target &T,
- const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : LLVMTargetMachine(T, TT, CPU, FS),
+ StringRef TT, StringRef CPU,
+ StringRef FS, Reloc::Model RM,
+ CodeModel::Model CM)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS),
DataLayout("E-p:64:64:64-i8:8:16-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
"-f64:64:64-f128:128:128-a0:16:16-n32:64"),
InstrInfo(*this), TLInfo(*this), TSInfo(*this),
FrameLowering(Subtarget) {
-
- if (getRelocationModel() == Reloc::Default)
- setRelocationModel(Reloc::Static);
}
bool SystemZTargetMachine::addInstSelector(PassManagerBase &PM,
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index e40b556..43dce4b 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -37,8 +37,9 @@ class SystemZTargetMachine : public LLVMTargetMachine {
SystemZSelectionDAGInfo TSInfo;
SystemZFrameLowering FrameLowering;
public:
- SystemZTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ SystemZTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const TargetFrameLowering *getFrameLowering() const {
return &FrameLowering;
diff --git a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
index 743d8d3..3180708 100644
--- a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
+++ b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMSystemZInfo
SystemZTargetInfo.cpp
)
-add_dependencies(LLVMSystemZInfo SystemZCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMSystemZInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMSystemZInfo SystemZCommonTableGen)
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
index 8272b11..da99282 100644
--- a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "SystemZ.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheSystemZTarget;
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index a42ce54..a2b83bc 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -17,6 +17,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/PassManager.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/LLVMContext.h"
#include <cstring>
@@ -39,6 +40,11 @@ void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
unwrap(PM)->add(new TargetData(*unwrap(TD)));
}
+void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
+ LLVMPassManagerRef PM) {
+ unwrap(PM)->add(new TargetLibraryInfo(*unwrap(TLI)));
+}
+
char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
std::string StringRep = unwrap(TD)->getStringRepresentation();
return strdup(StringRep.c_str());
@@ -87,13 +93,13 @@ unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
unsigned long long Offset) {
- const StructType *STy = unwrap<StructType>(StructTy);
+ StructType *STy = unwrap<StructType>(StructTy);
return unwrap(TD)->getStructLayout(STy)->getElementContainingOffset(Offset);
}
unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
unsigned Element) {
- const StructType *STy = unwrap<StructType>(StructTy);
+ StructType *STy = unwrap<StructType>(StructTy);
return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
}
diff --git a/lib/Target/TargetAsmInfo.cpp b/lib/Target/TargetAsmInfo.cpp
deleted file mode 100644
index a97b0e8..0000000
--- a/lib/Target/TargetAsmInfo.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===-- llvm/Target/TargetAsmInfo.cpp - Target Assembly Info --------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Target/TargetAsmInfo.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-using namespace llvm;
-
-TargetAsmInfo::TargetAsmInfo(const TargetMachine &TM) {
- TLOF = &TM.getTargetLowering()->getObjFileLowering();
- TFI = TM.getFrameLowering();
- TRI = TM.getRegisterInfo();
- TFI->getInitialFrameState(InitialFrameState);
-}
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
index 17d022a..bd6a6b6 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/Target/TargetData.cpp
@@ -41,7 +41,7 @@ char TargetData::ID = 0;
// Support for StructLayout
//===----------------------------------------------------------------------===//
-StructLayout::StructLayout(const StructType *ST, const TargetData &TD) {
+StructLayout::StructLayout(StructType *ST, const TargetData &TD) {
assert(!ST->isOpaque() && "Cannot get layout of opaque structs");
StructAlignment = 0;
StructSize = 0;
@@ -49,7 +49,7 @@ StructLayout::StructLayout(const StructType *ST, const TargetData &TD) {
// Loop over each of the elements, placing them in memory.
for (unsigned i = 0, e = NumElements; i != e; ++i) {
- const Type *Ty = ST->getElementType(i);
+ Type *Ty = ST->getElementType(i);
unsigned TyAlign = ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty);
// Add padding if necessary to align the data element properly.
@@ -139,6 +139,7 @@ void TargetData::init(StringRef Desc) {
PointerMemSize = 8;
PointerABIAlign = 8;
PointerPrefAlign = PointerABIAlign;
+ StackNaturalAlign = 0;
// Default alignments
setAlignment(INTEGER_ALIGN, 1, 1, 1); // i1
@@ -218,7 +219,12 @@ void TargetData::init(StringRef Desc) {
Token = Split.second;
} while (!Specifier.empty() || !Token.empty());
break;
-
+ case 'S': // Stack natural alignment.
+ StackNaturalAlign = getInt(Specifier.substr(1));
+ StackNaturalAlign /= 8;
+ // FIXME: Should we really be truncating these alignments and
+ // sizes silently?
+ break;
default:
break;
}
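
As the new case shows, the 'S' token carries the stack's natural alignment in bits and is stored in bytes, so a (hypothetical) layout fragment "S128" yields StackNaturalAlign == 16. A standalone re-derivation of just that field:

  #include <cstdio>
  #include <cstdlib>

  // Parse an "S<bits>" data-layout token into a byte alignment, following
  // the bits-to-bytes convention above. Returns 0 for any other token.
  unsigned parseStackNaturalAlign(const char *Spec) {
    if (Spec[0] != 'S')
      return 0;
    unsigned Bits = std::strtoul(Spec + 1, nullptr, 10);
    return Bits / 8;  // note: truncates, exactly as the FIXME worries
  }

  int main() {
    std::printf("S128 -> %u bytes\n", parseStackNaturalAlign("S128"));  // 16
    return 0;
  }
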
@@ -261,7 +267,7 @@ TargetData::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
/// preferred if ABIInfo = false) the target wants for the specified datatype.
unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
uint32_t BitWidth, bool ABIInfo,
- const Type *Ty) const {
+ Type *Ty) const {
// Check to see if we have an exact match and remember the best match we see.
int BestMatchIdx = -1;
int LargestInt = -1;
@@ -315,7 +321,7 @@ unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
namespace {
class StructLayoutMap {
- typedef DenseMap<const StructType*, StructLayout*> LayoutInfoTy;
+ typedef DenseMap<StructType*, StructLayout*> LayoutInfoTy;
LayoutInfoTy LayoutInfo;
public:
@@ -329,7 +335,7 @@ public:
}
}
- StructLayout *&operator[](const StructType *STy) {
+ StructLayout *&operator[](StructType *STy) {
return LayoutInfo[STy];
}
@@ -343,7 +349,7 @@ TargetData::~TargetData() {
delete static_cast<StructLayoutMap*>(LayoutMap);
}
-const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
+const StructLayout *TargetData::getStructLayout(StructType *Ty) const {
if (!LayoutMap)
LayoutMap = new StructLayoutMap();
@@ -372,7 +378,9 @@ std::string TargetData::getStringRepresentation() const {
OS << (LittleEndian ? "e" : "E")
<< "-p:" << PointerMemSize*8 << ':' << PointerABIAlign*8
- << ':' << PointerPrefAlign*8;
+ << ':' << PointerPrefAlign*8
+ << "-S" << StackNaturalAlign*8;
+
for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
const TargetAlignElem &AI = Alignments[i];
OS << '-' << (char)AI.AlignType << AI.TypeBitWidth << ':'
@@ -389,14 +397,14 @@ std::string TargetData::getStringRepresentation() const {
}
-uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
+uint64_t TargetData::getTypeSizeInBits(Type *Ty) const {
assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
switch (Ty->getTypeID()) {
case Type::LabelTyID:
case Type::PointerTyID:
return getPointerSizeInBits();
case Type::ArrayTyID: {
- const ArrayType *ATy = cast<ArrayType>(Ty);
+ ArrayType *ATy = cast<ArrayType>(Ty);
return getTypeAllocSizeInBits(ATy->getElementType())*ATy->getNumElements();
}
case Type::StructTyID:
@@ -435,7 +443,7 @@ uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
== false) for the requested type \a Ty.
*/
-unsigned TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
+unsigned TargetData::getAlignment(Type *Ty, bool abi_or_pref) const {
int AlignType = -1;
assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
@@ -485,7 +493,7 @@ unsigned TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
abi_or_pref, Ty);
}
-unsigned TargetData::getABITypeAlignment(const Type *Ty) const {
+unsigned TargetData::getABITypeAlignment(Type *Ty) const {
return getAlignment(Ty, true);
}
@@ -496,7 +504,7 @@ unsigned TargetData::getABIIntegerTypeAlignment(unsigned BitWidth) const {
}
-unsigned TargetData::getCallFrameTypeAlignment(const Type *Ty) const {
+unsigned TargetData::getCallFrameTypeAlignment(Type *Ty) const {
for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
if (Alignments[i].AlignType == STACK_ALIGN)
return Alignments[i].ABIAlign;
@@ -504,11 +512,11 @@ unsigned TargetData::getCallFrameTypeAlignment(const Type *Ty) const {
return getABITypeAlignment(Ty);
}
-unsigned TargetData::getPrefTypeAlignment(const Type *Ty) const {
+unsigned TargetData::getPrefTypeAlignment(Type *Ty) const {
return getAlignment(Ty, false);
}
-unsigned TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const {
+unsigned TargetData::getPreferredTypeAlignmentShift(Type *Ty) const {
unsigned Align = getPrefTypeAlignment(Ty);
assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
return Log2_32(Align);
@@ -521,16 +529,17 @@ IntegerType *TargetData::getIntPtrType(LLVMContext &C) const {
}
-uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
- unsigned NumIndices) const {
- const Type *Ty = ptrTy;
+uint64_t TargetData::getIndexedOffset(Type *ptrTy,
+ ArrayRef<Value *> Indices) const {
+ Type *Ty = ptrTy;
assert(Ty->isPointerTy() && "Illegal argument for getIndexedOffset()");
uint64_t Result = 0;
generic_gep_type_iterator<Value* const*>
- TI = gep_type_begin(ptrTy, Indices, Indices+NumIndices);
- for (unsigned CurIDX = 0; CurIDX != NumIndices; ++CurIDX, ++TI) {
- if (const StructType *STy = dyn_cast<StructType>(*TI)) {
+ TI = gep_type_begin(ptrTy, Indices);
+ for (unsigned CurIDX = 0, EndIDX = Indices.size(); CurIDX != EndIDX;
+ ++CurIDX, ++TI) {
+ if (StructType *STy = dyn_cast<StructType>(*TI)) {
assert(Indices[CurIDX]->getType() ==
Type::getInt32Ty(ptrTy->getContext()) &&
"Illegal struct idx");
@@ -561,7 +570,7 @@ uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
/// global. This includes an explicitly requested alignment (if the global
/// has one).
unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
- const Type *ElemType = GV->getType()->getElementType();
+ Type *ElemType = GV->getType()->getElementType();
unsigned Alignment = getPrefTypeAlignment(ElemType);
unsigned GVAlignment = GV->getAlignment();
if (GVAlignment >= Alignment) {
diff --git a/lib/Target/TargetFrameLowering.cpp b/lib/Target/TargetFrameLowering.cpp
index 19fd581..122f869 100644
--- a/lib/Target/TargetFrameLowering.cpp
+++ b/lib/Target/TargetFrameLowering.cpp
@@ -23,14 +23,6 @@ using namespace llvm;
TargetFrameLowering::~TargetFrameLowering() {
}
-/// getInitialFrameState - Returns a list of machine moves that are assumed
-/// on entry to a function.
-void
-TargetFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves)
- const {
- // Default is to do nothing.
-}
-
/// getFrameIndexOffset - Returns the displacement from the frame register to
/// the stack frame of the specified index. This is the default implementation
/// which is overridden for some targets.
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 703431b..56b7b69 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -35,34 +35,16 @@ using namespace llvm;
// Generic Code
//===----------------------------------------------------------------------===//
-TargetLoweringObjectFile::TargetLoweringObjectFile() :
- Ctx(0),
- TextSection(0),
- DataSection(0),
- BSSSection(0),
- ReadOnlySection(0),
- StaticCtorSection(0),
- StaticDtorSection(0),
- LSDASection(0),
- CompactUnwindSection(0),
- DwarfAbbrevSection(0),
- DwarfInfoSection(0),
- DwarfLineSection(0),
- DwarfFrameSection(0),
- DwarfPubNamesSection(0),
- DwarfPubTypesSection(0),
- DwarfDebugInlineSection(0),
- DwarfStrSection(0),
- DwarfLocSection(0),
- DwarfARangesSection(0),
- DwarfRangesSection(0),
- DwarfMacroInfoSection(0),
- TLSExtraDataSection(0),
- CommDirectiveSupportsAlignment(true),
- SupportsWeakOmittedEHFrame(true),
- IsFunctionEHFrameSymbolPrivate(true) {
+/// Initialize - this method must be called before any actual lowering is
+/// done. This specifies the current context for codegen, and gives the
+/// lowering implementations a chance to set up their default sections.
+void TargetLoweringObjectFile::Initialize(MCContext &ctx,
+ const TargetMachine &TM) {
+ Ctx = &ctx;
+ InitMCObjectFileInfo(TM.getTargetTriple(),
+ TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
}
-
+
TargetLoweringObjectFile::~TargetLoweringObjectFile() {
}
@@ -93,7 +75,7 @@ static bool isSuitableForBSS(const GlobalVariable *GV) {
/// known to have a type that is an array of 1/2/4 byte elements) ends with a
/// nul value and contains no other nuls in it.
static bool IsNullTerminatedString(const Constant *C) {
- const ArrayType *ATy = cast<ArrayType>(C->getType());
+ ArrayType *ATy = cast<ArrayType>(C->getType());
// First check: do we have a constant array of i8 terminated with zero
if (const ConstantArray *CVA = dyn_cast<ConstantArray>(C)) {
@@ -188,8 +170,8 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
// If initializer is a null-terminated string, put it in a "cstring"
// section of the right width.
- if (const ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
- if (const IntegerType *ITy =
+ if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+ if (IntegerType *ITy =
dyn_cast<IntegerType>(ATy->getElementType())) {
if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
ITy->getBitWidth() == 32) &&
@@ -341,20 +323,3 @@ getExprForDwarfReference(const MCSymbol *Sym, unsigned Encoding,
}
}
}
-
-unsigned TargetLoweringObjectFile::getPersonalityEncoding() const {
- return dwarf::DW_EH_PE_absptr;
-}
-
-unsigned TargetLoweringObjectFile::getLSDAEncoding() const {
- return dwarf::DW_EH_PE_absptr;
-}
-
-unsigned TargetLoweringObjectFile::getFDEEncoding(bool CFI) const {
- return dwarf::DW_EH_PE_absptr;
-}
-
-unsigned TargetLoweringObjectFile::getTTypeEncoding() const {
- return dwarf::DW_EH_PE_absptr;
-}
-
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 74a1f4e..fe8a7ce 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -40,8 +40,6 @@ namespace llvm {
bool JITExceptionHandling;
bool JITEmitDebugInfo;
bool JITEmitDebugInfoToDisk;
- Reloc::Model RelocationModel;
- CodeModel::Model CMModel;
bool GuaranteedTailCallOpt;
unsigned StackAlignmentOverride;
bool RealignStack;
@@ -49,6 +47,7 @@ namespace llvm {
bool StrongPHIElim;
bool HasDivModLibcall;
bool AsmVerbosityDefault(false);
+ bool EnableSegmentedStacks;
}
static cl::opt<bool, true>
@@ -143,38 +142,6 @@ EmitJitDebugInfoToDisk("jit-emit-debug-to-disk",
cl::location(JITEmitDebugInfoToDisk),
cl::init(false));
-static cl::opt<llvm::Reloc::Model, true>
-DefRelocationModel("relocation-model",
- cl::desc("Choose relocation model"),
- cl::location(RelocationModel),
- cl::init(Reloc::Default),
- cl::values(
- clEnumValN(Reloc::Default, "default",
- "Target default relocation model"),
- clEnumValN(Reloc::Static, "static",
- "Non-relocatable code"),
- clEnumValN(Reloc::PIC_, "pic",
- "Fully relocatable, position independent code"),
- clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
- "Relocatable external references, non-relocatable code"),
- clEnumValEnd));
-static cl::opt<llvm::CodeModel::Model, true>
-DefCodeModel("code-model",
- cl::desc("Choose code model"),
- cl::location(CMModel),
- cl::init(CodeModel::Default),
- cl::values(
- clEnumValN(CodeModel::Default, "default",
- "Target default code model"),
- clEnumValN(CodeModel::Small, "small",
- "Small code model"),
- clEnumValN(CodeModel::Kernel, "kernel",
- "Kernel code model"),
- clEnumValN(CodeModel::Medium, "medium",
- "Medium code model"),
- clEnumValN(CodeModel::Large, "large",
- "Large code model"),
- clEnumValEnd));
static cl::opt<bool, true>
EnableGuaranteedTailCallOpt("tailcallopt",
cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
@@ -212,13 +179,20 @@ static cl::opt<bool>
FunctionSections("ffunction-sections",
cl::desc("Emit functions into separate sections"),
cl::init(false));
+static cl::opt<bool, true>
+SegmentedStacks("segmented-stacks",
+ cl::desc("Use segmented stacks if possible."),
+ cl::location(EnableSegmentedStacks),
+ cl::init(false));
+
//---------------------------------------------------------------------------
// TargetMachine Class
//
TargetMachine::TargetMachine(const Target &T,
StringRef TT, StringRef CPU, StringRef FS)
- : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(0),
+ : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS),
+ CodeGenInfo(0), AsmInfo(0),
MCRelaxAll(false),
MCNoExecStack(false),
MCSaveTempLabels(false),
@@ -231,29 +205,24 @@ TargetMachine::TargetMachine(const Target &T,
}
TargetMachine::~TargetMachine() {
+ delete CodeGenInfo;
delete AsmInfo;
}
/// getRelocationModel - Returns the code generation relocation model. The
/// choices are static, PIC, dynamic-no-pic, and target default.
-Reloc::Model TargetMachine::getRelocationModel() {
- return RelocationModel;
-}
-
-/// setRelocationModel - Sets the code generation relocation model.
-void TargetMachine::setRelocationModel(Reloc::Model Model) {
- RelocationModel = Model;
+Reloc::Model TargetMachine::getRelocationModel() const {
+ if (!CodeGenInfo)
+ return Reloc::Default;
+ return CodeGenInfo->getRelocationModel();
}
/// getCodeModel - Returns the code model. The choices are small, kernel,
/// medium, large, and target default.
-CodeModel::Model TargetMachine::getCodeModel() {
- return CMModel;
-}
-
-/// setCodeModel - Sets the code model.
-void TargetMachine::setCodeModel(CodeModel::Model Model) {
- CMModel = Model;
+CodeModel::Model TargetMachine::getCodeModel() const {
+ if (!CodeGenInfo)
+ return CodeModel::Default;
+ return CodeGenInfo->getCodeModel();
}
bool TargetMachine::getAsmVerbosityDefault() {
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index 90a8f8d..67239b8 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -98,44 +98,25 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
}
const TargetRegisterClass *
-llvm::getCommonSubClass(const TargetRegisterClass *A,
- const TargetRegisterClass *B) {
- // First take care of the trivial cases
+TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B) const {
+ // First take care of the trivial cases.
if (A == B)
return A;
if (!A || !B)
return 0;
- // If B is a subclass of A, it will be handled in the loop below
- if (B->hasSubClass(A))
- return A;
+ // Register classes are ordered topologically, so the largest common
+ // sub-class is the common sub-class with the smallest ID.
+ const unsigned *SubA = A->getSubClassMask();
+ const unsigned *SubB = B->getSubClassMask();
- const TargetRegisterClass *Best = 0;
- for (TargetRegisterClass::sc_iterator I = A->subclasses_begin();
- const TargetRegisterClass *X = *I; ++I) {
- if (X == B)
- return B; // B is a subclass of A
-
- // X must be a common subclass of A and B
- if (!B->hasSubClass(X))
- continue;
-
- // A superclass is definitely better.
- if (!Best || Best->hasSuperClass(X)) {
- Best = X;
- continue;
- }
-
- // A subclass is definitely worse
- if (Best->hasSubClass(X))
- continue;
-
- // Best and *I have no super/sub class relation - pick the larger class, or
- // the smaller spill size.
- int nb = std::distance(Best->begin(), Best->end());
- int ni = std::distance(X->begin(), X->end());
- if (ni>nb || (ni==nb && X->getSize() < Best->getSize()))
- Best = X;
- }
- return Best;
+ // We could start the search from max(A.ID, B.ID), but we are only going to
+ // execute 2-3 iterations anyway.
+ for (unsigned Base = 0, BaseE = getNumRegClasses(); Base < BaseE; Base += 32)
+ if (unsigned Common = *SubA++ & *SubB++)
+ return getRegClass(Base + CountTrailingZeros_32(Common));
+
+ // No common sub-class exists.
+ return NULL;
}
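
The replacement exploits the topological ordering noted in the comment: intersect the two sub-class bit masks one 32-bit word at a time, and the lowest set bit of the first non-zero intersection is the smallest class ID, i.e. the largest common sub-class. The same scan over plain arrays (illustrative names, not the LLVM API; __builtin_ctz assumes GCC/Clang):

  #include <cstdint>
  #include <cstdio>

  // Return the smallest index set in both masks, or -1 if they are disjoint.
  // Each 32-bit word covers 32 consecutive IDs, mirroring the loop above.
  int firstCommonID(const uint32_t *A, const uint32_t *B, unsigned NumIDs) {
    for (unsigned Base = 0; Base < NumIDs; Base += 32, ++A, ++B)
      if (uint32_t Common = *A & *B)
        return static_cast<int>(Base) + __builtin_ctz(Common);
    return -1;
  }

  int main() {
    uint32_t A[] = {0xCu}, B[] = {0x6u};  // bits {2,3} vs. {1,2}: share bit 2
    std::printf("common ID: %d\n", firstCommonID(A, B, 32));  // prints 2
    return 0;
  }
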
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
index 40dbdd7..94aca7a 100644
--- a/lib/Target/X86/AsmParser/CMakeLists.txt
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -4,4 +4,13 @@ add_llvm_library(LLVMX86AsmParser
X86AsmLexer.cpp
X86AsmParser.cpp
)
-add_dependencies(LLVMX86AsmParser X86CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMX86AsmParser
+ LLVMMC
+ LLVMMCParser
+ LLVMSupport
+ LLVMX86Desc
+ LLVMX86Info
+ )
+
+add_dependencies(LLVMX86AsmParser X86CommonTableGen)
diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
index ec73087..1eaccff 100644
--- a/lib/Target/X86/AsmParser/X86AsmLexer.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
@@ -7,20 +7,20 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Target/TargetAsmLexer.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "X86.h"
+#include "llvm/MC/MCTargetAsmLexer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
using namespace llvm;
namespace {
-class X86AsmLexer : public TargetAsmLexer {
+class X86AsmLexer : public MCTargetAsmLexer {
const MCAsmInfo &AsmInfo;
bool tentativeIsValid;
@@ -60,8 +60,8 @@ protected:
}
}
public:
- X86AsmLexer(const Target &T, const MCAsmInfo &MAI)
- : TargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) {
+ X86AsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI)
+ : MCTargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) {
}
};
@@ -160,6 +160,6 @@ AsmToken X86AsmLexer::LexTokenIntel() {
}
extern "C" void LLVMInitializeX86AsmLexer() {
- RegisterAsmLexer<X86AsmLexer> X(TheX86_32Target);
- RegisterAsmLexer<X86AsmLexer> Y(TheX86_64Target);
+ RegisterMCAsmLexer<X86AsmLexer> X(TheX86_32Target);
+ RegisterMCAsmLexer<X86AsmLexer> Y(TheX86_64Target);
}
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d45dd35..cb4f15f 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -7,14 +7,12 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmParser.h"
-#include "X86.h"
-#include "X86Subtarget.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmParser.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
@@ -26,6 +24,7 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -33,7 +32,7 @@ using namespace llvm;
namespace {
struct X86Operand;
-class X86ATTAsmParser : public TargetAsmParser {
+class X86ATTAsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
@@ -48,6 +47,7 @@ private:
X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
+ bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
bool MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
@@ -65,6 +65,10 @@ private:
// FIXME: Can tablegen auto-generate this?
return (STI.getFeatureBits() & X86::Mode64Bit) != 0;
}
+ void SwitchMode() {
+ unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(X86::Mode64Bit));
+ setAvailableFeatures(FB);
+ }
/// @name Auto-generated Matcher Functions
/// {
@@ -76,7 +80,7 @@ private:
public:
X86ATTAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
- : TargetAsmParser(), STI(sti), Parser(parser) {
+ : MCTargetAsmParser(), STI(sti), Parser(parser) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -223,6 +227,21 @@ struct X86Operand : public MCParsedAsmOperand {
(0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)||
(0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL));
}
+ bool isImmZExtu32u8() const {
+ if (!isImm())
+ return false;
+
+ // If this isn't a constant expr, just assume it fits and let relaxation
+ // handle it.
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return true;
+
+ // Otherwise, check the value is in a range that makes sense for this
+ // extension.
+ uint64_t Value = CE->getValue();
+ return (Value <= 0x00000000000000FFULL);
+ }
bool isImmSExti64i8() const {
if (!isImm())
return false;
@@ -382,19 +401,25 @@ bool X86ATTAsmParser::ParseRegister(unsigned &RegNo,
if (Tok.isNot(AsmToken::Identifier))
return Error(Tok.getLoc(), "invalid register name");
- // FIXME: Validate register for the current architecture; we have to do
- // validation later, so maybe there is no need for this here.
RegNo = MatchRegisterName(Tok.getString());
// If the match failed, try the register name as lowercase.
if (RegNo == 0)
RegNo = MatchRegisterName(LowercaseString(Tok.getString()));
- // FIXME: This should be done using Requires<In32BitMode> and
- // Requires<In64BitMode> so "eiz" usage in 64-bit instructions
- // can be also checked.
- if (RegNo == X86::RIZ && !is64BitMode())
- return Error(Tok.getLoc(), "riz register in 64-bit mode only");
+ if (!is64BitMode()) {
+ // FIXME: This should be done using Requires<In32BitMode> and
+ // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can also be
+ // checked.
+ // FIXME: Check AH, CH, DH, BH cannot be used in an instruction requiring a
+ // REX prefix.
+ if (RegNo == X86::RIZ ||
+ X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
+ X86II::isX86_64NonExtLowByteReg(RegNo) ||
+ X86II::isX86_64ExtendedReg(RegNo))
+ return Error(Tok.getLoc(), "register %"
+ + Tok.getString() + " is only available in 64-bit mode");
+ }
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
@@ -472,7 +497,7 @@ X86Operand *X86ATTAsmParser::ParseOperand() {
SMLoc Start, End;
if (ParseRegister(RegNo, Start, End)) return 0;
if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
- Error(Start, "eiz and riz can only be used as index registers");
+ Error(Start, "%eiz and %riz can only be used as index registers");
return 0;
}
@@ -956,6 +981,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
// First, try a direct match.
switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo)) {
+ default: break;
case Match_Success:
Out.EmitInstruction(Inst);
return false;
@@ -994,7 +1020,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
// Check for the various suffix matches.
Tmp[Base.size()] = Suffixes[0];
unsigned ErrorInfoIgnore;
- MatchResultTy Match1, Match2, Match3, Match4;
+ unsigned Match1, Match2, Match3, Match4;
Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
Tmp[Base.size()] = Suffixes[1];
@@ -1096,6 +1122,8 @@ bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getIdentifier();
if (IDVal == ".word")
return ParseDirectiveWord(2, DirectiveID.getLoc());
+ else if (IDVal.startswith(".code"))
+ return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
return true;
}
@@ -1124,15 +1152,35 @@ bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
return false;
}
+/// ParseDirectiveCode
+/// ::= .code32 | .code64
+bool X86ATTAsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
+ if (IDVal == ".code32") {
+ Parser.Lex();
+ if (is64BitMode()) {
+ SwitchMode();
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ }
+ } else if (IDVal == ".code64") {
+ Parser.Lex();
+ if (!is64BitMode()) {
+ SwitchMode();
+ getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+ }
+ } else {
+ return Error(L, "unexpected directive " + IDVal);
+ }
+ return false;
+}
extern "C" void LLVMInitializeX86AsmLexer();
// Force static initialization.
extern "C" void LLVMInitializeX86AsmParser() {
- RegisterAsmParser<X86ATTAsmParser> X(TheX86_32Target);
- RegisterAsmParser<X86ATTAsmParser> Y(TheX86_64Target);
+ RegisterMCAsmParser<X86ATTAsmParser> X(TheX86_32Target);
+ RegisterMCAsmParser<X86ATTAsmParser> Y(TheX86_64Target);
LLVMInitializeX86AsmLexer();
}
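
SwitchMode() above flips the Mode64Bit feature bit and recomputes the available-feature mask, which is what lets one parser honor .code32/.code64 mid-file. The flip-and-recompute idea in miniature (bit position and state handling are illustrative):

  #include <cstdint>
  #include <cstdio>

  enum : uint64_t { Mode64Bit = 1ull << 0 };  // illustrative bit position

  struct ParserState {
    uint64_t FeatureBits = Mode64Bit;         // start in 64-bit mode
    bool is64BitMode() const { return (FeatureBits & Mode64Bit) != 0; }
    void SwitchMode() { FeatureBits ^= Mode64Bit; }  // toggle, then re-derive
  };

  int main() {
    ParserState P;
    // ".code32" encountered while in 64-bit mode:
    if (P.is64BitMode())
      P.SwitchMode();
    std::printf("64-bit mode now: %s\n", P.is64BitMode() ? "yes" : "no");
    return 0;
  }
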
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index b112f9f..351e767 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -1,20 +1,19 @@
set(LLVM_TARGET_DEFINITIONS X86.td)
-tablegen(X86GenRegisterInfo.inc -gen-register-info)
-tablegen(X86GenDisassemblerTables.inc -gen-disassembler)
-tablegen(X86GenInstrInfo.inc -gen-instr-info)
-tablegen(X86GenAsmWriter.inc -gen-asm-writer)
-tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
-tablegen(X86GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(X86GenDAGISel.inc -gen-dag-isel)
-tablegen(X86GenFastISel.inc -gen-fast-isel)
-tablegen(X86GenCallingConv.inc -gen-callingconv)
-tablegen(X86GenSubtargetInfo.inc -gen-subtarget)
-tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info)
+llvm_tablegen(X86GenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(X86GenDisassemblerTables.inc -gen-disassembler)
+llvm_tablegen(X86GenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(X86GenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+llvm_tablegen(X86GenAsmMatcher.inc -gen-asm-matcher)
+llvm_tablegen(X86GenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(X86GenFastISel.inc -gen-fast-isel)
+llvm_tablegen(X86GenCallingConv.inc -gen-callingconv)
+llvm_tablegen(X86GenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(X86GenEDInfo.inc -gen-enhanced-disassembly-info)
+add_public_tablegen_target(X86CommonTableGen)
set(sources
- SSEDomainFix.cpp
- X86AsmBackend.cpp
X86AsmPrinter.cpp
X86COFFMachineModuleInfo.cpp
X86CodeEmitter.cpp
@@ -26,14 +25,13 @@ set(sources
X86ISelLowering.cpp
X86InstrInfo.cpp
X86JITInfo.cpp
- X86MachObjectWriter.cpp
- X86MCCodeEmitter.cpp
X86MCInstLower.cpp
X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp
X86Subtarget.cpp
X86TargetMachine.cpp
X86TargetObjectFile.cpp
+ X86VZeroUpper.cpp
)
if( CMAKE_CL_64 )
@@ -53,6 +51,19 @@ endif()
add_llvm_target(X86CodeGen ${sources})
+add_llvm_library_dependencies(LLVMX86CodeGen
+ LLVMAnalysis
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ LLVMX86AsmPrinter
+ LLVMX86Desc
+ )
+
add_subdirectory(AsmParser)
add_subdirectory(Disassembler)
add_subdirectory(InstPrinter)
diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt
index 972a0d9..4f570d5 100644
--- a/lib/Target/X86/Disassembler/CMakeLists.txt
+++ b/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -4,6 +4,13 @@ add_llvm_library(LLVMX86Disassembler
X86Disassembler.cpp
X86DisassemblerDecoder.c
)
+
+add_llvm_library_dependencies(LLVMX86Disassembler
+ LLVMMC
+ LLVMSupport
+ LLVMX86Info
+ )
+
# workaround for hanging compilation on MSVC9 and 10
if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
set_property(
@@ -11,4 +18,5 @@ set_property(
PROPERTY COMPILE_FLAGS "/Od"
)
endif()
-add_dependencies(LLVMX86Disassembler X86CodeGenTable_gen)
+
+add_dependencies(LLVMX86Disassembler X86CommonTableGen)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4a0d2ec..3aacb20 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -21,13 +21,16 @@
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#define GET_REGINFO_ENUM
#include "X86GenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "X86GenInstrInfo.inc"
#include "X86GenEDInfo.inc"
using namespace llvm;
@@ -64,8 +67,8 @@ extern Target TheX86_32Target, TheX86_64Target;
static bool translateInstruction(MCInst &target,
InternalInstruction &source);
-X86GenericDisassembler::X86GenericDisassembler(DisassemblerMode mode) :
- MCDisassembler(),
+X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode) :
+ MCDisassembler(STI),
fMode(mode) {
}
@@ -106,28 +109,34 @@ static void logger(void* arg, const char* log) {
// Public interface for the disassembler
//
-bool X86GenericDisassembler::getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
- raw_ostream &vStream) const {
+MCDisassembler::DecodeStatus
+X86GenericDisassembler::getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const {
InternalInstruction internalInstr;
+
+ dlog_t loggerFn = logger;
+ if (&vStream == &nulls())
+ loggerFn = 0; // Disable logging completely if it's going to nulls().
int ret = decodeInstruction(&internalInstr,
regionReader,
(void*)&region,
- logger,
+ loggerFn,
(void*)&vStream,
address,
fMode);
if (ret) {
size = internalInstr.readerCursor - address;
- return false;
+ return Fail;
}
else {
size = internalInstr.length;
- return !translateInstruction(instr, internalInstr);
+ return (!translateInstruction(instr, internalInstr)) ? Success : Fail;
}
}
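Callers of the new interface branch on an explicit decode status rather than a bool; a minimal consumer sketch under the same Success/Fail convention (the decoder below is a stub stand-in, not the real class):

    #include <cstdint>
    #include <cstdio>

    enum DecodeStatus { Fail = 0, Success = 1 };

    // Stub stand-in for X86GenericDisassembler::getInstruction().
    static DecodeStatus getInstructionStub(uint64_t &Size) {
      Size = 1;                          // pretend one byte was consumed
      return Success;
    }

    void disassembleOne(uint64_t Addr) {
      uint64_t Size = 0;
      if (getInstructionStub(Size) == Success)
        std::printf("decoded %llu byte(s) at %#llx\n",
                    (unsigned long long)Size, (unsigned long long)Addr);
      else
        // On Fail, Size still reports how far the reader cursor advanced.
        std::printf("decode failure at %#llx\n", (unsigned long long)Addr);
    }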
@@ -183,8 +192,46 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
break;
}
}
+ // By default sign-extend all X86 immediates based on their encoding.
+ else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
+ type == TYPE_IMM64) {
+ uint32_t Opcode = mcInst.getOpcode();
+ switch (operand.encoding) {
+ default:
+ break;
+ case ENCODING_IB:
+ // Special case those X86 instructions that use the imm8 as a set of
+ // bits, bit count, etc. and are not sign-extended.
+ if (Opcode != X86::BLENDPSrri && Opcode != X86::BLENDPDrri &&
+ Opcode != X86::PBLENDWrri && Opcode != X86::MPSADBWrri &&
+ Opcode != X86::DPPSrri && Opcode != X86::DPPDrri &&
+ Opcode != X86::INSERTPSrr && Opcode != X86::VBLENDPSYrri &&
+ Opcode != X86::VBLENDPSYrmi && Opcode != X86::VBLENDPDYrri &&
+ Opcode != X86::VBLENDPDYrmi && Opcode != X86::VPBLENDWrri &&
+ Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
+ Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
+ Opcode != X86::VINSERTPSrr)
+ type = TYPE_MOFFS8;
+ break;
+ case ENCODING_IW:
+ type = TYPE_MOFFS16;
+ break;
+ case ENCODING_ID:
+ type = TYPE_MOFFS32;
+ break;
+ case ENCODING_IO:
+ type = TYPE_MOFFS64;
+ break;
+ }
+ }
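The net effect of retyping IB/IW/ID/IO immediates as TYPE_MOFFS8/16/32/64 is plain sign-extension from the encoded width; a self-contained sketch of that widening:

    // Sketch: sign-extend an immediate from its encoded width.
    #include <cstdint>

    int64_t signExtend(uint64_t Imm, unsigned EncodedBytes) {
      unsigned Bits = EncodedBytes * 8;
      if (Bits >= 64)
        return (int64_t)Imm;
      uint64_t SignBit = 1ULL << (Bits - 1);
      return (int64_t)((Imm ^ SignBit) - SignBit); // standard XOR/subtract trick
    }

    // signExtend(0x80, 1) == -128, matching the `if (immediate & 0x80)`
    // widening in the TYPE_MOFFS8/TYPE_REL8 case below.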
switch (type) {
+ case TYPE_XMM128:
+ mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
+ return;
+ case TYPE_XMM256:
+ mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
+ return;
case TYPE_MOFFS8:
case TYPE_REL8:
if(immediate & 0x80)
@@ -543,12 +590,12 @@ static bool translateInstruction(MCInst &mcInst,
return false;
}
-static MCDisassembler *createX86_32Disassembler(const Target &T) {
- return new X86Disassembler::X86_32Disassembler;
+static MCDisassembler *createX86_32Disassembler(const Target &T, const MCSubtargetInfo &STI) {
+ return new X86Disassembler::X86_32Disassembler(STI);
}
-static MCDisassembler *createX86_64Disassembler(const Target &T) {
- return new X86Disassembler::X86_64Disassembler;
+static MCDisassembler *createX86_64Disassembler(const Target &T, const MCSubtargetInfo &STI) {
+ return new X86Disassembler::X86_64Disassembler(STI);
}
extern "C" void LLVMInitializeX86Disassembler() {
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 550cf9d..6ac9a0f 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -92,6 +92,7 @@ struct InternalInstruction;
namespace llvm {
class MCInst;
+class MCSubtargetInfo;
class MemoryObject;
class raw_ostream;
@@ -107,16 +108,17 @@ protected:
/// Constructor - Initializes the disassembler.
///
/// @param mode - The X86 architecture mode to decode for.
- X86GenericDisassembler(DisassemblerMode mode);
+ X86GenericDisassembler(const MCSubtargetInfo &STI, DisassemblerMode mode);
public:
~X86GenericDisassembler();
/// getInstruction - See MCDisassembler.
- bool getInstruction(MCInst &instr,
- uint64_t &size,
- const MemoryObject &region,
- uint64_t address,
- raw_ostream &vStream) const;
+ DecodeStatus getInstruction(MCInst &instr,
+ uint64_t &size,
+ const MemoryObject &region,
+ uint64_t address,
+ raw_ostream &vStream,
+ raw_ostream &cStream) const;
/// getEDInfo - See MCDisassembler.
EDInstInfo *getEDInfo() const;
@@ -127,24 +129,24 @@ private:
/// X86_16Disassembler - 16-bit X86 disassembler.
class X86_16Disassembler : public X86GenericDisassembler {
public:
- X86_16Disassembler() :
- X86GenericDisassembler(MODE_16BIT) {
+ X86_16Disassembler(const MCSubtargetInfo &STI) :
+ X86GenericDisassembler(STI, MODE_16BIT) {
}
};
/// X86_32Disassembler - 32-bit X86 disassembler.
class X86_32Disassembler : public X86GenericDisassembler {
public:
- X86_32Disassembler() :
- X86GenericDisassembler(MODE_32BIT) {
+ X86_32Disassembler(const MCSubtargetInfo &STI) :
+ X86GenericDisassembler(STI, MODE_32BIT) {
}
};
/// X86_64Disassembler - 64-bit X86 disassembler.
class X86_64Disassembler : public X86GenericDisassembler {
public:
- X86_64Disassembler() :
- X86GenericDisassembler(MODE_64BIT) {
+ X86_64Disassembler(const MCSubtargetInfo &STI) :
+ X86GenericDisassembler(STI, MODE_64BIT) {
}
};
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index de1610b..f9b0fe5 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -58,8 +58,8 @@ static InstructionContext contextForAttrs(uint8_t attrMask) {
* @return - TRUE if the ModR/M byte is required, FALSE otherwise.
*/
static int modRMRequired(OpcodeType type,
- InstructionContext insnContext,
- uint8_t opcode) {
+ InstructionContext insnContext,
+ uint8_t opcode) {
const struct ContextDecision* decision = 0;
switch (type) {
@@ -391,7 +391,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
return -1;
}
- if (insn->mode == MODE_64BIT || byte1 & 0x8) {
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
insn->vexSize = 3;
insn->necessaryPrefixLocation = insn->readerCursor - 1;
}
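The tightened test reflects how C4/C5 are overloaded: outside 64-bit mode they are the LES/LDS opcodes, whose ModR/M byte can never have mod == 0b11, so a following byte with both top bits set can only be a VEX payload. A hedged sketch of the classifier:

    // Sketch: decide whether a C4/C5 byte starts a VEX prefix.
    #include <cstdint>

    bool isVEXPrefix(uint8_t Opcode, uint8_t NextByte, bool Is64BitMode) {
      if (Opcode != 0xc4 && Opcode != 0xc5)
        return false;
      // 64-bit mode: LES/LDS are invalid, so C4/C5 are always VEX.
      // 16/32-bit mode: require the top two bits of the next byte, which
      // would be an illegal LES/LDS ModR/M -- i.e. (byte1 & 0xc0) == 0xc0.
      return Is64BitMode || (NextByte & 0xc0) == 0xc0;
    }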
@@ -406,12 +406,14 @@ static int readPrefixes(struct InternalInstruction* insn) {
consumeByte(insn, &insn->vexPrefix[2]);
/* We simulate the REX prefix for simplicity's sake */
-
- insn->rexPrefix = 0x40
- | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
- | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
- | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
- | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
+ | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
+ | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
+ | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+ }
switch (ppFromVEX3of3(insn->vexPrefix[2]))
{
@@ -433,7 +435,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
return -1;
}
- if (insn->mode == MODE_64BIT || byte1 & 0x8) {
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
insn->vexSize = 2;
}
else {
@@ -444,8 +446,10 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->vexPrefix[0] = byte;
consumeByte(insn, &insn->vexPrefix[1]);
- insn->rexPrefix = 0x40
- | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40
+ | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+ }
switch (ppFromVEX2of2(insn->vexPrefix[1]))
{
@@ -700,34 +704,6 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
}
/*
- * is64BitEquivalent - Determines whether two instruction names refer to
- * equivalent instructions but one is 64-bit whereas the other is not.
- *
- * @param orig - The instruction that is not 64-bit
- * @param equiv - The instruction that is 64-bit
- */
-static BOOL is64BitEquivalent(const char* orig, const char* equiv) {
- off_t i;
-
- for (i = 0;; i++) {
- if (orig[i] == '\0' && equiv[i] == '\0')
- return TRUE;
- if (orig[i] == '\0' || equiv[i] == '\0')
- return FALSE;
- if (orig[i] != equiv[i]) {
- if ((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q')
- continue;
- if ((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6')
- continue;
- if ((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4')
- continue;
- return FALSE;
- }
- }
-}
-
-
-/*
* getID - Determines the ID of an instruction, consuming the ModR/M byte as
* appropriate for extended and escape opcodes. Determines the attributes and
* context for the instruction before doing so.
@@ -763,8 +739,6 @@ static int getID(struct InternalInstruction* insn) {
break;
}
- if (wFromVEX3of3(insn->vexPrefix[2]))
- attrMask |= ATTR_REXW;
if (lFromVEX3of3(insn->vexPrefix[2]))
attrMask |= ATTR_VEXL;
}
@@ -789,63 +763,55 @@ static int getID(struct InternalInstruction* insn) {
}
}
else {
- if (insn->rexPrefix & 0x08)
- attrMask |= ATTR_REXW;
-
if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
attrMask |= ATTR_OPSIZE;
else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
attrMask |= ATTR_XS;
else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
attrMask |= ATTR_XD;
-
}
+ if (insn->rexPrefix & 0x08)
+ attrMask |= ATTR_REXW;
+
if (getIDWithAttrMask(&instructionID, insn, attrMask))
return -1;
/* The following clauses compensate for limitations of the tables. */
- if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) {
+ if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW)) {
/*
- * Although for SSE instructions it is usually necessary to treat REX.W+F2
- * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is
- * an occasional instruction where F2 is incidental and REX.W is the more
- * significant. If the decoded instruction is 32-bit and adding REX.W
- * instead of F2 changes a 32 to a 64, we adopt the new encoding.
+     * Some VEX instructions ignore the L-bit but use the W-bit. Normally the
+     * L-bit has precedence, since there are no entries with both the L-bit
+     * and the W-bit in the tables. So if the L-bit isn't significant, we
+     * should use the W-bit instead.
*/
-
+
const struct InstructionSpecifier *spec;
- uint16_t instructionIDWithREXw;
- const struct InstructionSpecifier *specWithREXw;
-
+ uint16_t instructionIDWithWBit;
+ const struct InstructionSpecifier *specWithWBit;
+
spec = specifierForUID(instructionID);
-
- if (getIDWithAttrMask(&instructionIDWithREXw,
+
+ if (getIDWithAttrMask(&instructionIDWithWBit,
insn,
- attrMask & (~ATTR_XD))) {
- /*
- * Decoding with REX.w would yield nothing; give up and return original
- * decode.
- */
-
+ (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) {
insn->instructionID = instructionID;
insn->spec = spec;
return 0;
}
-
- specWithREXw = specifierForUID(instructionIDWithREXw);
-
- if (is64BitEquivalent(spec->name, specWithREXw->name)) {
- insn->instructionID = instructionIDWithREXw;
- insn->spec = specWithREXw;
+
+ specWithWBit = specifierForUID(instructionIDWithWBit);
+
+ if (instructionID != instructionIDWithWBit) {
+ insn->instructionID = instructionIDWithWBit;
+ insn->spec = specWithWBit;
} else {
insn->instructionID = instructionID;
insn->spec = spec;
}
return 0;
}
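Put differently, the decoder retries the table lookup with the L-bit dropped and the W-bit added, and prefers that answer whenever it names a different instruction. A hedged sketch of the fallback pattern, with the table query reduced to a stub and illustrative bit assignments:

    #include <cstdint>

    const unsigned ATTR_VEXL = 1u << 0;  // illustrative values only
    const unsigned ATTR_REXW = 1u << 1;

    // Stub stand-in for getIDWithAttrMask(); returns nonzero on failure.
    static int lookupID(uint16_t *ID, unsigned AttrMask) {
      *ID = (uint16_t)AttrMask;          // fake: distinct masks, distinct IDs
      return 0;
    }

    uint16_t resolveVEXLvsW(unsigned AttrMask, uint16_t IDWithL) {
      uint16_t IDWithW;
      if (lookupID(&IDWithW, (AttrMask & ~ATTR_VEXL) | ATTR_REXW))
        return IDWithL;                  // W-bit lookup failed: keep L-bit ID
      return IDWithW != IDWithL ? IDWithW : IDWithL;
    }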
-
+
if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
/*
* The instruction tables make no distinction between instructions that
@@ -885,6 +851,43 @@ static int getID(struct InternalInstruction* insn) {
}
return 0;
}
+
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+ insn->rexPrefix & 0x01) {
+ /*
+   * NOOP shouldn't decode as NOOP if REX.B is set. Instead
+   * it should decode as XCHG %r8d, %eax.
+ */
+
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithNewOpcode;
+ const struct InstructionSpecifier *specWithNewOpcode;
+
+ spec = specifierForUID(instructionID);
+
+ /* Borrow opcode from one of the other XCHGar opcodes */
+ insn->opcode = 0x91;
+
+ if (getIDWithAttrMask(&instructionIDWithNewOpcode,
+ insn,
+ attrMask)) {
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
+
+ /* Change back */
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionIDWithNewOpcode;
+ insn->spec = specWithNewOpcode;
+
+ return 0;
+ }
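Concretely, the byte pair 41 90 (REX.B + 90) must decode as xchg %r8d, %eax rather than as nop, which is why the code momentarily borrows opcode 0x91, one of the other xchg-with-eAX encodings, for the table lookup. A small sketch of the byte-level fact:

    // Sketch: 0x90 is NOP only when REX.B is clear.
    #include <cstdint>
    #include <cstdio>

    void classifyOpcode90(uint8_t RexPrefix) {
      if (RexPrefix & 0x01)              // REX.B redirects the opcode register
        std::printf("41 90 -> xchg %%r8d, %%eax\n");
      else
        std::printf("90    -> nop (xchg %%eax, %%eax)\n");
    }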
insn->instructionID = instructionID;
insn->spec = specifierForUID(insn->instructionID);
@@ -1434,11 +1437,10 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
}
/*
- * readVVVV - Consumes an immediate operand from an instruction, given the
- * desired operand size.
+ * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
*
* @param insn - The instruction whose operand is to be read.
- * @return - 0 if the immediate was successfully consumed; nonzero
+ * @return - 0 if the vvvv was successfully consumed; nonzero
* otherwise.
*/
static int readVVVV(struct InternalInstruction* insn) {
@@ -1451,6 +1453,9 @@ static int readVVVV(struct InternalInstruction* insn) {
else
return -1;
+ if (insn->mode != MODE_64BIT)
+ insn->vvvv &= 0x7;
+
return 0;
}
@@ -1463,8 +1468,14 @@ static int readVVVV(struct InternalInstruction* insn) {
*/
static int readOperands(struct InternalInstruction* insn) {
int index;
+ int hasVVVV, needVVVV;
dbgprintf(insn, "readOperands()");
+
+  /* If a non-zero vvvv is specified, we need to make sure one of the
+     operands uses it. */
+ hasVVVV = !readVVVV(insn);
+ needVVVV = hasVVVV && (insn->vvvv != 0);
for (index = 0; index < X86_MAX_OPERANDS; ++index) {
switch (insn->spec->operands[index].encoding) {
@@ -1537,7 +1548,8 @@ static int readOperands(struct InternalInstruction* insn) {
return -1;
break;
case ENCODING_VVVV:
- if (readVVVV(insn))
+ needVVVV = 0; /* Mark that we have found a VVVV operand. */
+ if (!hasVVVV)
return -1;
if (fixupReg(insn, &insn->spec->operands[index]))
return -1;
@@ -1549,6 +1561,9 @@ static int readOperands(struct InternalInstruction* insn) {
return -1;
}
}
+
+  /* If we didn't find an ENCODING_VVVV operand but a non-zero vvvv was
+     present, fail. */
+ if (needVVVV) return -1;
return 0;
}
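The hasVVVV/needVVVV contract above is: consume vvvv once up front, then fail the decode if a non-zero vvvv was encoded but no operand ever claimed it. A compact sketch (operand encodings are illustrative):

    // Sketch: reject an unclaimed non-zero vvvv field.
    #include <vector>

    enum Encoding { ENCODING_NONE, ENCODING_VVVV };

    int readOperandsSketch(bool HasVVVV, unsigned VVVV,
                           const std::vector<Encoding> &Ops) {
      bool NeedVVVV = HasVVVV && VVVV != 0;
      for (Encoding E : Ops) {
        if (E == ENCODING_VVVV) {
          if (!HasVVVV)
            return -1;                   // operand wants vvvv, none present
          NeedVVVV = false;              // vvvv has been claimed
        }
      }
      return NeedVVVV ? -1 : 0;          // unclaimed non-zero vvvv fails
    }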
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 70315ed..8b79335 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -81,12 +81,18 @@ enum attributeBits {
"but not the operands") \
ENUM_ENTRY(IC_XS, 2, "may say something about the opcode " \
"but not the operands") \
+ ENUM_ENTRY(IC_XD_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
+ ENUM_ENTRY(IC_XS_OPSIZE, 3, "requires an OPSIZE prefix, so " \
+ "operands change width") \
ENUM_ENTRY(IC_64BIT_REXW, 4, "requires a REX.W prefix, so operands "\
"change width; overrides IC_OPSIZE") \
ENUM_ENTRY(IC_64BIT_OPSIZE, 3, "Just as meaningful as IC_OPSIZE") \
ENUM_ENTRY(IC_64BIT_XD, 5, "XD instructions are SSE; REX.W is " \
"secondary") \
ENUM_ENTRY(IC_64BIT_XS, 5, "Just as meaningful as IC_64BIT_XD") \
+ ENUM_ENTRY(IC_64BIT_XD_OPSIZE, 3, "Just as meaningful as IC_XD_OPSIZE") \
+ ENUM_ENTRY(IC_64BIT_XS_OPSIZE, 3, "Just as meaningful as IC_XS_OPSIZE") \
ENUM_ENTRY(IC_64BIT_REXW_XS, 6, "OPSIZE could mean a different " \
"opcode") \
ENUM_ENTRY(IC_64BIT_REXW_XD, 6, "Just as meaningful as " \
@@ -104,7 +110,7 @@ enum attributeBits {
ENUM_ENTRY(IC_VEX_W_OPSIZE, 4, "requires VEX, W, and OpSize") \
ENUM_ENTRY(IC_VEX_L, 3, "requires VEX and the L prefix") \
ENUM_ENTRY(IC_VEX_L_XS, 4, "requires VEX and the L and XS prefix")\
- ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XS prefix")\
+ ENUM_ENTRY(IC_VEX_L_XD, 4, "requires VEX and the L and XD prefix")\
ENUM_ENTRY(IC_VEX_L_OPSIZE, 4, "requires VEX, L, and OpSize")
diff --git a/lib/Target/X86/InstPrinter/CMakeLists.txt b/lib/Target/X86/InstPrinter/CMakeLists.txt
index 033973e..2a2b5db 100644
--- a/lib/Target/X86/InstPrinter/CMakeLists.txt
+++ b/lib/Target/X86/InstPrinter/CMakeLists.txt
@@ -5,4 +5,11 @@ add_llvm_library(LLVMX86AsmPrinter
X86IntelInstPrinter.cpp
X86InstComments.cpp
)
-add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMX86AsmPrinter
+ LLVMMC
+ LLVMSupport
+ LLVMX86Utils
+ )
+
+add_dependencies(LLVMX86AsmPrinter X86CommonTableGen)
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index c37d879..029d491 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -39,14 +39,17 @@ void X86ATTInstPrinter::printRegName(raw_ostream &OS,
OS << '%' << getRegisterName(RegNo);
}
-void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
+void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot) {
// Try to print any aliases first.
if (!printAliasInstr(MI, OS))
printInstruction(MI, OS);
// If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream)
+ if (CommentStream) {
+ printAnnotation(OS, Annot);
EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+ }
}
StringRef X86ATTInstPrinter::getOpcodeName(unsigned Opcode) const {
@@ -90,7 +93,8 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
O << '%' << getRegisterName(Op.getReg());
} else if (Op.isImm()) {
- O << '$' << Op.getImm();
+ // Print X86 immediates as signed values.
+ O << '$' << (int64_t)Op.getImm();
if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
*CommentStream << format("imm = 0x%llX\n", (long long)Op.getImm());
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 5426e5c..0293869 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -25,7 +25,7 @@ public:
X86ATTInstPrinter(const MCAsmInfo &MAI);
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
- virtual void printInst(const MCInst *MI, raw_ostream &OS);
+ virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
virtual StringRef getOpcodeName(unsigned Opcode) const;
// Autogenerated by tblgen, returns true if we successfully printed an
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 4e28dfe..8d85b95 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -14,9 +14,9 @@
#include "X86InstComments.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "Utils/X86ShuffleDecode.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/raw_ostream.h"
-#include "../Utils/X86ShuffleDecode.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -136,9 +136,11 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::SHUFPDrri:
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::SHUFPDrmi:
DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask);
Src1Name = getRegName(MI->getOperand(0).getReg());
- Src2Name = getRegName(MI->getOperand(2).getReg());
break;
case X86::SHUFPSrri:
@@ -205,6 +207,31 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKHPMask(4, ShuffleMask);
Src1Name = getRegName(MI->getOperand(0).getReg());
break;
+ case X86::VPERMILPSri:
+ DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPSYri:
+ DecodeVPERMILPSMask(8, MI->getOperand(2).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPDri:
+ DecodeVPERMILPDMask(2, MI->getOperand(2).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERMILPDYri:
+ DecodeVPERMILPDMask(4, MI->getOperand(2).getImm(),
+ ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(0).getReg());
+ break;
+ case X86::VPERM2F128rr:
+ DecodeVPERM2F128Mask(MI->getOperand(3).getImm(), ShuffleMask);
+ Src1Name = getRegName(MI->getOperand(1).getReg());
+ Src2Name = getRegName(MI->getOperand(2).getReg());
+ break;
}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 506e26c..f9ab5ae 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -32,12 +32,15 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
OS << getRegisterName(RegNo);
}
-void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS) {
+void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+ StringRef Annot) {
printInstruction(MI, OS);
// If verbose assembly is enabled, we can print some informative comments.
- if (CommentStream)
+ if (CommentStream) {
+ printAnnotation(OS, Annot);
EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+ }
}
StringRef X86IntelInstPrinter::getOpcodeName(unsigned Opcode) const {
return getInstructionName(Opcode);
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index e84a194..6d5ec62 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -27,7 +27,7 @@ public:
: MCInstPrinter(MAI) {}
virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
- virtual void printInst(const MCInst *MI, raw_ostream &OS);
+ virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
virtual StringRef getOpcodeName(unsigned Opcode) const;
// Autogenerated by tblgen.
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index ca88f8f..8721912 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -1,7 +1,20 @@
add_llvm_library(LLVMX86Desc
+ X86AsmBackend.cpp
X86MCTargetDesc.cpp
X86MCAsmInfo.cpp
+ X86MCCodeEmitter.cpp
+ X86MachObjectWriter.cpp
)
+add_llvm_library_dependencies(LLVMX86Desc
+ LLVMMC
+ LLVMSupport
+  LLVMX86AsmPrinter
+ LLVMX86Info
+ )
+
+add_dependencies(LLVMX86Desc X86CommonTableGen)
+
# Hack: we need to include 'main' target directory to grab private headers
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/X86/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 9b556a5..69ad7d7 100644
--- a/lib/Target/X86/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetAsmBackend.h"
-#include "X86.h"
-#include "X86FixupKinds.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -24,9 +24,8 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetRegistry.h"
-#include "llvm/Target/TargetAsmBackend.h"
using namespace llvm;
// Option to allow disabling arithmetic relaxation to work around PR9807, which
@@ -63,10 +62,10 @@ public:
: MCELFObjectTargetWriter(is64Bit, OSType, EMachine, HasRelocationAddend) {}
};
-class X86AsmBackend : public TargetAsmBackend {
+class X86AsmBackend : public MCAsmBackend {
public:
X86AsmBackend(const Target &T)
- : TargetAsmBackend() {}
+ : MCAsmBackend() {}
unsigned getNumFixupKinds() const {
return X86::NumTargetFixupKinds;
@@ -81,7 +80,7 @@ public:
};
if (Kind < FirstTargetFixupKind)
- return TargetAsmBackend::getFixupKindInfo(Kind);
+ return MCAsmBackend::getFixupKindInfo(Kind);
assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
"Invalid kind!");
@@ -94,6 +93,14 @@ public:
assert(Fixup.getOffset() + Size <= DataSize &&
"Invalid fixup offset!");
+
+    // Check that the upper bits are either all zeros or all ones.
+ // Specifically ignore overflow/underflow as long as the leakage is
+ // limited to the lower bits. This is to remain compatible with
+ // other assemblers.
+ assert(isIntN(Size * 8 + 1, Value) &&
+ "Value does not fit in the Fixup field");
+
for (unsigned i = 0; i != Size; ++i)
Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
}
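The assert accepts any value representable as a signed (8*Size+1)-bit integer, i.e. the union of the signed and unsigned ranges of the fixup width, which is what tolerating limited overflow/underflow amounts to. A standalone sketch equivalent in spirit to the isIntN() check:

    // Sketch: a value "fits" a Size-byte fixup if it lies in the signed
    // (8*Size+1)-bit range, so both -128 and 255 fit a one-byte field.
    #include <cstdint>

    bool fitsFixup(int64_t Value, unsigned Size) {
      unsigned N = Size * 8 + 1;
      if (N >= 64)
        return true;
      int64_t Lo = -(INT64_C(1) << (N - 1));
      int64_t Hi = (INT64_C(1) << (N - 1)) - 1;
      return Value >= Lo && Value <= Hi;
    }

    // fitsFixup(255, 1) and fitsFixup(-128, 1) hold; fitsFixup(256, 1) fails.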
@@ -426,8 +433,7 @@ public:
} // end anonymous namespace
-TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
- const std::string &TT) {
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
@@ -439,8 +445,7 @@ TargetAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
return new ELFX86_32AsmBackend(T, TheTriple.getOS());
}
-TargetAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
- const std::string &TT) {
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
new file mode 100644
index 0000000..e6ba705
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -0,0 +1,548 @@
+//===-- X86BaseInfo.h - Top level definitions for X86 ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains small standalone helper functions and enum definitions for
+// the X86 target useful for the compiler back-end and the MC libraries.
+// As such, it deliberately does not include references to LLVM core
+// code gen types, passes, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86BASEINFO_H
+#define X86BASEINFO_H
+
+#include "X86MCTargetDesc.h"
+#include "llvm/Support/DataTypes.h"
+#include <cassert>
+
+namespace llvm {
+
+namespace X86 {
+ // Enums for memory operand decoding. Each memory operand is represented with
+ // a 5 operand sequence in the form:
+ // [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
+ // These enums help decode this.
+ enum {
+ AddrBaseReg = 0,
+ AddrScaleAmt = 1,
+ AddrIndexReg = 2,
+ AddrDisp = 3,
+
+ /// AddrSegmentReg - The operand # of the segment in the memory operand.
+ AddrSegmentReg = 4,
+
+ /// AddrNumOperands - Total number of operands in a memory reference.
+ AddrNumOperands = 5
+ };
+} // end namespace X86;
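Given that fixed layout, a consumer can unpack a memory reference from its first operand index; a hedged sketch (Operand stands in for MCOperand, and MemStart would come from a query like getMemoryOperandNo() further down):

    // Sketch: unpack [BaseReg, ScaleAmt, IndexReg, Disp, Segment].
    #include <cstdint>
    #include <vector>

    struct Operand { int64_t Val; };     // stand-in for MCOperand

    struct MemRef { int64_t Base, Scale, Index, Disp, Segment; };

    MemRef unpackMem(const std::vector<Operand> &Ops, unsigned MemStart) {
      MemRef M;
      M.Base    = Ops[MemStart + 0].Val; // AddrBaseReg
      M.Scale   = Ops[MemStart + 1].Val; // AddrScaleAmt
      M.Index   = Ops[MemStart + 2].Val; // AddrIndexReg
      M.Disp    = Ops[MemStart + 3].Val; // AddrDisp
      M.Segment = Ops[MemStart + 4].Val; // AddrSegmentReg
      return M;
    }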
+
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ /// Target Operand Flag enum.
+ enum TOF {
+ //===------------------------------------------------------------------===//
+ // X86 Specific MachineOperand flags.
+
+ MO_NO_FLAG,
+
+ /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
+ /// relocation of:
+ /// SYMBOL_LABEL + [. - PICBASELABEL]
+ MO_GOT_ABSOLUTE_ADDRESS,
+
+ /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
+ /// immediate should get the value of the symbol minus the PIC base label:
+ /// SYMBOL_LABEL - PICBASELABEL
+ MO_PIC_BASE_OFFSET,
+
+ /// MO_GOT - On a symbol operand this indicates that the immediate is the
+ /// offset to the GOT entry for the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOT
+ MO_GOT,
+
+ /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
+ /// the offset to the location of the symbol name from the base of the GOT.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTOFF
+ MO_GOTOFF,
+
+ /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
+ /// offset to the GOT entry for the symbol name from the current code
+ /// location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @GOTPCREL
+ MO_GOTPCREL,
+
+ /// MO_PLT - On a symbol operand this indicates that the immediate is
+ /// offset to the PLT entry of symbol name from the current code location.
+ ///
+ /// See the X86-64 ELF ABI supplement for more details.
+ /// SYMBOL_LABEL @PLT
+ MO_PLT,
+
+ /// MO_TLSGD - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TLSGD
+ MO_TLSGD,
+
+ /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @GOTTPOFF
+ MO_GOTTPOFF,
+
+ /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @INDNTPOFF
+ MO_INDNTPOFF,
+
+ /// MO_TPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @TPOFF
+ MO_TPOFF,
+
+ /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// See 'ELF Handling for Thread-Local Storage' for more details.
+ /// SYMBOL_LABEL @NTPOFF
+ MO_NTPOFF,
+
+ /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "__imp_FOO" symbol. This is used for
+ /// dllimport linkage on windows.
+ MO_DLLIMPORT,
+
+ /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$stub" symbol. This is used for calls
+ /// and jumps to external functions on Tiger and earlier.
+ MO_DARWIN_STUB,
+
+ /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
+ /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
+ /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY,
+
+ /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
+ /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
+ /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
+ MO_DARWIN_NONLAZY_PIC_BASE,
+
+ /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
+    /// indicates that the reference is actually to "FOO$non_lazy_ptr - PICBASE",
+ /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
+ /// stub.
+ MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
+
+ /// MO_TLVP - On a symbol operand this indicates that the immediate is
+ /// some TLS offset.
+ ///
+ /// This is the TLS offset for the Darwin TLS mechanism.
+ MO_TLVP,
+
+ /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
+ /// is some TLS offset from the picbase.
+ ///
+ /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
+ MO_TLVP_PIC_BASE
+ };
+
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction encodings. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+ // PseudoFrm - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 3,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 4,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 5,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 6,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information. In the Intel manual these are represented as /0, /1, ...
+ ///
+
+ // First, instructions that operate on a register r/m operand...
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
+ MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
+ MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
+
+ // MRMInitReg - This form is used for instructions whose source and
+ // destinations are the same register.
+ MRMInitReg = 32,
+
+    /// MRM_C1 - A mod/rm byte of exactly 0xC1.
+ MRM_C1 = 33,
+ MRM_C2 = 34,
+ MRM_C3 = 35,
+ MRM_C4 = 36,
+ MRM_C8 = 37,
+ MRM_C9 = 38,
+ MRM_E8 = 39,
+ MRM_F0 = 40,
+ MRM_F8 = 41,
+ MRM_F9 = 42,
+ MRM_D0 = 45,
+ MRM_D1 = 46,
+
+ /// RawFrmImm8 - This is used for the ENTER instruction, which has two
+ /// immediates, the first of which is a 16-bit immediate (specified by
+ /// the imm encoding) and the second is a 8-bit fixed value.
+ RawFrmImm8 = 43,
+
+ /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
+ /// immediates, the first of which is a 16 or 32-bit immediate (specified by
+ /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
+ /// manual, this operand is described as pntr16:32 and pntr16:16
+ RawFrmImm16 = 44,
+
+ FormMask = 63,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - Set if this instruction requires an operand size prefix (0x66),
+ // which most often indicates that the instruction operates on 16 bit data
+ // instead of 32 bit data.
+ OpSize = 1 << 6,
+
+    // AdSize - Set if this instruction requires an address size prefix (0x67),
+    // which most often indicates that the instruction uses 16-bit addresses
+    // instead of 32-bit addresses (or 32-bit addresses in 64-bit mode).
+ AdSize = 1 << 7,
+
+ //===------------------------------------------------------------------===//
+ // Op0Mask - There are several prefix bytes that are used to form two byte
+ // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
+    // used to obtain the setting of this field. If no bits in this field are
+ // set, there is no prefix byte for obtaining a multibyte opcode.
+ //
+ Op0Shift = 8,
+ Op0Mask = 0x1F << Op0Shift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << Op0Shift,
+
+ // REP - The 0xF3 prefix byte indicating repetition of the following
+ // instruction.
+ REP = 2 << Op0Shift,
+
+ // D8-DF - These escape opcodes are used by the floating point unit. These
+ // values must remain sequential.
+ D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
+ DA = 5 << Op0Shift, DB = 6 << Op0Shift,
+ DC = 7 << Op0Shift, DD = 8 << Op0Shift,
+ DE = 9 << Op0Shift, DF = 10 << Op0Shift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XD = 11 << Op0Shift, XS = 12 << Op0Shift,
+
+ // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
+ T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
+ A6 = 15 << Op0Shift, A7 = 16 << Op0Shift,
+
+ // TF - Prefix before and after 0x0F
+ TF = 17 << Op0Shift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about the REX.W and REX.R bits, and only the former is
+ // statically determined.
+ //
+ REXShift = Op0Shift + 5,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This three-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = REXShift + 1,
+ ImmMask = 7 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm8PCRel = 2 << ImmShift,
+ Imm16 = 3 << ImmShift,
+ Imm16PCRel = 4 << ImmShift,
+ Imm32 = 5 << ImmShift,
+ Imm32PCRel = 6 << ImmShift,
+ Imm64 = 7 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = ImmShift + 3,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+ // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+ // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
+ // result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Lock prefix
+ LOCKShift = FPTypeShift + 3,
+ LOCK = 1 << LOCKShift,
+
+    // Segment override prefixes. Currently we just need the ability to address
+    // stuff in the gs and fs segments.
+ SegOvrShift = LOCKShift + 1,
+ SegOvrMask = 3 << SegOvrShift,
+ FS = 1 << SegOvrShift,
+ GS = 2 << SegOvrShift,
+
+ // Execution domain for SSE instructions in bits 23, 24.
+ // 0 in bits 23-24 means normal, non-SSE instruction.
+ SSEDomainShift = SegOvrShift + 2,
+
+ OpcodeShift = SSEDomainShift + 2,
+
+ //===------------------------------------------------------------------===//
+ /// VEX - The opcode prefix used by AVX instructions
+ VEXShift = OpcodeShift + 8,
+ VEX = 1U << 0,
+
+    /// VEX_W - Has opcode-specific functionality, but is used in the same
+ /// way as REX_W is for regular SSE instructions.
+ VEX_W = 1U << 1,
+
+    /// VEX_4V - Used to specify an additional AVX/SSE register. Several
+    /// two-address SSE instructions are represented as three-address ones in
+    /// AVX, and the additional register is encoded in the VEX_VVVV prefix.
+ VEX_4V = 1U << 2,
+
+    /// VEX_I8IMM - Specifies that the last register used in an AVX instruction
+ /// must be encoded in the i8 immediate field. This usually happens in
+ /// instructions with 4 operands.
+ VEX_I8IMM = 1U << 3,
+
+ /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
+    /// instruction uses 256-bit wide registers. This is usually auto-detected
+    /// if a VR256 register is used, but some AVX instructions also have this
+    /// field marked when using an f256 memory reference.
+ VEX_L = 1U << 4,
+
+ // VEX_LIG - Specifies that this instruction ignores the L-bit in the VEX
+ // prefix. Usually used for scalar instructions. Needed by disassembler.
+ VEX_LIG = 1U << 5,
+
+ /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
+ /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
+ /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
+ /// storing a classifier in the imm8 field. To simplify our implementation,
+    /// we handle this by storing the classifier in the opcode field and using
+ /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
+ Has3DNow0F0FOpcode = 1U << 6
+ };
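All of these fields pack into a single 64-bit TSFlags word, so reading one back is a mask-and-shift; a brief sketch using the shift arithmetic defined above (constants copied from the enum, abridged):

    // Sketch: recover the form and immediate-size fields from TSFlags.
    #include <cstdint>

    const uint64_t FormMask = 63;
    const unsigned ImmShift = 14;        // REXShift (13) + 1, per the enum
    const uint64_t ImmMask  = 7ULL << ImmShift;

    unsigned formOf(uint64_t TSFlags) { return unsigned(TSFlags & FormMask); }

    unsigned immFieldOf(uint64_t TSFlags) {
      return unsigned((TSFlags & ImmMask) >> ImmShift); // 0 == no immediate
    }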
+
+ // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+ // specified machine instruction.
+ //
+ static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
+ return TSFlags >> X86II::OpcodeShift;
+ }
+
+ static inline bool hasImm(uint64_t TSFlags) {
+ return (TSFlags & X86II::ImmMask) != 0;
+ }
+
+ /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
+ /// of the specified instruction.
+ static inline unsigned getSizeOfImm(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: assert(0 && "Unknown immediate size");
+ case X86II::Imm8:
+ case X86II::Imm8PCRel: return 1;
+ case X86II::Imm16:
+ case X86II::Imm16PCRel: return 2;
+ case X86II::Imm32:
+ case X86II::Imm32PCRel: return 4;
+ case X86II::Imm64: return 8;
+ }
+ }
+
+ /// isImmPCRel - Return true if the immediate of the specified instruction's
+ /// TSFlags indicates that it is pc relative.
+ static inline unsigned isImmPCRel(uint64_t TSFlags) {
+ switch (TSFlags & X86II::ImmMask) {
+ default: assert(0 && "Unknown immediate size");
+ case X86II::Imm8PCRel:
+ case X86II::Imm16PCRel:
+ case X86II::Imm32PCRel:
+ return true;
+ case X86II::Imm8:
+ case X86II::Imm16:
+ case X86II::Imm32:
+ case X86II::Imm64:
+ return false;
+ }
+ }
+
+ /// getMemoryOperandNo - The function returns the MCInst operand # for the
+ /// first field of the memory operand. If the instruction doesn't have a
+ /// memory operand, this returns -1.
+ ///
+ /// Note that this ignores tied operands. If there is a tied register which
+ /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
+ /// counted as one operand.
+ ///
+ static inline int getMemoryOperandNo(uint64_t TSFlags) {
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg: assert(0 && "FIXME: Remove this form");
+ default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!");
+ case X86II::Pseudo:
+ case X86II::RawFrm:
+ case X86II::AddRegFrm:
+ case X86II::MRMDestReg:
+ case X86II::MRMSrcReg:
+ case X86II::RawFrmImm8:
+ case X86II::RawFrmImm16:
+ return -1;
+ case X86II::MRMDestMem:
+ return 0;
+ case X86II::MRMSrcMem: {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ unsigned FirstMemOp = 1;
+ if (HasVEX_4V)
+ ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
+
+ // FIXME: Maybe lea should have its own form? This is a horrible hack.
+ //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+ // Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+ return FirstMemOp;
+ }
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ return -1;
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ return 0;
+ case X86II::MRM_C1:
+ case X86II::MRM_C2:
+ case X86II::MRM_C3:
+ case X86II::MRM_C4:
+ case X86II::MRM_C8:
+ case X86II::MRM_C9:
+ case X86II::MRM_E8:
+ case X86II::MRM_F0:
+ case X86II::MRM_F8:
+ case X86II::MRM_F9:
+ case X86II::MRM_D0:
+ case X86II::MRM_D1:
+ return -1;
+ }
+ }
+
+ /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
+ /// higher) register? e.g. r8, xmm8, xmm13, etc.
+ static inline bool isX86_64ExtendedReg(unsigned RegNo) {
+ switch (RegNo) {
+ default: break;
+ case X86::R8: case X86::R9: case X86::R10: case X86::R11:
+ case X86::R12: case X86::R13: case X86::R14: case X86::R15:
+ case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
+ case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
+ case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
+ case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
+ case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
+ case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
+ case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+ case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+ case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
+ case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+ case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
+ case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
+ return true;
+ }
+ return false;
+ }
+
+ static inline bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+ }
+}
+
+} // end namespace llvm;
+
+#endif
diff --git a/lib/Target/X86/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index 17d242a..17d242a 100644
--- a/lib/Target/X86/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index ce8ef49..2eee112 100644
--- a/lib/Target/X86/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -12,12 +12,14 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mccodeemitter"
-#include "X86.h"
-#include "X86InstrInfo.h"
-#include "X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86FixupKinds.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/raw_ostream.h"
@@ -45,7 +47,7 @@ public:
}
static unsigned GetX86RegNum(const MCOperand &MO) {
- return X86RegisterInfo::getX86RegNum(MO.getReg());
+ return X86_MC::getX86RegNum(MO.getReg());
}
// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
@@ -159,9 +161,11 @@ static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
-
- if ((BaseReg.getReg() != 0 && X86::GR32RegClass.contains(BaseReg.getReg())) ||
- (IndexReg.getReg() != 0 && X86::GR32RegClass.contains(IndexReg.getReg())))
+
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg.getReg())))
return true;
return false;
}
@@ -191,11 +195,11 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
const MCExpr *Expr = NULL;
if (DispOp.isImm()) {
- // If this is a simple integer displacement that doesn't require a relocation,
- // emit it now.
+ // If this is a simple integer displacement that doesn't require a
+ // relocation, emit it now.
if (FixupKind != FK_PCRel_1 &&
- FixupKind != FK_PCRel_2 &&
- FixupKind != FK_PCRel_4) {
+ FixupKind != FK_PCRel_2 &&
+ FixupKind != FK_PCRel_4) {
EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
return;
}
@@ -205,7 +209,9 @@ EmitImmediate(const MCOperand &DispOp, unsigned Size, MCFixupKind FixupKind,
}
// If we have an immoffset, add it to the expression.
- if (FixupKind == FK_Data_4 && StartsWithGlobalOffsetTable(Expr)) {
+ if ((FixupKind == FK_Data_4 ||
+ FixupKind == MCFixupKind(X86::reloc_signed_4byte)) &&
+ StartsWithGlobalOffsetTable(Expr)) {
assert(ImmOffset == 0);
FixupKind = MCFixupKind(X86::reloc_global_offset_table);
@@ -346,7 +352,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
}
// Calculate what the SS field value should be...
- static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+ static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
unsigned SS = SSTable[Scale.getImm()];
if (BaseReg == 0) {
@@ -486,71 +492,100 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
VEX_L = 1;
}
- unsigned NumOps = MI.getNumOperands();
+ // Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned CurOp = 0;
- bool IsDestMem = false;
-
switch (TSFlags & X86II::FormMask) {
case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
- case X86II::MRMDestMem:
- IsDestMem = true;
- // The important info for the VEX prefix is never beyond the address
- // registers. Don't check beyond that.
- NumOps = CurOp = X86::AddrNumOperands;
+ case X86II::MRMDestMem: {
+ // MRMDestMem instructions forms:
+ // MemAddr, src1(ModR/M)
+ // MemAddr, src1(VEX_4V), src2(ModR/M)
+ // MemAddr, src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+
+ CurOp = X86::AddrNumOperands;
+ if (HasVEX_4V)
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+
+ const MCOperand &MO = MI.getOperand(CurOp);
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
+ VEX_R = 0x0;
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ // MRMSrcMem instructions forms:
+ // src1(ModR/M), MemAddr
+ // src1(ModR/M), src2(VEX_4V), MemAddr
+ // src1(ModR/M), MemAddr, imm8
+ // src1(ModR/M), MemAddr, src2(VEX_I8IMM)
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ VEX_R = 0x0;
+
+ unsigned MemAddrOffset = 1;
+ if (HasVEX_4V) {
+ VEX_4V = getVEXRegisterEncoding(MI, 1);
+ MemAddrOffset++;
+ }
+
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemAddrOffset+X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(
+ MI.getOperand(MemAddrOffset+X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
+ }
case X86II::MRM0m: case X86II::MRM1m:
case X86II::MRM2m: case X86II::MRM3m:
case X86II::MRM4m: case X86II::MRM5m:
case X86II::MRM6m: case X86II::MRM7m:
- case X86II::MRMSrcMem:
+ // MRM[0-9]m instructions forms:
+ // MemAddr
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+ VEX_X = 0x0;
+ break;
case X86II::MRMSrcReg:
- if (MI.getNumOperands() > CurOp && MI.getOperand(CurOp).isReg() &&
- X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ // MRMSrcReg instructions forms:
+ // dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
+ // dst(ModR/M), src1(ModR/M)
+ // dst(ModR/M), src1(ModR/M), imm8
+ //
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
VEX_R = 0x0;
CurOp++;
- if (HasVEX_4V) {
- VEX_4V = getVEXRegisterEncoding(MI, IsDestMem ? CurOp-1 : CurOp);
- CurOp++;
- }
-
- // To only check operands before the memory address ones, start
- // the search from the beginning
- if (IsDestMem)
- CurOp = 0;
-
- // If the last register should be encoded in the immediate field
- // do not use any bit from VEX prefix to this register, ignore it
- if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM)
- NumOps--;
-
- for (; CurOp != NumOps; ++CurOp) {
- const MCOperand &MO = MI.getOperand(CurOp);
- if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
- VEX_B = 0x0;
- if (!VEX_B && MO.isReg() &&
- ((TSFlags & X86II::FormMask) == X86II::MRMSrcMem) &&
- X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
- VEX_X = 0x0;
- }
- break;
- default: // MRMDestReg, MRM0r-MRM7r, RawFrm
- if (!MI.getNumOperands())
- break;
-
- if (MI.getOperand(CurOp).isReg() &&
- X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
- VEX_B = 0;
-
if (HasVEX_4V)
- VEX_4V = getVEXRegisterEncoding(MI, CurOp);
-
- CurOp++;
- for (; CurOp != NumOps; ++CurOp) {
- const MCOperand &MO = MI.getOperand(CurOp);
- if (MO.isReg() && !HasVEX_4V &&
- X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
- VEX_R = 0x0;
- }
+ VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
+ VEX_B = 0x0;
+ break;
+ case X86II::MRMDestReg:
+ // MRMDestReg instructions forms:
+ // dst(ModR/M), src(ModR/M)
+ // dst(ModR/M), src(ModR/M), imm8
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ VEX_B = 0x0;
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_R = 0x0;
+ break;
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ // MRM0r-MRM7r instructions forms:
+ // dst(VEX_4V), src(ModR/M), imm8
+ VEX_4V = getVEXRegisterEncoding(MI, 0);
+ if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+ VEX_B = 0x0;
+ break;
+ default: // RawFrm
break;
}
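One subtlety the rewritten classification relies on: VEX stores R, X and B inverted relative to REX, so the code clears a bit to 0 exactly when the corresponding register is extended (r8..r15, xmm8..xmm15, ...). A one-line sketch of that convention:

    // Sketch: VEX.R/X/B are the complements of REX.R/X/B.
    #include <cstdint>

    uint8_t vexBitFor(bool RegIsExtended) {
      return RegIsExtended ? 0x0 : 0x1;  // 0 means "extended", as in VEX_R = 0x0
    }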
@@ -604,7 +639,7 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
const MCOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
- if (!X86InstrInfo::isX86_64NonExtLowByteReg(Reg)) continue;
+ if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue;
// FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
// that returns non-zero.
REX |= 0x40; // REX fixed encoding prefix
@@ -615,25 +650,25 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
case X86II::MRMSrcReg:
if (MI.getOperand(0).isReg() &&
- X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
REX |= 1 << 2; // set REX.R
i = isTwoAddr ? 2 : 1;
for (; i != NumOps; ++i) {
const MCOperand &MO = MI.getOperand(i);
- if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
REX |= 1 << 0; // set REX.B
}
break;
case X86II::MRMSrcMem: {
if (MI.getOperand(0).isReg() &&
- X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
REX |= 1 << 2; // set REX.R
unsigned Bit = 0;
i = isTwoAddr ? 2 : 1;
for (; i != NumOps; ++i) {
const MCOperand &MO = MI.getOperand(i);
if (MO.isReg()) {
- if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
REX |= 1 << Bit; // set REX.B (Bit=0) and REX.X (Bit=1)
Bit++;
}
@@ -648,13 +683,13 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
unsigned e = (isTwoAddr ? X86::AddrNumOperands+1 : X86::AddrNumOperands);
i = isTwoAddr ? 1 : 0;
if (NumOps > e && MI.getOperand(e).isReg() &&
- X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+ X86II::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
REX |= 1 << 2; // set REX.R
unsigned Bit = 0;
for (; i != e; ++i) {
const MCOperand &MO = MI.getOperand(i);
if (MO.isReg()) {
- if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ if (X86II::isX86_64ExtendedReg(MO.getReg()))
REX |= 1 << Bit; // REX.B (Bit=0) and REX.X (Bit=1)
Bit++;
}
@@ -663,12 +698,12 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
}
default:
if (MI.getOperand(0).isReg() &&
- X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+ X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
REX |= 1 << 0; // set REX.B
i = isTwoAddr ? 2 : 1;
for (unsigned e = NumOps; i != e; ++i) {
const MCOperand &MO = MI.getOperand(i);
- if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+ if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
REX |= 1 << 2; // set REX.R
}
break;
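
The REX logic above applies the same extension bits in their non-inverted form. REX is the single byte 0100WRXB; a hedged sketch of its assembly, independent of the LLVM helpers used here:

#include <cstdint>

// REX = 0b0100WRXB: W selects 64-bit operand size, R extends ModRM.reg,
// X extends SIB.index, and B extends ModRM.rm / SIB.base.
uint8_t buildREX(bool W, bool R, bool X, bool B) {
  return (uint8_t)(0x40 | (W << 3) | (R << 2) | (X << 1) | (unsigned)B);
}
// buildREX(true, false, false, true) == 0x49, the prefix seen on "mov r9, rax".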
@@ -731,7 +766,7 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if ((TSFlags & X86II::AdSize) ||
(MemOperand != -1 && is64BitMode() && Is32BitMemOperand(MI, MemOperand)))
EmitByte(0x67, CurByte, OS);
-
+
// Emit the operand size opcode prefix as needed.
if (TSFlags & X86II::OpSize)
EmitByte(0x66, CurByte, OS);
@@ -834,7 +869,6 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_4V)
HasVEX_4V = true;
-
// Determine where the memory operand starts, if present.
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
if (MemoryOperand != -1) MemoryOperand += CurOp;
@@ -844,12 +878,11 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
else
EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
-
unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
-
+
if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
BaseOpcode = 0x0F; // Weird 3DNow! encoding.
-
+
unsigned SrcRegNum = 0;
switch (TSFlags & X86II::FormMask) {
case X86II::MRMInitReg:
@@ -861,7 +894,6 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::RawFrm:
EmitByte(BaseOpcode, CurByte, OS);
break;
-
case X86II::RawFrmImm8:
EmitByte(BaseOpcode, CurByte, OS);
EmitImmediate(MI.getOperand(CurOp++),
@@ -1006,8 +1038,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
// in bits[7:4] of an immediate byte, and bits[3:0] are ignored.
if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) {
const MCOperand &MO = MI.getOperand(CurOp++);
- bool IsExtReg =
- X86InstrInfo::isX86_64ExtendedReg(MO.getReg());
+ bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg());
unsigned RegNum = (IsExtReg ? (1 << 7) : 0);
RegNum |= GetX86RegNum(MO) << 4;
EmitImmediate(MCOperand::CreateImm(RegNum), 1, FK_Data_1, CurByte, OS,
@@ -1030,7 +1061,6 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if ((TSFlags >> X86II::VEXShift) & X86II::Has3DNow0F0FOpcode)
EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
-
#ifndef NDEBUG
// FIXME: Verify.
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index b77f37b..f98d5e3 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -13,12 +13,18 @@
#include "X86MCTargetDesc.h"
#include "X86MCAsmInfo.h"
+#include "InstPrinter/X86ATTInstPrinter.h"
+#include "InstPrinter/X86IntelInstPrinter.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_REGINFO_MC_DESC
#include "X86GenRegisterInfo.inc"
@@ -34,9 +40,16 @@ using namespace llvm;
std::string X86_MC::ParseX86Triple(StringRef TT) {
Triple TheTriple(TT);
+ std::string FS;
if (TheTriple.getArch() == Triple::x86_64)
- return "+64bit-mode";
- return "-64bit-mode";
+ FS = "+64bit-mode";
+ else
+ FS = "-64bit-mode";
+ if (TheTriple.getOS() == Triple::NativeClient)
+ FS += ",+nacl-mode";
+ else
+ FS += ",-nacl-mode";
+ return FS;
}
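
ParseX86Triple now builds a comma-separated feature string rather than returning a single flag, so more features can be appended later. A rough standalone restatement of the new behaviour (the helper name is illustrative, not the LLVM API):

#include <string>

// Always emits +/-64bit-mode first, then +/-nacl-mode, as in the patch.
std::string parseX86TripleFeatures(bool Is64Bit, bool IsNaCl) {
  std::string FS = Is64Bit ? "+64bit-mode" : "-64bit-mode";
  FS += IsNaCl ? ",+nacl-mode" : ",-nacl-mode";
  return FS;  // e.g. "+64bit-mode,-nacl-mode" for x86_64-unknown-linux-gnu
}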
/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
@@ -107,6 +120,135 @@ void X86_MC::DetectFamilyModel(unsigned EAX, unsigned &Family,
}
}
+unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
+ Triple TheTriple(TT);
+ if (TheTriple.getArch() == Triple::x86_64)
+ return DWARFFlavour::X86_64;
+
+ if (TheTriple.isOSDarwin())
+ return isEH ? DWARFFlavour::X86_32_DarwinEH : DWARFFlavour::X86_32_Generic;
+ if (TheTriple.getOS() == Triple::MinGW32 ||
+ TheTriple.getOS() == Triple::Cygwin)
+ // Unsupported for now; just a quick fallback
+ return DWARFFlavour::X86_32_Generic;
+ return DWARFFlavour::X86_32_Generic;
+}
+
+/// getX86RegNum - Map an LLVM register identifier to its X86-specific
+/// numbering, which is used in various places when encoding instructions.
+unsigned X86_MC::getX86RegNum(unsigned RegNo) {
+ switch(RegNo) {
+ case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+ case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+ case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+ case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+ case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+ return N86::ESP;
+ case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+ return N86::EBP;
+ case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+ return N86::ESI;
+ case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+ return N86::EDI;
+
+ case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
+ return N86::EAX;
+ case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
+ return N86::ECX;
+ case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+ return N86::EDX;
+ case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+ return N86::EBX;
+ case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+ return N86::ESP;
+ case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+ return N86::EBP;
+ case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+ return N86::ESI;
+ case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+ return N86::EDI;
+
+ case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+ case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+ return RegNo-X86::ST0;
+
+ case X86::XMM0: case X86::XMM8:
+ case X86::YMM0: case X86::YMM8: case X86::MM0:
+ return 0;
+ case X86::XMM1: case X86::XMM9:
+ case X86::YMM1: case X86::YMM9: case X86::MM1:
+ return 1;
+ case X86::XMM2: case X86::XMM10:
+ case X86::YMM2: case X86::YMM10: case X86::MM2:
+ return 2;
+ case X86::XMM3: case X86::XMM11:
+ case X86::YMM3: case X86::YMM11: case X86::MM3:
+ return 3;
+ case X86::XMM4: case X86::XMM12:
+ case X86::YMM4: case X86::YMM12: case X86::MM4:
+ return 4;
+ case X86::XMM5: case X86::XMM13:
+ case X86::YMM5: case X86::YMM13: case X86::MM5:
+ return 5;
+ case X86::XMM6: case X86::XMM14:
+ case X86::YMM6: case X86::YMM14: case X86::MM6:
+ return 6;
+ case X86::XMM7: case X86::XMM15:
+ case X86::YMM7: case X86::YMM15: case X86::MM7:
+ return 7;
+
+ case X86::ES: return 0;
+ case X86::CS: return 1;
+ case X86::SS: return 2;
+ case X86::DS: return 3;
+ case X86::FS: return 4;
+ case X86::GS: return 5;
+
+ case X86::CR0: case X86::CR8 : case X86::DR0: return 0;
+ case X86::CR1: case X86::CR9 : case X86::DR1: return 1;
+ case X86::CR2: case X86::CR10: case X86::DR2: return 2;
+ case X86::CR3: case X86::CR11: case X86::DR3: return 3;
+ case X86::CR4: case X86::CR12: case X86::DR4: return 4;
+ case X86::CR5: case X86::CR13: case X86::DR5: return 5;
+ case X86::CR6: case X86::CR14: case X86::DR6: return 6;
+ case X86::CR7: case X86::CR15: case X86::DR7: return 7;
+
+ // Pseudo index registers are equivalent to a "none"
+ // scaled index (See Intel Manual 2A, table 2-3)
+ case X86::EIZ:
+ case X86::RIZ:
+ return 4;
+
+ default:
+ assert((int(RegNo) > 0) && "Unknown physical register!");
+ return 0;
+ }
+}
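
Note that getX86RegNum yields only the three ModRM/SIB field bits: R8-R15 deliberately collapse onto the same 0-7 values as RAX-RDI, with the fourth bit carried by a REX or VEX prefix instead. For the plain GPRs the whole table reduces to a mask, as this hedged sketch shows (raw hardware encodings assumed; helper name illustrative):

// For GPRs with hardware encodings 0-15, the ModRM field is the low three
// bits; bit 3 travels in REX.R/X/B rather than in ModRM itself.
unsigned x86RegNumFromHWEnc(unsigned HWEnc) { return HWEnc & 7; }
// x86RegNumFromHWEnc(9 /* R9 */) == 1, the same field value as RCX above.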
+
+void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
+ // FIXME: TableGen these.
+ for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
+ int SEH = X86_MC::getX86RegNum(Reg);
+ switch (Reg) {
+ case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
+ case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
+ case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+ case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+ case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+ case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+ case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+ case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+ case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+ case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+ case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
+ case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
+ SEH += 8;
+ break;
+ }
+ MRI->mapLLVMRegToSEHReg(Reg, SEH);
+ }
+}
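
Win64 SEH unwind codes store the full four-bit register number, which is why the loop above adds 8 back for exactly the extended registers that getX86RegNum truncates. A compact restatement (helper name illustrative):

// SEH register numbers are the raw 4-bit encodings: the 3-bit ModRM number
// plus the extension bit that the instruction prefixes normally carry.
unsigned sehRegNum(unsigned ModRMNum, bool IsExtended) {
  return ModRMNum + (IsExtended ? 8 : 0);  // e.g. R10: 2 + 8 == 10
}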
+
MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS) {
std::string ArchFS = X86_MC::ParseX86Triple(TT);
@@ -131,55 +273,191 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-// Force static initialization.
-extern "C" void LLVMInitializeX86MCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target,
- X86_MC::createX86MCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target,
- X86_MC::createX86MCSubtargetInfo);
-}
-
static MCInstrInfo *createX86MCInstrInfo() {
MCInstrInfo *X = new MCInstrInfo();
InitX86MCInstrInfo(X);
return X;
}
-extern "C" void LLVMInitializeX86MCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo);
-}
+static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
+ Triple TheTriple(TT);
+ unsigned RA = (TheTriple.getArch() == Triple::x86_64)
+ ? X86::RIP // Should have dwarf #16.
+ : X86::EIP; // Should have dwarf #8.
-static MCRegisterInfo *createX86MCRegisterInfo() {
MCRegisterInfo *X = new MCRegisterInfo();
- InitX86MCRegisterInfo(X);
+ InitX86MCRegisterInfo(X, RA,
+ X86_MC::getDwarfRegFlavour(TT, false),
+ X86_MC::getDwarfRegFlavour(TT, true));
+ X86_MC::InitLLVM2SEHRegisterMapping(X);
return X;
}
-extern "C" void LLVMInitializeX86MCRegInfo() {
- TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo);
-}
-
-
static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
Triple TheTriple(TT);
+ bool is64Bit = TheTriple.getArch() == Triple::x86_64;
+ MCAsmInfo *MAI;
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
- if (TheTriple.getArch() == Triple::x86_64)
- return new X86_64MCAsmInfoDarwin(TheTriple);
+ if (is64Bit)
+ MAI = new X86_64MCAsmInfoDarwin(TheTriple);
else
- return new X86MCAsmInfoDarwin(TheTriple);
+ MAI = new X86MCAsmInfoDarwin(TheTriple);
+ } else if (TheTriple.isOSWindows()) {
+ MAI = new X86MCAsmInfoCOFF(TheTriple);
+ } else {
+ MAI = new X86ELFMCAsmInfo(TheTriple);
}
+ // Initialize the initial frame state.
+ // Calculate the number of bytes used to store the return address.
+ int stackGrowth = is64Bit ? -8 : -4;
+
+ // Initial state of the frame pointer is esp+stackGrowth.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+ MAI->addInitialFrameState(0, Dst, Src);
+
+ // Add the return address to the move list.
+ MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
+ MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
+ MAI->addInitialFrameState(0, CSDst, CSSrc);
+
+ return MAI;
+}
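
The two addInitialFrameState calls record the DWARF view of function entry: the canonical frame address is the stack pointer plus one slot (the just-pushed return address), and the return address lives in that slot. A small program printing those facts, assuming only that the slot size matches -stackGrowth above:

#include <cstdio>

int main() {
  for (bool is64Bit : {false, true}) {
    int slot = is64Bit ? 8 : 4;  // matches -stackGrowth in the patch
    std::printf("%s: CFA = %s + %d, return address at [CFA - %d]\n",
                is64Bit ? "x86-64" : "x86-32",
                is64Bit ? "RSP" : "ESP", slot, slot);
  }
  return 0;
}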
+
+static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+
+ Triple T(TT);
+ bool is64Bit = T.getArch() == Triple::x86_64;
+
+ if (RM == Reloc::Default) {
+ // Darwin defaults to PIC in 64-bit mode and dynamic-no-pic in 32-bit
+ // mode. Win64 requires rip-relative addressing, so we force PIC there.
+ // Otherwise we use the static relocation model by default.
+ if (T.isOSDarwin()) {
+ if (is64Bit)
+ RM = Reloc::PIC_;
+ else
+ RM = Reloc::DynamicNoPIC;
+ } else if (T.isOSWindows() && is64Bit)
+ RM = Reloc::PIC_;
+ else
+ RM = Reloc::Static;
+ }
+
+ // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
+ // is defined as a model for code which may be used in static or dynamic
+ // executables but not necessarily a shared library. On X86-32 we just
+ // compile in -static mode; on X86-64 we use PIC.
+ if (RM == Reloc::DynamicNoPIC) {
+ if (is64Bit)
+ RM = Reloc::PIC_;
+ else if (!T.isOSDarwin())
+ RM = Reloc::Static;
+ }
+
+ // If we are on Darwin, disallow the static relocation model in X86-64
+ // mode, since the Mach-O file format doesn't support it.
+ if (RM == Reloc::Static && T.isOSDarwin() && is64Bit)
+ RM = Reloc::PIC_;
+
+ // If the code model is not already set, default to the small code model.
+ if (CM == CodeModel::Default)
+ CM = CodeModel::Small;
+ else if (CM == CodeModel::JITDefault)
+ // 64-bit JIT places everything in the same buffer except external funcs.
+ CM = is64Bit ? CodeModel::Large : CodeModel::Small;
+
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
+}
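
The relocation-model defaulting above reduces to a small decision table. A standalone restatement under the same conditions (the enum is illustrative, not LLVM's Reloc::Model):

enum class RelocModel { Static, PIC, DynamicNoPIC };

// Platform defaults chosen by createX86MCCodeGenInfo when RM is unset.
RelocModel defaultRelocModel(bool IsDarwin, bool IsWindows, bool Is64Bit) {
  if (IsDarwin)
    return Is64Bit ? RelocModel::PIC : RelocModel::DynamicNoPIC;
  if (IsWindows && Is64Bit)
    return RelocModel::PIC;  // Win64 needs rip-relative addressing
  return RelocModel::Static;
}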
+
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+ MCContext &Ctx, MCAsmBackend &MAB,
+ raw_ostream &_OS,
+ MCCodeEmitter *_Emitter,
+ bool RelaxAll,
+ bool NoExecStack) {
+ Triple TheTriple(TT);
+
+ if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
+ return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
+
if (TheTriple.isOSWindows())
- return new X86MCAsmInfoCOFF(TheTriple);
+ return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
+
+ return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+}
+
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+ unsigned SyntaxVariant,
+ const MCAsmInfo &MAI,
+ const MCSubtargetInfo &STI) {
+ if (SyntaxVariant == 0)
+ return new X86ATTInstPrinter(MAI);
+ if (SyntaxVariant == 1)
+ return new X86IntelInstPrinter(MAI);
+ return 0;
+}
- return new X86ELFMCAsmInfo(TheTriple);
+static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
+ return new MCInstrAnalysis(Info);
}
-extern "C" void LLVMInitializeX86MCAsmInfo() {
- // Register the target asm info.
+// Force static initialization.
+extern "C" void LLVMInitializeX86TargetMC() {
+ // Register the MC asm info.
RegisterMCAsmInfoFn A(TheX86_32Target, createX86MCAsmInfo);
RegisterMCAsmInfoFn B(TheX86_64Target, createX86MCAsmInfo);
+
+ // Register the MC codegen info.
+ RegisterMCCodeGenInfoFn C(TheX86_32Target, createX86MCCodeGenInfo);
+ RegisterMCCodeGenInfoFn D(TheX86_64Target, createX86MCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheX86_32Target, createX86MCInstrInfo);
+ TargetRegistry::RegisterMCInstrInfo(TheX86_64Target, createX86MCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheX86_32Target, createX86MCRegisterInfo);
+ TargetRegistry::RegisterMCRegInfo(TheX86_64Target, createX86MCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheX86_32Target,
+ X86_MC::createX86MCSubtargetInfo);
+ TargetRegistry::RegisterMCSubtargetInfo(TheX86_64Target,
+ X86_MC::createX86MCSubtargetInfo);
+
+ // Register the MC instruction analyzer.
+ TargetRegistry::RegisterMCInstrAnalysis(TheX86_32Target,
+ createX86MCInstrAnalysis);
+ TargetRegistry::RegisterMCInstrAnalysis(TheX86_64Target,
+ createX86MCInstrAnalysis);
+
+ // Register the code emitter.
+ TargetRegistry::RegisterMCCodeEmitter(TheX86_32Target,
+ createX86MCCodeEmitter);
+ TargetRegistry::RegisterMCCodeEmitter(TheX86_64Target,
+ createX86MCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(TheX86_32Target,
+ createX86_32AsmBackend);
+ TargetRegistry::RegisterMCAsmBackend(TheX86_64Target,
+ createX86_64AsmBackend);
+
+ // Register the object streamer.
+ TargetRegistry::RegisterMCObjectStreamer(TheX86_32Target,
+ createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheX86_64Target,
+ createMCStreamer);
+
+ // Register the MCInstPrinter.
+ TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,
+ createX86MCInstPrinter);
+ TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
+ createX86MCInstPrinter);
}
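
All the per-component initializers are now folded into this single LLVMInitializeX86TargetMC hook. The registry pattern underneath is simply named factory functions; a toy sketch of the idea (the real TargetRegistry hangs the factories off Target objects rather than a map):

#include <functional>
#include <map>
#include <string>

template <typename T>
class ToyRegistry {
  std::map<std::string, std::function<T *()>> Factories;
public:
  void Register(const std::string &Name, std::function<T *()> F) {
    Factories[Name] = std::move(F);
  }
  T *create(const std::string &Name) const {
    auto It = Factories.find(Name);
    return It == Factories.end() ? nullptr : It->second();
  }
};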
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 89ea22b..c144c51 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -14,15 +14,39 @@
#ifndef X86MCTARGETDESC_H
#define X86MCTARGETDESC_H
+#include "llvm/Support/DataTypes.h"
#include <string>
namespace llvm {
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCInstrInfo;
+class MCObjectWriter;
+class MCRegisterInfo;
class MCSubtargetInfo;
class Target;
class StringRef;
+class raw_ostream;
extern Target TheX86_32Target, TheX86_64Target;
+/// DWARFFlavour - Flavour of DWARF register numbers
+///
+namespace DWARFFlavour {
+ enum {
+ X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+ };
+}
+
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
namespace X86_MC {
std::string ParseX86Triple(StringRef TT);
@@ -33,13 +57,32 @@ namespace X86_MC {
void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model);
- /// createARMMCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
+ unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
+
+ unsigned getX86RegNum(unsigned RegNo);
+
+ void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
+
+ /// createX86MCSubtargetInfo - Create an X86 MCSubtargetInfo instance.
/// This is exposed so Asm parser, etc. do not need to go through
/// TargetRegistry.
MCSubtargetInfo *createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
StringRef FS);
}
+MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx);
+
+MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT);
+
+/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
+MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
+ bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
+
} // End llvm namespace
diff --git a/lib/Target/X86/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 3711038..f0f1982 100644
--- a/lib/Target/X86/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -7,14 +7,14 @@
//
//===----------------------------------------------------------------------===//
-#include "X86.h"
-#include "X86FixupKinds.h"
-#include "llvm/ADT/Twine.h"
+#include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Object/MachOFormat.h"
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index f16ec02..7d901af 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -862,7 +862,7 @@ define float @bar(float %x) nounwind {
This IR (from PR6194):
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"
%0 = type { double, double }
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 560947a..b407955 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -2,11 +2,6 @@
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//
-We should add support for the "movbe" instruction, which does a byte-swapping
-copy (3-addr bswap + memory support?) This is available on Atom processors.
-
-//===---------------------------------------------------------------------===//
-
This should be one DIV/IDIV instruction, not a libcall:
unsigned test(unsigned long long X, unsigned Y) {
@@ -1222,7 +1217,7 @@ Also check why xmm7 is not used at all in the function.
Take the following:
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
@@ -2066,3 +2061,34 @@ The trick is to match "fetch_and_add(X, -C) == C".
//===---------------------------------------------------------------------===//
+unsigned log2(unsigned x) {
+ return x > 1 ? 32-__builtin_clz(x-1) : 0;
+}
+
+generates (x86_64):
+ xorl %eax, %eax
+ cmpl $2, %edi
+ jb LBB0_2
+## BB#1:
+ decl %edi
+ movl $63, %ecx
+ bsrl %edi, %eax
+ cmovel %ecx, %eax
+ xorl $-32, %eax
+ addl $33, %eax
+LBB0_2:
+ ret
+
+The cmov and the early test are redundant:
+ xorl %eax, %eax
+ cmpl $2, %edi
+ jb LBB0_2
+## BB#1:
+ decl %edi
+ bsrl %edi, %eax
+ xorl $-32, %eax
+ addl $33, %eax
+LBB0_2:
+ ret
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt
index 90be9f5..4da00fa 100644
--- a/lib/Target/X86/TargetInfo/CMakeLists.txt
+++ b/lib/Target/X86/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMX86Info
X86TargetInfo.cpp
)
-add_dependencies(LLVMX86Info X86CodeGenTable_gen)
+add_llvm_library_dependencies(LLVMX86Info
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMX86Info X86CommonTableGen)
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 08d4d84..52a67f7 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -9,7 +9,7 @@
#include "X86.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheX86_32Target, llvm::TheX86_64Target;
diff --git a/lib/Target/X86/Utils/CMakeLists.txt b/lib/Target/X86/Utils/CMakeLists.txt
index 3ad5f99..caffd8b 100644
--- a/lib/Target/X86/Utils/CMakeLists.txt
+++ b/lib/Target/X86/Utils/CMakeLists.txt
@@ -3,4 +3,10 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
add_llvm_library(LLVMX86Utils
X86ShuffleDecode.cpp
)
-add_dependencies(LLVMX86Utils X86CodeGenTable_gen)
+
+add_llvm_library_dependencies(LLVMX86Utils
+ LLVMCore
+ LLVMSupport
+ )
+
+add_dependencies(LLVMX86Utils X86CommonTableGen)
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index cd06060..aeb3309 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -167,24 +167,77 @@ void DecodeUNPCKLPMask(EVT VT,
SmallVectorImpl<unsigned> &ShuffleMask) {
unsigned NumElts = VT.getVectorNumElements();
- // Handle vector lengths > 128 bits. Define a "section" as a set of
- // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
- // sections.
- unsigned NumSections = VT.getSizeInBits() / 128;
- if (NumSections == 0 ) NumSections = 1; // Handle MMX
- unsigned NumSectionElts = NumElts / NumSections;
+ // Handle 128- and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ if (NumLanes == 0 ) NumLanes = 1; // Handle MMX
+ unsigned NumLaneElts = NumElts / NumLanes;
unsigned Start = 0;
- unsigned End = NumSectionElts / 2;
- for (unsigned s = 0; s < NumSections; ++s) {
+ unsigned End = NumLaneElts / 2;
+ for (unsigned s = 0; s < NumLanes; ++s) {
for (unsigned i = Start; i != End; ++i) {
ShuffleMask.push_back(i); // Reads from dest/src1
- ShuffleMask.push_back(i+NumSectionElts); // Reads from src/src2
+ ShuffleMask.push_back(i+NumLaneElts); // Reads from src/src2
}
// Process the next 128 bits.
- Start += NumSectionElts;
- End += NumSectionElts;
+ Start += NumLaneElts;
+ End += NumLaneElts;
}
}
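
For the single-lane v4f32 case the renamed loop still yields the classic unpack-low interleave. A minimal re-run of exactly that case:

#include <cstdio>
#include <vector>

int main() {
  unsigned NumLaneElts = 4;  // v4f32: one 128-bit lane of four elements
  std::vector<unsigned> Mask;
  for (unsigned i = 0; i != NumLaneElts / 2; ++i) {
    Mask.push_back(i);                // reads from dest/src1
    Mask.push_back(i + NumLaneElts);  // reads from src/src2
  }
  for (unsigned M : Mask)
    std::printf("%u ", M);            // prints: 0 4 1 5
  std::printf("\n");
  return 0;
}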
+// DecodeVPERMILPSMask - Decodes VPERMILPS permutes of 32-bit elements within
+// each 128-bit lane. A 256-bit vector is treated as two 128-bit lanes; the
+// referenced elements can't cross lanes, and the mask of the first lane must
+// be the same as that of the second.
+void DecodeVPERMILPSMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask) {
+ unsigned NumLanes = (NumElts*32)/128;
+ unsigned LaneSize = NumElts/NumLanes;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = 0; i != LaneSize; ++i) {
+ unsigned Idx = (Imm >> (i*2)) & 0x3 ;
+ ShuffleMask.push_back(Idx+(l*LaneSize));
+ }
+ }
+}
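
A worked example of the decode just defined: each two-bit field of the immediate selects one 32-bit element within its lane, so an example immediate of 0x1B (0b00011011) reverses a single 128-bit lane:

#include <cstdio>

int main() {
  unsigned Imm = 0x1B;  // example immediate, four elements in one lane
  for (unsigned i = 0; i != 4; ++i)
    std::printf("%u ", (Imm >> (i * 2)) & 0x3);  // prints: 3 2 1 0
  std::printf("\n");
  return 0;
}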
+
+// DecodeVPERMILPDMask - Decodes VPERMILPD permutes of 64-bit elements within
+// each 128-bit lane. A 256-bit vector is treated as two 128-bit lanes; the
+// referenced elements can't cross lanes, but the mask of the first lane may
+// differ from that of the second (unlike VPERMILPS).
+void DecodeVPERMILPDMask(unsigned NumElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask) {
+ unsigned NumLanes = (NumElts*64)/128;
+ unsigned LaneSize = NumElts/NumLanes;
+
+ for (unsigned l = 0; l < NumLanes; ++l) {
+ for (unsigned i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+ unsigned Idx = (Imm >> i) & 0x1;
+ ShuffleMask.push_back(Idx+(l*LaneSize));
+ }
+ }
+}
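
The PD variant uses one selector bit per 64-bit element, still per 128-bit lane. For v4f64 an example immediate of 0x5 swaps the element pair in both lanes:

#include <cstdio>

int main() {
  unsigned NumElts = 4, NumLanes = 2, LaneSize = NumElts / NumLanes;
  unsigned Imm = 0x5;  // example immediate
  for (unsigned l = 0; l != NumLanes; ++l)
    for (unsigned i = l * LaneSize; i != LaneSize * (l + 1); ++i)
      std::printf("%u ", ((Imm >> i) & 0x1) + l * LaneSize);
  std::printf("\n");  // prints: 1 0 3 2
  return 0;
}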
+
+void DecodeVPERM2F128Mask(EVT VT, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask) {
+ unsigned HalfSize = VT.getVectorNumElements()/2;
+ unsigned FstHalfBegin = (Imm & 0x3) * HalfSize;
+ unsigned SndHalfBegin = ((Imm >> 4) & 0x3) * HalfSize;
+
+ for (int i = FstHalfBegin, e = FstHalfBegin+HalfSize; i != e; ++i)
+ ShuffleMask.push_back(i);
+ for (int i = SndHalfBegin, e = SndHalfBegin+HalfSize; i != e; ++i)
+ ShuffleMask.push_back(i);
+}
+
+void DecodeVPERM2F128Mask(unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask) {
+ // VPERM2F128 is used by any 256-bit EVT, but X86InstComments only
+ // has information about the instruction and not the types. So for
+ // instruction comment purposes, assume the 256-bit vector is v4i64.
+ return DecodeVPERM2F128Mask(MVT::v4i64, Imm, ShuffleMask);
+}
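
For VPERM2F128, each nibble of the immediate selects a whole 128-bit half across both sources. On v4i64 (HalfSize of 2) an example immediate of 0x21 picks the high half of src1 and the low half of src2:

#include <cstdio>

int main() {
  unsigned HalfSize = 2;  // v4i64
  unsigned Imm = 0x21;    // example immediate
  unsigned Fst = (Imm & 0x3) * HalfSize;
  unsigned Snd = ((Imm >> 4) & 0x3) * HalfSize;
  for (unsigned i = Fst; i != Fst + HalfSize; ++i) std::printf("%u ", i);
  for (unsigned i = Snd; i != Snd + HalfSize; ++i) std::printf("%u ", i);
  std::printf("\n");  // prints: 2 3 4 5 (indices >= 4 name the second source)
  return 0;
}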
+
} // llvm namespace
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index b18f670..58193e6 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -82,6 +82,26 @@ void DecodeUNPCKLPDMask(unsigned NElts,
void DecodeUNPCKLPMask(EVT VT,
SmallVectorImpl<unsigned> &ShuffleMask);
+
+// DecodeVPERMILPSMask - Decodes VPERMILPS permutes of 32-bit elements within
+// each 128-bit lane. A 256-bit vector is treated as two 128-bit lanes; the
+// referenced elements can't cross lanes, and the mask of the first lane must
+// be the same as that of the second.
+void DecodeVPERMILPSMask(unsigned NElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask);
+
+// DecodeVPERMILPDMask - Decodes VPERMILPD permutes of 64-bit elements within
+// each 128-bit lane. A 256-bit vector is treated as two 128-bit lanes; the
+// referenced elements can't cross lanes, but the mask of the first lane may
+// differ from that of the second (unlike VPERMILPS).
+void DecodeVPERMILPDMask(unsigned NElts, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask);
+
+void DecodeVPERM2F128Mask(unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask);
+void DecodeVPERM2F128Mask(EVT VT, unsigned Imm,
+ SmallVectorImpl<unsigned> &ShuffleMask);
+
} // llvm namespace
#endif
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index ec52dfb..81e9422 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -15,6 +15,7 @@
#ifndef TARGET_X86_H
#define TARGET_X86_H
+#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Target/TargetMachine.h"
@@ -24,16 +25,8 @@ namespace llvm {
class FunctionPass;
class JITCodeEmitter;
class MachineCodeEmitter;
-class MCCodeEmitter;
-class MCContext;
-class MCInstrInfo;
-class MCObjectWriter;
-class MCSubtargetInfo;
class Target;
-class TargetAsmBackend;
class X86TargetMachine;
-class formatted_raw_ostream;
-class raw_ostream;
/// createX86ISelDag - This pass converts a legalized DAG into a
/// X86-specific DAG, ready for instruction scheduling.
@@ -51,22 +44,16 @@ FunctionPass* createGlobalBaseRegPass();
///
FunctionPass *createX86FloatingPointStackifierPass();
-/// createSSEDomainFixPass - This pass twiddles SSE opcodes to prevent domain
-/// crossings.
-FunctionPass *createSSEDomainFixPass();
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before each call to avoid the transition penalty between functions
+/// encoded with AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
/// to the specified MCE object.
FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
JITCodeEmitter &JCE);
-MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
-
-TargetAsmBackend *createX86_32AsmBackend(const Target &, const std::string &);
-TargetAsmBackend *createX86_64AsmBackend(const Target &, const std::string &);
-
/// createX86EmitCodeToMemory - Returns a pass that converts a register
/// allocated function into raw machine code in a dynamically
/// allocated chunk of memory.
@@ -79,13 +66,6 @@ FunctionPass *createEmitX86CodeToMemory();
///
FunctionPass *createX86MaxStackAlignmentHeuristicPass();
-
-/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
-MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
- bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
-
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 4ccb43f..104b91f 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
//
-// This is a target description file for the Intel i386 architecture, referred to
-// here as the "X86" architecture.
+// This is a target description file for the Intel i386 architecture, referred
+// to here as the "X86" architecture.
//
//===----------------------------------------------------------------------===//
@@ -23,6 +23,9 @@ include "llvm/Target/Target.td"
def Mode64Bit : SubtargetFeature<"64bit-mode", "In64BitMode", "true",
"64-bit mode (x86_64)">;
+def ModeNaCl : SubtargetFeature<"nacl-mode", "InNaClMode", "true",
+ "Native Client mode">;
+
//===----------------------------------------------------------------------===//
// X86 Subtarget features.
//===----------------------------------------------------------------------===//
@@ -68,6 +71,9 @@ def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
"Support 64-bit instructions",
[FeatureCMOV]>;
+def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+ "64-bit with cmpxchg16b",
+ [Feature64Bit]>;
def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
"Bit testing of memory is slow">;
def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
@@ -90,6 +96,16 @@ def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
"Allow unaligned memory operands on vector/SIMD instructions">;
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
"Enable AES instructions">;
+def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
+ "Support MOVBE instruction">;
+def FeatureRDRAND : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+ "Support RDRAND instruction">;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions">;
+def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
+ "Support LZCNT instruction">;
+def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
+ "Support BMI instructions">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -112,27 +128,43 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>;
def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
def : Proc<"pentium4", [FeatureSSE2]>;
def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
-def : Proc<"nocona", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"core2", [FeatureSSSE3, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"penryn", [FeatureSSE41, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"atom", [FeatureSSE3, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
+def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
+def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
+def : Proc<"atom", [FeatureSSE3, FeatureCMPXCHG16B, FeatureMOVBE,
+ FeatureSlowBTMem]>;
// "Arrandale" along with corei3 and corei5
-def : Proc<"corei7", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
- FeatureFastUAMem, FeatureAES]>;
-def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
- FeatureFastUAMem]>;
+def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureFastUAMem, FeatureAES]>;
+def : Proc<"nehalem", [FeatureSSE42, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureFastUAMem]>;
// Westmere is a similar machine to nehalem with some additional features.
// Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
- FeatureFastUAMem, FeatureAES, FeatureCLMUL]>;
+def : Proc<"westmere", [FeatureSSE42, FeatureCMPXCHG16B,
+ FeatureSlowBTMem, FeatureFastUAMem, FeatureAES,
+ FeatureCLMUL]>;
+// Sandy Bridge
// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
// rather than a superset.
// FIXME: Disabling AVX for now since it's not ready.
-def : Proc<"corei7-avx", [FeatureSSE42, Feature64Bit,
+def : Proc<"corei7-avx", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureAES, FeatureCLMUL]>;
+// Ivy Bridge
+def : Proc<"core-avx-i", [FeatureSSE42, FeatureCMPXCHG16B,
+ FeatureAES, FeatureCLMUL,
+ FeatureRDRAND, FeatureF16C]>;
+
+// Haswell
+def : Proc<"core-avx2", [FeatureSSE42, FeatureCMPXCHG16B, FeatureAES,
+ FeatureCLMUL, FeatureRDRAND, FeatureF16C,
+ FeatureFMA3, FeatureMOVBE, FeatureLZCNT,
+ FeatureBMI]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
@@ -150,19 +182,21 @@ def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
FeatureSlowBTMem]>;
def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
FeatureSlowBTMem]>;
-def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
-def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
-def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, Feature64Bit,
+def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"amdfam10", [FeatureSSE3, FeatureSSE4A,
- Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+ Feature3DNowA, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
def : Proc<"barcelona", [FeatureSSE3, FeatureSSE4A,
- Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
-def : Proc<"istanbul", [Feature3DNowA, Feature64Bit, FeatureSSE4A,
- Feature3DNowA]>;
-def : Proc<"shanghai", [Feature3DNowA, Feature64Bit, FeatureSSE4A,
+ Feature3DNowA, FeatureCMPXCHG16B,
+ FeatureSlowBTMem]>;
+def : Proc<"istanbul", [Feature3DNowA, FeatureCMPXCHG16B,
+ FeatureSSE4A, Feature3DNowA]>;
+def : Proc<"shanghai", [Feature3DNowA, FeatureCMPXCHG16B, FeatureSSE4A,
Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 99b4479..4c3ff02 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -35,12 +35,12 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
using namespace llvm;
@@ -504,8 +504,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// .indirect_symbol _foo
OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(),
MCSA_IndirectSymbol);
- // hlt; hlt; hlt; hlt; hlt hlt = 0xf4 = -12.
- const char HltInsts[] = { -12, -12, -12, -12, -12 };
+ // hlt; hlt; hlt; hlt; hlt (hlt = 0xf4).
+ const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/);
}
@@ -708,21 +708,8 @@ void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
// Target Registry Stuff
//===----------------------------------------------------------------------===//
-static MCInstPrinter *createX86MCInstPrinter(const Target &T,
- unsigned SyntaxVariant,
- const MCAsmInfo &MAI) {
- if (SyntaxVariant == 0)
- return new X86ATTInstPrinter(MAI);
- if (SyntaxVariant == 1)
- return new X86IntelInstPrinter(MAI);
- return 0;
-}
-
// Force static initialization.
extern "C" void LLVMInitializeX86AsmPrinter() {
RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
-
- TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,createX86MCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,createX86MCInstPrinter);
}
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 4b11db7..aeff03a 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -98,8 +98,6 @@ namespace {
void emitMemModRMByte(const MachineInstr &MI,
unsigned Op, unsigned RegOpcodeField,
intptr_t PCAdj = 0);
-
- unsigned getX86RegNum(unsigned RegNo) const;
};
template<class CodeEmitter>
@@ -169,7 +167,7 @@ static unsigned determineREX(const MachineInstr &MI) {
const MachineOperand& MO = MI.getOperand(i);
if (MO.isReg()) {
unsigned Reg = MO.getReg();
- if (X86InstrInfo::isX86_64NonExtLowByteReg(Reg))
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
REX |= 0x40;
}
}
@@ -346,11 +344,6 @@ void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
MCE.emitWordLE(0);
}
-template<class CodeEmitter>
-unsigned Emitter<CodeEmitter>::getX86RegNum(unsigned RegNo) const {
- return X86RegisterInfo::getX86RegNum(RegNo);
-}
-
inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
unsigned RM) {
assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
@@ -360,7 +353,7 @@ inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
unsigned RegOpcodeFld){
- MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, X86_MC::getX86RegNum(ModRMReg)));
}
template<class CodeEmitter>
@@ -498,7 +491,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
// 2-7) and absolute references.
unsigned BaseRegNo = -1U;
if (BaseReg != 0 && BaseReg != X86::RIP)
- BaseRegNo = getX86RegNum(BaseReg);
+ BaseRegNo = X86_MC::getX86RegNum(BaseReg);
if (// The SIB byte must be used if there is an index register.
IndexReg.getReg() == 0 &&
@@ -566,7 +559,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
}
// Calculate what the SS field value should be...
- static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+ static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
unsigned SS = SSTable[Scale.getImm()];
if (BaseReg == 0) {
@@ -574,15 +567,15 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
// Manual 2A, table 2-7. The displacement has already been output.
unsigned IndexRegNo;
if (IndexReg.getReg())
- IndexRegNo = getX86RegNum(IndexReg.getReg());
+ IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
IndexRegNo = 4;
emitSIBByte(SS, IndexRegNo, 5);
} else {
- unsigned BaseRegNo = getX86RegNum(BaseReg);
+ unsigned BaseRegNo = X86_MC::getX86RegNum(BaseReg);
unsigned IndexRegNo;
if (IndexReg.getReg())
- IndexRegNo = getX86RegNum(IndexReg.getReg());
+ IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
else
IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
emitSIBByte(SS, IndexRegNo, BaseRegNo);
@@ -809,7 +802,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
}
case X86II::AddRegFrm: {
- MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+ MCE.emitByte(BaseOpcode +
+ X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg()));
if (CurOp == NumOps)
break;
@@ -844,7 +838,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRMDestReg: {
MCE.emitByte(BaseOpcode);
emitRegModRMByte(MI.getOperand(CurOp).getReg(),
- getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+ X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg()));
CurOp += 2;
if (CurOp != NumOps)
emitConstant(MI.getOperand(CurOp++).getImm(),
@@ -854,7 +848,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRMDestMem: {
MCE.emitByte(BaseOpcode);
emitMemModRMByte(MI, CurOp,
- getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)
+ X86_MC::getX86RegNum(MI.getOperand(CurOp + X86::AddrNumOperands)
.getReg()));
CurOp += X86::AddrNumOperands + 1;
if (CurOp != NumOps)
@@ -866,7 +860,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRMSrcReg:
MCE.emitByte(BaseOpcode);
emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
- getX86RegNum(MI.getOperand(CurOp).getReg()));
+ X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
CurOp += 2;
if (CurOp != NumOps)
emitConstant(MI.getOperand(CurOp++).getImm(),
@@ -880,8 +874,8 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
X86II::getSizeOfImm(Desc->TSFlags) : 0;
MCE.emitByte(BaseOpcode);
- emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
- PCAdj);
+ emitMemModRMByte(MI, CurOp+1,
+ X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
CurOp += AddrOperands + 1;
if (CurOp != NumOps)
emitConstant(MI.getOperand(CurOp++).getImm(),
@@ -968,7 +962,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
MCE.emitByte(BaseOpcode);
// Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
emitRegModRMByte(MI.getOperand(CurOp).getReg(),
- getX86RegNum(MI.getOperand(CurOp).getReg()));
+ X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
++CurOp;
break;
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
index f1d7ede..4a72d15 100644
--- a/lib/Target/X86/X86ELFWriterInfo.cpp
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -147,7 +147,7 @@ long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32)
return SymOffset - (RelOffset + 4);
else
- assert("computeRelocation unknown for this relocation type");
+ assert(0 && "computeRelocation unknown for this relocation type");
return 0;
}
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 21e163a..f912b28 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -22,6 +22,7 @@
#include "llvm/CallingConv.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Operator.h"
@@ -59,8 +60,8 @@ public:
explicit X86FastISel(FunctionLoweringInfo &funcInfo) : FastISel(funcInfo) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
- X86ScalarSSEf64 = Subtarget->hasSSE2();
- X86ScalarSSEf32 = Subtarget->hasSSE1();
+ X86ScalarSSEf64 = Subtarget->hasSSE2() || Subtarget->hasAVX();
+ X86ScalarSSEf32 = Subtarget->hasSSE1() || Subtarget->hasAVX();
}
virtual bool TargetSelectInstruction(const Instruction *I);
@@ -134,7 +135,7 @@ private:
(VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1
}
- bool isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1 = false);
+ bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
bool IsMemcpySmall(uint64_t Len);
@@ -144,7 +145,7 @@ private:
} // end anonymous namespace.
-bool X86FastISel::isTypeLegal(const Type *Ty, MVT &VT, bool AllowI1) {
+bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true);
if (evt == MVT::Other || !evt.isSimple())
// Unhandled type. Halt "fast" selection and bail.
@@ -198,8 +199,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
RC = X86::GR64RegisterClass;
break;
case MVT::f32:
- if (Subtarget->hasSSE1()) {
- Opc = X86::MOVSSrm;
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
RC = X86::FR32RegisterClass;
} else {
Opc = X86::LD_Fp32m;
@@ -207,8 +208,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
}
break;
case MVT::f64:
- if (Subtarget->hasSSE2()) {
- Opc = X86::MOVSDrm;
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
RC = X86::FR64RegisterClass;
} else {
Opc = X86::LD_Fp64m;
@@ -250,10 +251,12 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
case MVT::i32: Opc = X86::MOV32mr; break;
case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
case MVT::f32:
- Opc = Subtarget->hasSSE1() ? X86::MOVSSmr : X86::ST_Fp32m;
+ Opc = X86ScalarSSEf32 ?
+ (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
break;
case MVT::f64:
- Opc = Subtarget->hasSSE2() ? X86::MOVSDmr : X86::ST_Fp64m;
+ Opc = X86ScalarSSEf64 ?
+ (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
break;
}
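
The recurring pattern in these X86FastISel hunks is a three-way pick: the VEX-encoded AVX opcode when AVX is enabled, otherwise the plain SSE opcode, otherwise the x87 fallback. A hedged helper capturing the shape (the opcode parameters stand in for enumerators like X86::VMOVSSmr):

// Pick a scalar FP opcode: AVX form if available, SSE form if the required
// scalar-SSE level is met, otherwise the x87 fallback.
unsigned pickFPOpcode(bool HasScalarSSE, bool HasAVX,
                      unsigned AVXOpc, unsigned SSEOpc, unsigned X87Opc) {
  return HasScalarSSE ? (HasAVX ? AVXOpc : SSEOpc) : X87Opc;
}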
@@ -336,7 +339,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
U = C;
}
- if (const PointerType *Ty = dyn_cast<PointerType>(V->getType()))
+ if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
if (Ty->getAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
@@ -399,7 +402,7 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
i != e; ++i, ++GTI) {
const Value *Op = *i;
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
const StructLayout *SL = TD.getStructLayout(STy);
Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
continue;
@@ -465,14 +468,23 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
// Handle constant address.
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
- // Can't handle alternate code models or TLS yet.
+ // Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
return false;
+ // Can't handle TLS yet.
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
if (GVar->isThreadLocal())
return false;
+ // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
+ // it works...).
+ if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+ if (const GlobalVariable *GVar =
+ dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+ if (GVar->isThreadLocal())
+ return false;
+
// RIP-relative addresses can't have additional register operands, so if
// we've already folded stuff into the addressing mode, just force the
// global value into its own register, which we can use as the basereg.
@@ -658,6 +670,10 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
/// X86SelectStore - Select and emit code to implement store instructions.
bool X86FastISel::X86SelectStore(const Instruction *I) {
+ // Atomic stores need special handling.
+ if (cast<StoreInst>(I)->isAtomic())
+ return false;
+
MVT VT;
if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
return false;
@@ -780,6 +796,10 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
/// X86SelectLoad - Select and emit code to implement load instructions.
///
bool X86FastISel::X86SelectLoad(const Instruction *I) {
+ // Atomic loads need special handling.
+ if (cast<LoadInst>(I)->isAtomic())
+ return false;
+
MVT VT;
if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
return false;
@@ -797,14 +817,20 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
}
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+ bool HasAVX = Subtarget->hasAVX();
+ bool X86ScalarSSEf32 = HasAVX || Subtarget->hasSSE1();
+ bool X86ScalarSSEf64 = HasAVX || Subtarget->hasSSE2();
+
switch (VT.getSimpleVT().SimpleTy) {
default: return 0;
case MVT::i8: return X86::CMP8rr;
case MVT::i16: return X86::CMP16rr;
case MVT::i32: return X86::CMP32rr;
case MVT::i64: return X86::CMP64rr;
- case MVT::f32: return Subtarget->hasSSE1() ? X86::UCOMISSrr : 0;
- case MVT::f64: return Subtarget->hasSSE2() ? X86::UCOMISDrr : 0;
+ case MVT::f32:
+ return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+ case MVT::f64:
+ return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
}
}
@@ -1207,7 +1233,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
// fpext from float to double.
- if (Subtarget->hasSSE2() &&
+ if (X86ScalarSSEf64 &&
I->getType()->isDoubleTy()) {
const Value *V = I->getOperand(0);
if (V->getType()->isFloatTy()) {
@@ -1226,7 +1252,7 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
}
bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
- if (Subtarget->hasSSE2()) {
+ if (X86ScalarSSEf64) {
if (I->getType()->isFloatTy()) {
const Value *V = I->getOperand(0);
if (V->getType()->isDoubleTy()) {
@@ -1365,6 +1391,9 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
case Intrinsic::memset: {
const MemSetInst &MSI = cast<MemSetInst>(I);
+ if (MSI.isVolatile())
+ return false;
+
unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
if (!MSI.getLength()->getType()->isIntegerTy(SizeWidth))
return false;
@@ -1411,7 +1440,7 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
// Replace "add with overflow" intrinsics with an "add" instruction followed
// by a seto/setc instruction.
const Function *Callee = I.getCalledFunction();
- const Type *RetTy =
+ Type *RetTy =
cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));
MVT VT;
@@ -1484,8 +1513,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (CC == CallingConv::Fast && GuaranteedTailCallOpt)
return false;
- const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+ PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+ FunctionType *FTy = cast<FunctionType>(PT->getElementType());
bool isVarArg = FTy->isVarArg();
// Don't know how to handle Win64 varargs yet. Nothing special needed for
@@ -1547,8 +1576,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
Flags.setZExt();
if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
- const PointerType *Ty = cast<PointerType>(ArgVal->getType());
- const Type *ElementTy = Ty->getElementType();
+ PointerType *Ty = cast<PointerType>(ArgVal->getType());
+ Type *ElementTy = Ty->getElementType();
unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
unsigned FrameAlign = CS.getParamAlignment(AttrInd);
if (!FrameAlign)
@@ -1600,7 +1629,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
if (ArgReg == 0) return false;
- const Type *ArgTy = ArgVal->getType();
+ Type *ArgTy = ArgVal->getType();
MVT ArgVT;
if (!isTypeLegal(ArgTy, ArgVT))
return false;
@@ -1709,7 +1738,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
assert(Res && "memcpy length already checked!"); (void)Res;
} else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
// If this is a really simple value, emit this with the Value* version
- //of X86FastEmitStore. If it isn't simple, we don't want to do this,
+ // of X86FastEmitStore. If it isn't simple, we don't want to do this,
// as it can cause us to reevaluate the argument.
X86FastEmitStore(ArgVT, ArgVal, AM);
} else {
@@ -1965,8 +1994,8 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
RC = X86::GR64RegisterClass;
break;
case MVT::f32:
- if (Subtarget->hasSSE1()) {
- Opc = X86::MOVSSrm;
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
RC = X86::FR32RegisterClass;
} else {
Opc = X86::LD_Fp32m;
@@ -1974,8 +2003,8 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
}
break;
case MVT::f64:
- if (Subtarget->hasSSE2()) {
- Opc = X86::MOVSDrm;
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
RC = X86::FR64RegisterClass;
} else {
Opc = X86::LD_Fp64m;
@@ -2070,8 +2099,8 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
switch (VT.SimpleTy) {
default: return false;
case MVT::f32:
- if (Subtarget->hasSSE1()) {
- Opc = X86::FsFLD0SS;
+ if (X86ScalarSSEf32) {
+ Opc = Subtarget->hasAVX() ? X86::VFsFLD0SS : X86::FsFLD0SS;
RC = X86::FR32RegisterClass;
} else {
Opc = X86::LD_Fp032;
@@ -2079,8 +2108,8 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
}
break;
case MVT::f64:
- if (Subtarget->hasSSE2()) {
- Opc = X86::FsFLD0SD;
+ if (X86ScalarSSEf64) {
+ Opc = Subtarget->hasAVX() ? X86::VFsFLD0SD : X86::FsFLD0SD;
RC = X86::FR64RegisterClass;
} else {
Opc = X86::LD_Fp064;
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 6eed6abd..e3461c8 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -260,6 +260,21 @@ namespace {
BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
}
+ /// duplicatePendingSTBeforeKill - The instruction at I is about to kill
+ /// RegNo. If any PendingST registers still need the RegNo value, duplicate
+ /// them to new scratch registers.
+ void duplicatePendingSTBeforeKill(unsigned RegNo, MachineInstr *I) {
+ for (unsigned i = 0; i != NumPendingSTs; ++i) {
+ if (PendingST[i] != RegNo)
+ continue;
+ unsigned SR = getScratchReg();
+ DEBUG(dbgs() << "Duplicating pending ST" << i
+ << " in FP" << RegNo << " to FP" << SR << '\n');
+ duplicateToTop(RegNo, SR, I);
+ PendingST[i] = SR;
+ }
+ }
+
/// popStackAfter - Pop the current value off of the top of the FP stack
/// after the specified instruction.
void popStackAfter(MachineBasicBlock::iterator &I);
@@ -406,6 +421,10 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
if (MI->isCopy() && isFPCopy(MI))
FPInstClass = X86II::SpecialFP;
+ if (MI->isImplicitDef() &&
+ X86::RFP80RegClass.contains(MI->getOperand(0).getReg()))
+ FPInstClass = X86II::SpecialFP;
+
if (FPInstClass == X86II::NotFP)
continue; // Efficiently ignore non-fp insts!
@@ -461,6 +480,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
}
dumpStack();
);
+ (void)PrevMI;
Changed = true;
}
@@ -969,6 +989,9 @@ void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+ if (KillsSrc)
+ duplicatePendingSTBeforeKill(Reg, I);
+
// FISTP64m is strange because there isn't a non-popping version.
// If we have one _and_ we don't want to pop the operand, duplicate the value
// on the stack instead of moving it. This ensures that popping the value is
@@ -1032,6 +1055,7 @@ void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
if (KillsSrc) {
+ duplicatePendingSTBeforeKill(Reg, I);
// If this is the last use of the source register, just make sure it's on
// the top of the stack.
moveToTop(Reg, I);
@@ -1318,6 +1342,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
// When the source is killed, allocate a scratch FP register.
if (KillsSrc) {
+ duplicatePendingSTBeforeKill(SrcFP, I);
unsigned Slot = getSlot(SrcFP);
unsigned SR = getScratchReg();
PendingST[DstST] = SR;
@@ -1369,6 +1394,15 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
break;
}
+ case TargetOpcode::IMPLICIT_DEF: {
+ // All FP registers must be explicitly defined, so load a 0 instead.
+ unsigned Reg = MI->getOperand(0).getReg() - X86::FP0;
+ DEBUG(dbgs() << "Emitting LD_F0 for implicit FP" << Reg << '\n');
+ BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::LD_F0));
+ pushReg(Reg);
+ break;
+ }
+
case X86::FpPOP_RETVAL: {
// The FpPOP_RETVAL instruction is used after calls that return a value on
// the floating point stack. We cannot model this with ST defs since CALL
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index ed45a9a..d54f4ae 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -15,6 +15,7 @@
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -91,12 +92,12 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
return 0;
static const unsigned CallerSavedRegs32Bit[] = {
- X86::EAX, X86::EDX, X86::ECX
+ X86::EAX, X86::EDX, X86::ECX, 0
};
static const unsigned CallerSavedRegs64Bit[] = {
X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
- X86::R8, X86::R9, X86::R10, X86::R11
+ X86::R8, X86::R9, X86::R10, X86::R11, 0
};
unsigned Opc = MBBI->getOpcode();
@@ -283,8 +284,8 @@ static bool isEAXLiveIn(MachineFunction &MF) {
}
void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
- MCSymbol *Label,
- unsigned FramePtr) const {
+ MCSymbol *Label,
+ unsigned FramePtr) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
@@ -346,6 +347,247 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
}
}
+/// getCompactUnwindRegNum - Get the compact unwind number for a given
+/// register. The number corresponds to the enum lists in
+/// compact_unwind_encoding.h.
+static int getCompactUnwindRegNum(const unsigned *CURegs, unsigned Reg) {
+ int Idx = 1;
+ for (; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+}
+
+/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding
+/// used with frameless stacks. It is passed the number of registers to be saved
+/// and an array of the registers saved.
+static uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[6],
+ unsigned RegCount,
+ bool Is64Bit) {
+ // The saved registers are numbered from 1 to 6. In order to encode the order
+ // in which they were saved, we re-number them according to their place in the
+ // register order. The re-numbering is relative to the last re-numbered
+ // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ static const unsigned CU32BitRegs[] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const unsigned CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+
+ uint32_t RenumRegs[6];
+ for (unsigned i = 6 - RegCount; i < 6; ++i) {
+ int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+
+ unsigned Countless = 0;
+ for (unsigned j = 6 - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+}
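A minimal standalone sketch of the renumber-and-pack scheme above for the
three-register case (illustrative only: it stores the registers 0-based
instead of in the tail of a six-entry array, and uses plain integers in place
of the X86 register enums):

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  // Renumber each saved register relative to the smaller ones already seen,
  // then pack the result into a 10-bit factorial-base number, mirroring the
  // RegCount == 3 case of encodeCompactUnwindRegistersWithoutFrame.
  static uint32_t encodePermutation3(const uint32_t Regs[3]) {
    uint32_t Renum[3];
    for (unsigned i = 0; i != 3; ++i) {
      unsigned Smaller = 0;
      for (unsigned j = 0; j != i; ++j)
        if (Regs[j] < Regs[i])
          ++Smaller;
      Renum[i] = Regs[i] - Smaller - 1;
    }
    return 20 * Renum[0] + 4 * Renum[1] + Renum[2];
  }

  int main() {
    const uint32_t Saved[3] = {6, 2, 4}; // compact-unwind numbers, save order
    uint32_t Enc = encodePermutation3(Saved);
    assert((Enc & 0x3FF) == Enc && "must fit in 10 bits");
    printf("permutation encoding: %u\n", Enc); // prints 106
  }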
+
+/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a
+/// compact encoding with a frame pointer.
+static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[6],
+ bool Is64Bit) {
+ static const unsigned CU32BitRegs[] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const unsigned CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const unsigned *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
+
+ // Encode the registers in the order they were saved, 3-bits per register. The
+ // registers are numbered from 1 to 6.
+ uint32_t RegEnc = 0;
+ for (int I = 5; I >= 0; --I) {
+ unsigned Reg = SavedRegs[I];
+ if (Reg == 0) break;
+ int CURegNum = getCompactUnwindRegNum(CURegs, Reg);
+ if (CURegNum == -1)
+ return ~0U;
+ RegEnc |= (CURegNum & 0x7) << (5 - I);
+ }
+
+ assert((RegEnc & 0x7FFF) == RegEnc && "Invalid compact register encoding!");
+ return RegEnc;
+}
+
+uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
+ unsigned FramePtr = RegInfo->getFrameRegister(MF);
+ unsigned StackPtr = RegInfo->getStackRegister();
+
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+ bool Is64Bit = STI.is64Bit();
+ bool HasFP = hasFP(MF);
+
+ unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 };
+ int SavedRegIdx = 6;
+
+ unsigned OffsetSize = (Is64Bit ? 8 : 4);
+
+ unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
+ unsigned PushInstrSize = 1;
+ unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
+ unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
+ unsigned SubtractInstr = getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta);
+ unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
+
+ unsigned StackDivide = (Is64Bit ? 8 : 4);
+
+ unsigned InstrOffset = 0;
+ unsigned CFAOffset = 0;
+ unsigned StackAdjust = 0;
+
+ MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
+ bool ExpectEnd = false;
+ for (MachineBasicBlock::iterator
+ MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned Opc = MI.getOpcode();
+ if (Opc == X86::PROLOG_LABEL) continue;
+ if (!MI.getFlag(MachineInstr::FrameSetup)) break;
+
+ // We don't expect any more prologue instructions.
+ if (ExpectEnd) return 0;
+
+ if (Opc == PushInstr) {
+ // If there are too many saved registers, we cannot use compact encoding.
+ if (--SavedRegIdx < 0) return 0;
+
+ SavedRegs[SavedRegIdx] = MI.getOperand(0).getReg();
+ CFAOffset += OffsetSize;
+ InstrOffset += PushInstrSize;
+ } else if (Opc == MoveInstr) {
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+
+ if (DstReg != FramePtr || SrcReg != StackPtr)
+ return 0;
+
+ CFAOffset = 0;
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ InstrOffset += MoveInstrSize;
+ } else if (Opc == SubtractInstr) {
+ if (StackAdjust)
+ // We already have a stack pointer adjustment.
+ return 0;
+
+ if (!MI.getOperand(0).isReg() ||
+ MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
+ MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
+ // We need this to be a stack pointer adjustment. Something like:
+ //
+ // %RSP<def> = SUB64ri8 %RSP, 48
+ return 0;
+
+ StackAdjust = MI.getOperand(2).getImm() / StackDivide;
+ SubtractInstrIdx += InstrOffset;
+ ExpectEnd = true;
+ }
+ }
+
+ // Encode that we are using EBP/RBP as the frame pointer.
+ uint32_t CompactUnwindEncoding = 0;
+ CFAOffset /= StackDivide;
+ if (HasFP) {
+ if ((CFAOffset & 0xFF) != CFAOffset)
+ // Offset was too big for compact encoding.
+ return 0;
+
+ // Get the encoding of the saved registers when we have a frame pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
+ if (RegEnc == ~0U)
+ return 0;
+
+ CompactUnwindEncoding |= 0x01000000;
+ CompactUnwindEncoding |= (CFAOffset & 0xFF) << 16;
+ CompactUnwindEncoding |= RegEnc & 0x7FFF;
+ } else {
+ unsigned FullOffset = CFAOffset + StackAdjust;
+ if ((FullOffset & 0xFF) == FullOffset) {
+ // Frameless stack.
+ CompactUnwindEncoding |= 0x02000000;
+ CompactUnwindEncoding |= (FullOffset & 0xFF) << 16;
+ } else {
+ if ((CFAOffset & 0x7) != CFAOffset)
+ // The extra stack adjustments are too big for us to handle.
+ return 0;
+
+ // Frameless stack with an offset too large for us to encode compactly.
+ CompactUnwindEncoding |= 0x03000000;
+
+ // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+ // instruction.
+ CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+ // Encode any extra stack changes (done via push instructions).
+ CompactUnwindEncoding |= (CFAOffset & 0x7) << 13;
+ }
+
+ // Get the encoding of the saved registers when we don't have a frame
+ // pointer.
+ uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegs,
+ 6 - SavedRegIdx,
+ Is64Bit);
+ if (RegEnc == ~0U) return 0;
+ CompactUnwindEncoding |= RegEnc & 0x3FF;
+ }
+
+ return CompactUnwindEncoding;
+}
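As a worked example of the bit layout this function produces (a sketch; the
mode bits and field positions are read off the code above, the field values
are made up):

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Frame-based function, CFA offset of 6 stack slots, register field 0x123:
    uint32_t Enc = 0;
    Enc |= 0x01000000;        // EBP/RBP frame mode
    Enc |= (6 & 0xFF) << 16;  // stack-slot-scaled CFA offset
    Enc |= 0x123 & 0x7FFF;    // saved-register encoding
    printf("0x%08x\n", Enc);  // prints 0x01060123
  }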
+
/// emitPrologue - Push callee-saved registers onto the stack, which
/// automatically adjusts the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
@@ -370,7 +612,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned SlotSize = RegInfo->getSlotSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
unsigned StackPtr = RegInfo->getStackRegister();
-
DebugLoc DL;
// If we're forcing a stack realignment we can't rely on just the frame
@@ -398,7 +639,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
!RegInfo->needsStackRealignment(MF) &&
!MFI->hasVarSizedObjects() && // No dynamic alloca.
!MFI->adjustsStack() && // No calls.
- !IsWin64) { // Win64 has no Red Zone
+ !IsWin64 && // Win64 has no Red Zone
+ !EnableSegmentedStacks) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
@@ -459,7 +701,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (needsFrameMoves) {
// Mark the place where EBP/RBP was saved.
MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(FrameLabel);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+ .addSym(FrameLabel);
// Define the current CFA rule to use the provided offset.
if (StackSize) {
@@ -478,7 +721,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
}
- // Update EBP with the new base value...
+ // Update EBP with the new base value.
BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
.addReg(StackPtr)
@@ -487,7 +730,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (needsFrameMoves) {
// Mark effective beginning of when frame pointer becomes valid.
MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(FrameLabel);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+ .addSym(FrameLabel);
// Define the current CFA to use the EBP/RBP register.
MachineLocation FPDst(FramePtr);
@@ -504,8 +748,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (RegInfo->needsStackRealignment(MF)) {
MachineInstr *MI =
BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
- StackPtr).addReg(StackPtr).addImm(-MaxAlign);
+ TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri), StackPtr)
+ .addReg(StackPtr)
+ .addImm(-MaxAlign)
+ .setMIFlag(MachineInstr::FrameSetup);
// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();
@@ -522,6 +768,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
PushedRegs = true;
+ MBBI->setFlag(MachineInstr::FrameSetup);
++MBBI;
if (!HasFP && needsFrameMoves) {
@@ -530,8 +777,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
// Define the current CFA rule to use the provided offset.
- unsigned Ptr = StackSize ?
- MachineLocation::VirtualFP : StackPtr;
+ unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr;
MachineLocation SPDst(Ptr);
MachineLocation SPSrc(Ptr, StackOffset);
Moves.push_back(MachineMove(Label, SPDst, SPSrc));
@@ -586,26 +832,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// Save EAX
BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
- .addReg(X86::EAX, RegState::Kill);
+ .addReg(X86::EAX, RegState::Kill)
+ .setMIFlag(MachineInstr::FrameSetup);
}
if (Is64Bit) {
// Handle the 64-bit Windows ABI case where we need to call __chkstk.
// Function prologue is responsible for adjusting the stack pointer.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
- .addImm(NumBytes);
+ .addImm(NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
} else {
// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
// We'll also use 4 already allocated bytes for EAX.
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
- .addImm(isEAXAlive ? NumBytes - 4 : NumBytes);
+ .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
}
BuildMI(MBB, MBBI, DL,
TII.get(Is64Bit ? X86::W64ALLOCA : X86::CALLpcrel32))
.addExternalSymbol(StackProbeSymbol)
.addReg(StackPtr, RegState::Define | RegState::Implicit)
- .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
+ .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
+ .setMIFlag(MachineInstr::FrameSetup);
// MSVC x64's __chkstk needs to adjust %rsp.
// FIXME: %rax preserves the offset and should be available.
@@ -618,6 +868,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
X86::EAX),
StackPtr, false, NumBytes - 4);
+ MI->setFlag(MachineInstr::FrameSetup);
MBB.insert(MBBI, MI);
}
} else if (NumBytes)
@@ -627,7 +878,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (( (!HasFP && NumBytes) || PushedRegs) && needsFrameMoves) {
// Mark end of stack pointer adjustment.
MCSymbol *Label = MMI.getContext().CreateTempSymbol();
- BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL))
+ .addSym(Label);
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
@@ -647,6 +899,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (PushedRegs)
emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
}
+
+ // Darwin 10.7 and greater has support for compact unwind encoding.
+ if (STI.getTargetTriple().isMacOSX() &&
+ !STI.getTargetTriple().isMacOSXVersionLT(10, 7))
+ MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF));
}
void X86FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -844,23 +1101,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-void
-X86FrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves) const {
- // Calculate amount of bytes used for return address storing
- int stackGrowth = (STI.is64Bit() ? -8 : -4);
- const X86RegisterInfo *RI = TM.getRegisterInfo();
-
- // Initial state of the frame pointer is esp+stackGrowth.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(RI->getStackRegister(), stackGrowth);
- Moves.push_back(MachineMove(0, Dst, Src));
-
- // Add return address to move list
- MachineLocation CSDst(RI->getStackRegister(), stackGrowth);
- MachineLocation CSSrc(RI->getRARegister());
- Moves.push_back(MachineMove(0, CSDst, CSSrc));
-}
-
int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
const X86RegisterInfo *RI =
static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
@@ -873,9 +1113,7 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) con
// Skip the saved EBP.
Offset += RI->getSlotSize();
} else {
- unsigned Align = MFI->getObjectAlignment(FI);
- assert((-(Offset + StackSize)) % Align == 0);
- Align = 0;
+ assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
return Offset + StackSize;
}
// FIXME: Support tail calls
@@ -1027,184 +1265,183 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
true);
assert(FrameIdx == MFI->getObjectIndexBegin() &&
"Slot for EBP register must be last in order to be found!");
- FrameIdx = 0;
+ (void)FrameIdx;
}
}
-/// permuteEncode - Create the permutation encoding used with frameless
-/// stacks. It is passed the number of registers to be saved and an array of the
-/// registers saved.
-static uint32_t permuteEncode(unsigned SavedCount, unsigned Registers[6]) {
- // The saved registers are numbered from 1 to 6. In order to encode the order
- // in which they were saved, we re-number them according to their place in the
- // register order. The re-numbering is relative to the last re-numbered
- // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
- //
- // Orig Re-Num
- // ---- ------
- // 6 6
- // 2 2
- // 4 3
- // 5 3
- //
- bool Used[7] = { false, false, false, false, false, false, false };
- uint32_t RenumRegs[6];
- for (unsigned I = 0; I < SavedCount; ++I) {
- uint32_t Renum = 0;
- for (unsigned U = 1; U < 7; ++U) {
- if (U == Registers[I])
- break;
- if (!Used[U])
- ++Renum;
- }
-
- Used[Registers[I]] = true;
- RenumRegs[I] = Renum;
+static bool
+HasNestArgument(const MachineFunction *MF) {
+ const Function *F = MF->getFunction();
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; I++) {
+ if (I->hasNestAttr())
+ return true;
}
+ return false;
+}
- // Take the renumbered values and encode them into a 10-bit number.
- uint32_t permutationEncoding = 0;
- switch (SavedCount) {
- case 6:
- permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
- + 6 * RenumRegs[2] + 2 * RenumRegs[3]
- + RenumRegs[4];
- break;
- case 5:
- permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
- + 6 * RenumRegs[2] + 2 * RenumRegs[3]
- + RenumRegs[4];
- break;
- case 4:
- permutationEncoding |= 60 * RenumRegs[0] + 12 * RenumRegs[1]
- + 3 * RenumRegs[2] + RenumRegs[3];
- break;
- case 3:
- permutationEncoding |= 20 * RenumRegs[0] + 4 * RenumRegs[1]
- + RenumRegs[2];
- break;
- case 2:
- permutationEncoding |= 5 * RenumRegs[0] + RenumRegs[1];
- break;
- case 1:
- permutationEncoding |= RenumRegs[0];
- break;
+static unsigned
+GetScratchRegister(bool Is64Bit, const MachineFunction &MF) {
+ if (Is64Bit) {
+ return X86::R11;
+ } else {
+ CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+ bool IsNested = HasNestArgument(&MF);
+
+ if (CallingConvention == CallingConv::X86_FastCall) {
+ if (IsNested) {
+ report_fatal_error("Segmented stacks does not support fastcall with "
+ "nested function.");
+ return -1;
+ } else {
+ return X86::EAX;
+ }
+ } else {
+ if (IsNested)
+ return X86::EDX;
+ else
+ return X86::ECX;
+ }
}
-
- return permutationEncoding;
}
-uint32_t X86FrameLowering::
-getCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs,
- int DataAlignmentFactor, bool IsEH) const {
- uint32_t Encoding = 0;
- int CFAOffset = 0;
- const TargetRegisterInfo *TRI = TM.getRegisterInfo();
- unsigned SavedRegs[6] = { 0, 0, 0, 0, 0, 0 };
- unsigned SavedRegIdx = 0;
- int FramePointerReg = -1;
+void
+X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
+ MachineBasicBlock &prologueMBB = MF.front();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ uint64_t StackSize;
+ bool Is64Bit = STI.is64Bit();
+ unsigned TlsReg, TlsOffset;
+ DebugLoc DL;
+ const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>();
- for (ArrayRef<MCCFIInstruction>::const_iterator
- I = Instrs.begin(), E = Instrs.end(); I != E; ++I) {
- const MCCFIInstruction &Inst = *I;
- MCSymbol *Label = Inst.getLabel();
+ unsigned ScratchReg = GetScratchRegister(Is64Bit, MF);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "Scratch register is live-in");
- // Ignore invalid labels.
- if (Label && !Label->isDefined()) continue;
+ if (MF.getFunction()->isVarArg())
+ report_fatal_error("Segmented stacks do not support vararg functions.");
+ if (!ST->isTargetLinux())
+ report_fatal_error("Segmented stacks supported only on linux.");
- unsigned Operation = Inst.getOperation();
- if (Operation != MCCFIInstruction::Move &&
- Operation != MCCFIInstruction::RelMove)
- // FIXME: We can't handle this frame just yet.
- return 0;
-
- const MachineLocation &Dst = Inst.getDestination();
- const MachineLocation &Src = Inst.getSource();
- const bool IsRelative = (Operation == MCCFIInstruction::RelMove);
-
- if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
- if (Src.getReg() != MachineLocation::VirtualFP) {
- // DW_CFA_def_cfa
- assert(FramePointerReg == -1 &&"Defining more than one frame pointer?");
- if (TRI->getLLVMRegNum(Src.getReg(), IsEH) != X86::EBP &&
- TRI->getLLVMRegNum(Src.getReg(), IsEH) != X86::RBP)
- // The frame pointer isn't EBP/RBP. Cannot make unwind information
- // compact.
- return 0;
- FramePointerReg = TRI->getCompactUnwindRegNum(Src.getReg(), IsEH);
- } // else DW_CFA_def_cfa_offset
-
- if (IsRelative)
- CFAOffset += Src.getOffset();
- else
- CFAOffset -= Src.getOffset();
+ MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ bool IsNested = false;
- continue;
- }
+ // We need to know if the function has a nest argument only in 64 bit mode.
+ if (Is64Bit)
+ IsNested = HasNestArgument(&MF);
- if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
- // DW_CFA_def_cfa_register
- assert(FramePointerReg == -1 && "Defining more than one frame pointer?");
+ // The MOV R10, RAX needs to be in a different block, since the RET we emit in
+ // allocMBB needs to be the last (terminating) instruction.
+ MachineBasicBlock *restoreR10MBB = NULL;
+ if (IsNested)
+ restoreR10MBB = MF.CreateMachineBasicBlock();
- if (TRI->getLLVMRegNum(Dst.getReg(), IsEH) != X86::EBP &&
- TRI->getLLVMRegNum(Dst.getReg(), IsEH) != X86::RBP)
- // The frame pointer isn't EBP/RBP. Cannot make unwind information
- // compact.
- return 0;
+ for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
+ e = prologueMBB.livein_end(); i != e; i++) {
+ allocMBB->addLiveIn(*i);
+ checkMBB->addLiveIn(*i);
- FramePointerReg = TRI->getCompactUnwindRegNum(Dst.getReg(), IsEH);
- if (SavedRegIdx != 1 || SavedRegs[0] != unsigned(FramePointerReg))
- return 0;
+ if (IsNested)
+ restoreR10MBB->addLiveIn(*i);
+ }
- SavedRegs[0] = 0;
- SavedRegIdx = 0;
- continue;
- }
+ if (IsNested) {
+ allocMBB->addLiveIn(X86::R10);
+ restoreR10MBB->addLiveIn(X86::RAX);
+ }
- unsigned Reg = Src.getReg();
- int Offset = Dst.getOffset();
- if (IsRelative)
- Offset -= CFAOffset;
- Offset /= DataAlignmentFactor;
+ if (IsNested)
+ MF.push_front(restoreR10MBB);
+ MF.push_front(allocMBB);
+ MF.push_front(checkMBB);
+
+ // Eventually StackSize will be calculated by a link-time pass, which will
+ // also decide whether checking code needs to be injected into this particular
+ // prologue.
+ StackSize = MFI->getStackSize();
+
+ // Read the limit of the current stacklet from the stack_guard location.
+ if (Is64Bit) {
+ TlsReg = X86::FS;
+ TlsOffset = 0x70;
+
+ BuildMI(checkMBB, DL, TII.get(X86::LEA64r), ScratchReg).addReg(X86::RSP)
+ .addImm(0).addReg(0).addImm(-StackSize).addReg(0);
+ BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
+ } else {
+ TlsReg = X86::GS;
+ TlsOffset = 0x30;
- if (Offset < 0) {
- // FIXME: Handle?
- // DW_CFA_offset_extended_sf
- return 0;
- } else if (Reg < 64) {
- // DW_CFA_offset + Reg
- if (SavedRegIdx >= 6) return 0;
- int CURegNum = TRI->getCompactUnwindRegNum(Reg, IsEH);
- if (CURegNum == -1) return 0;
- SavedRegs[SavedRegIdx++] = CURegNum;
- } else {
- // FIXME: Handle?
- // DW_CFA_offset_extended
- return 0;
- }
+ BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
+ .addImm(0).addReg(0).addImm(-StackSize).addReg(0);
+ BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
}
- // Bail if there are too many registers to encode.
- if (SavedRegIdx > 6) return 0;
+ // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
+ // It jumps to normal execution of the function body.
+ BuildMI(checkMBB, DL, TII.get(X86::JG_4)).addMBB(&prologueMBB);
+
+ // On 32 bit we first push the argument size and then the frame size. On 64
+ // bit, we pass the stack frame size in r10 and the argument size in r11.
+ if (Is64Bit) {
+ // Functions with nested arguments use R10, so it needs to be saved across
+ // the call to __morestack.
+
+ if (IsNested)
+ BuildMI(allocMBB, DL, TII.get(X86::MOV64rr), X86::RAX).addReg(X86::R10);
+
+ BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R10)
+ .addImm(StackSize);
+ BuildMI(allocMBB, DL, TII.get(X86::MOV64ri), X86::R11)
+ .addImm(X86FI->getArgumentStackSize());
+ MF.getRegInfo().setPhysRegUsed(X86::R10);
+ MF.getRegInfo().setPhysRegUsed(X86::R11);
+ } else {
+ // Since we'll call __morestack, stack alignment needs to be preserved.
+ BuildMI(allocMBB, DL, TII.get(X86::SUB32ri), X86::ESP).addReg(X86::ESP)
+ .addImm(8);
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(X86FI->getArgumentStackSize());
+ BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
+ .addImm(StackSize);
+ }
- // Check if the offset is too big.
- CFAOffset /= 4;
- if ((CFAOffset & 0xFF) != CFAOffset)
- return 0;
- Encoding |= (CFAOffset & 0xFF) << 16; // Size encoding.
-
- if (FramePointerReg != -1) {
- Encoding |= 0x01000000; // EBP/RBP Unwind Frame
- for (unsigned I = 0; I != SavedRegIdx; ++I) {
- unsigned Reg = SavedRegs[I];
- if (Reg == unsigned(FramePointerReg)) continue;
- Encoding |= (Reg & 0x7) << (I * 3); // Register encoding
- }
+ // __morestack is in libgcc
+ if (Is64Bit)
+ BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack");
+ else
+ BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack");
+
+ // __morestack only seems to remove 8 bytes from the stack. Add back the
+ // additional 8 bytes we added before pushing the arguments.
+ if (!Is64Bit)
+ BuildMI(allocMBB, DL, TII.get(X86::ADD32ri), X86::ESP).addReg(X86::ESP)
+ .addImm(8);
+ BuildMI(allocMBB, DL, TII.get(X86::RET));
+
+ if (IsNested)
+ BuildMI(restoreR10MBB, DL, TII.get(X86::MOV64rr), X86::R10)
+ .addReg(X86::RAX);
+
+ if (IsNested) {
+ allocMBB->addSuccessor(restoreR10MBB);
+ restoreR10MBB->addSuccessor(&prologueMBB);
} else {
- Encoding |= 0x02000000; // Frameless unwind with small stack
- Encoding |= (SavedRegIdx & 0x7) << 10;
- Encoding |= permuteEncode(SavedRegIdx, SavedRegs);
+ allocMBB->addSuccessor(&prologueMBB);
}
- return Encoding;
+ checkMBB->addSuccessor(allocMBB);
+ checkMBB->addSuccessor(&prologueMBB);
+
+#ifdef XDEBUG
+ MF.verify();
+#endif
}
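At the C level, the prologue emitted above amounts to the following check (an
illustrative sketch only; in the generated code the limit lives in a TLS slot,
%fs:0x70 on 64-bit or %gs:0x30 on 32-bit, and __morestack performs the
stacklet allocation):

  #include <cstdint>

  // checkMBB: LEA computes SP - StackSize, CMP compares it against the
  // stacklet limit, and JG branches to the function body when there is room.
  static bool needsNewStacklet(uintptr_t SP, uintptr_t StackletLimit,
                               uint64_t StackSize) {
    return SP - StackSize < StackletLimit; // true => allocMBB calls __morestack
  }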
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 14c31ed..6f49064 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -41,6 +41,8 @@ public:
void emitPrologue(MachineFunction &MF) const;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+ void adjustForSegmentedStacks(MachineFunction &MF) const;
+
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
@@ -57,11 +59,8 @@ public:
bool hasFP(const MachineFunction &MF) const;
bool hasReservedCallFrame(const MachineFunction &MF) const;
- void getInitialFrameState(std::vector<MachineMove> &Moves) const;
int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
-
- uint32_t getCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs,
- int DataAlignmentFactor, bool IsEH) const;
+ uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 2b0f283..02b0ff2 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -474,10 +474,15 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
continue;
- // If the source and destination are SSE registers, then this is a legal
- // conversion that should not be lowered.
EVT SrcVT = N->getOperand(0).getValueType();
EVT DstVT = N->getValueType(0);
+
+ // If any of the sources are vectors, the FP stack is not involved.
+ if (SrcVT.isVector() || DstVT.isVector())
+ continue;
+
+ // If the source and destination are SSE registers, then this is a legal
+ // conversion that should not be lowered.
bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
if (SrcIsSSE && DstIsSSE)
@@ -2168,9 +2173,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
MVT::i8, Reg);
- // Emit a testb. No special NOREX tricks are needed since there's
- // only one GPR operand!
- return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+ // Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
+ // target GR8_NOREX registers, so make sure the register class is
+ // forced.
+ return CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, MVT::i32,
Subreg, ShiftedImm);
}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 5096d9a..7c8ce17 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -51,6 +51,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
using namespace llvm;
using namespace dwarf;
@@ -71,9 +72,6 @@ static SDValue Extract128BitVector(SDValue Vec,
SelectionDAG &DAG,
DebugLoc dl);
-static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG);
-
-
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 instruction or a
/// simple subregister reference. Idx is an index in the 128 bits we
@@ -85,14 +83,10 @@ static SDValue Extract128BitVector(SDValue Vec,
DebugLoc dl) {
EVT VT = Vec.getValueType();
assert(VT.getSizeInBits() == 256 && "Unexpected vector size!");
-
EVT ElVT = VT.getVectorElementType();
-
- int Factor = VT.getSizeInBits() / 128;
-
- EVT ResultVT = EVT::getVectorVT(*DAG.getContext(),
- ElVT,
- VT.getVectorNumElements() / Factor);
+ int Factor = VT.getSizeInBits()/128;
+ EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
+ VT.getVectorNumElements()/Factor);
// Extract from UNDEF is UNDEF.
if (Vec.getOpcode() == ISD::UNDEF)
@@ -111,7 +105,6 @@ static SDValue Extract128BitVector(SDValue Vec,
* ElemsPerChunk);
SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
@@ -136,21 +129,18 @@ static SDValue Insert128BitVector(SDValue Result,
assert(VT.getSizeInBits() == 128 && "Unexpected vector size!");
EVT ElVT = VT.getVectorElementType();
-
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-
EVT ResultVT = Result.getValueType();
// Insert the relevant 128 bits.
- unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+ unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
// This is the index of the first element of the 128-bit chunk
// we want.
- unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+ unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
* ElemsPerChunk);
SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
-
Result = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
return Result;
@@ -159,34 +149,6 @@ static SDValue Insert128BitVector(SDValue Result,
return SDValue();
}
-/// Given two vectors, concat them.
-static SDValue ConcatVectors(SDValue Lower, SDValue Upper, SelectionDAG &DAG) {
- DebugLoc dl = Lower.getDebugLoc();
-
- assert(Lower.getValueType() == Upper.getValueType() && "Mismatched vectors!");
-
- EVT VT = EVT::getVectorVT(*DAG.getContext(),
- Lower.getValueType().getVectorElementType(),
- Lower.getValueType().getVectorNumElements() * 2);
-
- // TODO: Generalize to arbitrary vector length (this assumes 256-bit vectors).
- assert(VT.getSizeInBits() == 256 && "Unsupported vector concat!");
-
- // Insert the upper subvector.
- SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Upper,
- DAG.getConstant(
- // This is half the length of the result
- // vector. Start inserting the upper 128
- // bits here.
- Lower.getValueType().getVectorNumElements(),
- MVT::i32),
- DAG, dl);
-
- // Insert the lower subvector.
- Vec = Insert128BitVector(Vec, Lower, DAG.getConstant(0, MVT::i32), DAG, dl);
- return Vec;
-}
-
static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
bool is64Bit = Subtarget->is64Bit();
@@ -197,11 +159,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
return new TargetLoweringObjectFileMachO();
}
- if (Subtarget->isTargetELF()) {
- if (is64Bit)
- return new X8664_ELFTargetObjectFile(TM);
- return new X8632_ELFTargetObjectFile(TM);
- }
+ if (Subtarget->isTargetELF())
+ return new TargetLoweringObjectFileELF();
if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
return new TargetLoweringObjectFileCOFF();
llvm_unreachable("unknown subtarget type");
@@ -222,6 +181,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// X86 is weird, it always uses i8 for shift amounts and setcc results.
setBooleanContents(ZeroOrOneBooleanContent);
+ // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
+ setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// For 64-bit since we have so many registers use the ILP scheduler, for
// 32-bit code use the register pressure specific scheduling.
@@ -354,7 +315,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
} else if (!UseSoftFloat) {
- if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+ // Since AVX is a superset of SSE3, only check for SSE here.
+ if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
// Expand FP_TO_UINT into a select.
// FIXME: We would like to use a Custom expander here eventually to do
// the optimal thing for SSE vs. the default expansion in the legalizer.
@@ -417,15 +379,24 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FREM , MVT::f80 , Expand);
setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
- if (Subtarget->is64Bit()) {
- setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
+ if (Subtarget->hasBMI()) {
+ setOperationAction(ISD::CTTZ , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
+ }
+
+ if (Subtarget->hasLZCNT()) {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Promote);
+ } else {
+ setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
}
if (Subtarget->hasPOPCNT()) {
@@ -491,8 +462,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
if (Subtarget->hasXMM())
setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
- // We may not have a libcall for MEMBARRIER so we should lower this.
setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
@@ -506,9 +477,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
MVT VT = IntVTs[i];
setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
+ setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
}
if (!Subtarget->is64Bit()) {
+ setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
@@ -518,6 +491,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
}
+ if (Subtarget->hasCmpxchg16b()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+ }
+
// FIXME - use subtarget debug flags
if (!Subtarget->isTargetDarwin() &&
!Subtarget->isTargetELF() &&
@@ -539,7 +516,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
- setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -556,11 +534,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
- setOperationAction(ISD::DYNAMIC_STACKALLOC,
- (Subtarget->is64Bit() ? MVT::i64 : MVT::i32),
- (Subtarget->isTargetCOFF()
- && !Subtarget->isTargetEnvMacho()
- ? Custom : Expand));
+
+ if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else if (EnableSegmentedStacks)
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+ MVT::i64 : MVT::i32, Expand);
if (!UseSoftFloat && X86ScalarSSEf64) {
// f32 and f64 use SSE.
@@ -739,7 +722,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::SETCC, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
@@ -754,6 +737,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::VSELECT, (MVT::SimpleValueType)VT, Expand);
for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
setTruncStoreAction((MVT::SimpleValueType)VT,
@@ -816,7 +800,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
}
if (!UseSoftFloat && Subtarget->hasXMMInt()) {
@@ -846,10 +830,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
- setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
- setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
- setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
- setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
@@ -925,7 +909,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
}
- if (Subtarget->hasSSE41()) {
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
@@ -944,6 +928,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
+
// i8 and i16 vectors are custom, because the source register and memory
// operand types are not the same width. f32 vectors are
// custom since the immediate controlling the insert encodes additional
@@ -964,10 +954,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
- if (Subtarget->hasSSE2()) {
+ if (Subtarget->hasXMMInt()) {
setOperationAction(ISD::SRL, MVT::v2i64, Custom);
setOperationAction(ISD::SRL, MVT::v4i32, Custom);
setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i16, Custom);
setOperationAction(ISD::SHL, MVT::v2i64, Custom);
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
@@ -977,15 +968,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SRA, MVT::v8i16, Custom);
}
- if (Subtarget->hasSSE42())
- setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
+ if (Subtarget->hasSSE42() || Subtarget->hasAVX())
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
if (!UseSoftFloat && Subtarget->hasAVX()) {
- addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
- addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
- addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v16i16, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
+ addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
@@ -1005,6 +997,59 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i64, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i8, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::SRL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SRL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v4i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i32, Custom);
+ setOperationAction(ISD::SHL, MVT::v16i16, Custom);
+ setOperationAction(ISD::SHL, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v8i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v16i16, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
+
+ setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
+
+ setOperationAction(ISD::VSELECT, MVT::v4f64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v4i64, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
+ setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+
+ setOperationAction(ISD::ADD, MVT::v4i64, Custom);
+ setOperationAction(ISD::ADD, MVT::v8i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v16i16, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::SUB, MVT::v4i64, Custom);
+ setOperationAction(ISD::SUB, MVT::v8i32, Custom);
+ setOperationAction(ISD::SUB, MVT::v16i16, Custom);
+ setOperationAction(ISD::SUB, MVT::v32i8, Custom);
+
+ setOperationAction(ISD::MUL, MVT::v4i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i16, Custom);
+ // Don't lower v32i8 because there is no 128-bit byte mul
+
// Custom lower several nodes for 256-bit types.
for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
@@ -1093,6 +1138,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::BUILD_VECTOR);
+ setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
@@ -1100,7 +1146,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::AND);
setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::FADD);
+ setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::SUB);
+ setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SINT_TO_FP);
@@ -1124,25 +1173,26 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
-MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
- return MVT::i8;
+EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
+ if (!VT.isVector()) return MVT::i8;
+ return VT.changeVectorElementTypeToInteger();
}
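For example, under the new hook a scalar f64 compare still yields an i8, while
a v4f32 compare yields a v4i32 lane mask (all-ones or zero per element,
matching the ZeroOrNegativeOneBooleanContent setting above). A toy model of
the rule (names illustrative, not LLVM API):

  #include <cstdio>
  #include <string>

  // Scalars compare to i8; vectors compare to an integer vector with the
  // same lane count and lane width.
  static std::string setCCResultType(bool IsVector, unsigned Lanes,
                                     unsigned LaneBits) {
    if (!IsVector)
      return "i8";
    return "v" + std::to_string(Lanes) + "i" + std::to_string(LaneBits);
  }

  int main() {
    printf("%s\n", setCCResultType(false, 1, 64).c_str()); // f64   -> i8
    printf("%s\n", setCCResultType(true, 4, 32).c_str());  // v4f32 -> v4i32
  }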
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
if (MaxAlign == 16)
return;
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
if (VTy->getBitWidth() == 128)
MaxAlign = 16;
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
unsigned EltAlign = 0;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
- } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
unsigned EltAlign = 0;
getMaxByValAlign(STy->getElementType(i), EltAlign);
@@ -1159,7 +1209,7 @@ static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
-unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
if (Subtarget->is64Bit()) {
// Max of 8 and alignment of type.
unsigned TyAlign = TD->getABITypeAlignment(Ty);
@@ -1203,9 +1253,12 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
((DstAlign == 0 || DstAlign >= 16) &&
(SrcAlign == 0 || SrcAlign >= 16))) &&
Subtarget->getStackAlignment() >= 16) {
- if (Subtarget->hasSSE2())
+ if (Subtarget->hasAVX() &&
+ Subtarget->getStackAlignment() >= 32)
+ return MVT::v8f32;
+ if (Subtarget->hasXMMInt())
return MVT::v4i32;
- if (Subtarget->hasSSE1())
+ if (Subtarget->hasXMM())
return MVT::v4f32;
} else if (!MemcpyStrSrc && Size >= 8 &&
!Subtarget->is64Bit() &&
@@ -1408,7 +1461,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
ValToCopy);
// If we don't have SSE2 available, convert to v4f32 so the generated
// register is legal.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget->hasXMMInt())
ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
}
}
@@ -1700,6 +1753,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
// places.
assert(VA.getValNo() != LastVal &&
"Don't support value assigned to multiple locs yet");
+ (void)LastVal;
LastVal = VA.getValNo();
if (VA.isRegLoc()) {
@@ -1917,6 +1971,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
+ FuncInfo->setArgumentStackSize(StackSize);
+
return Chain;
}
@@ -2744,8 +2800,6 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::MOVSD:
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
case X86ISD::PUNPCKLWD:
@@ -2754,10 +2808,17 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PUNPCKLQDQ:
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
case X86ISD::PUNPCKHWD:
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
case X86ISD::PUNPCKHQDQ:
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
+ case X86ISD::VPERM2F128:
return true;
}
return false;
@@ -2783,6 +2844,10 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PSHUFD:
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
}
@@ -2796,6 +2861,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PALIGN:
case X86ISD::SHUFPD:
case X86ISD::SHUFPS:
+ case X86ISD::VPERM2F128:
return DAG.getNode(Opc, dl, VT, V1, V2,
DAG.getConstant(TargetMask, MVT::i8));
}
@@ -2815,8 +2881,6 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::MOVSD:
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
case X86ISD::PUNPCKLWD:
@@ -2825,6 +2889,8 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
case X86ISD::PUNPCKLQDQ:
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
case X86ISD::PUNPCKHWD:
case X86ISD::PUNPCKHBW:
case X86ISD::PUNPCKHDQ:
@@ -3026,6 +3092,17 @@ static bool isUndefOrInRange(int Val, int Low, int Hi) {
return (Val < 0) || (Val >= Low && Val < Hi);
}
+/// isUndefOrInRange - Return true if every element in Mask, beginning
+/// at position Pos and ending at Pos+Size, falls within the specified
+/// range [Low, Hi) or is undef.
+static bool isUndefOrInRange(const SmallVectorImpl<int> &Mask,
+ int Pos, int Size, int Low, int Hi) {
+ for (int i = Pos, e = Pos+Size; i != e; ++i)
+ if (!isUndefOrInRange(Mask[i], Low, Hi))
+ return false;
+ return true;
+}
+
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
@@ -3034,6 +3111,17 @@ static bool isUndefOrEqual(int Val, int CmpVal) {
return false;
}
+/// isSequentialOrUndefInRange - Return true if every element in Mask,
+/// beginning at position Pos and ending at Pos+Size, matches the sequential
+/// range starting at Low in order, or is undef.
+static bool isSequentialOrUndefInRange(const SmallVectorImpl<int> &Mask,
+ int Pos, int Size, int Low) {
+ for (int i = Pos, e = Pos+Size; i != e; ++i, ++Low)
+ if (!isUndefOrEqual(Mask[i], Low))
+ return false;
+ return true;
+}
+
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
@@ -3104,11 +3192,13 @@ bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
- bool hasSSSE3) {
+ bool hasSSSE3OrAVX) {
int i, e = VT.getVectorNumElements();
+ if (VT.getSizeInBits() != 128 && VT.getSizeInBits() != 64)
+ return false;
// Do not handle v2i64 / v2f64 shuffles with palignr.
- if (e < 4 || !hasSSSE3)
+ if (e < 4 || !hasSSSE3OrAVX)
return false;
for (i = 0; i != e; ++i)
@@ -3119,42 +3209,176 @@ static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
if (i == e)
return false;
- // Determine if it's ok to perform a palignr with only the LHS, since we
- // don't have access to the actual shuffle elements to see if RHS is undef.
- bool Unary = Mask[i] < (int)e;
- bool NeedsUnary = false;
+ // Make sure we're shifting in the right direction.
+ if (Mask[i] <= i)
+ return false;
int s = Mask[i] - i;
// Check the rest of the elements to see if they are consecutive.
for (++i; i != e; ++i) {
int m = Mask[i];
- if (m < 0)
- continue;
+ if (m >= 0 && m != s+i)
+ return false;
+ }
+ return true;
+}
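+// For illustration, on a v4i32 shuffle: Mask = <1, 2, 3, 4> has its first
+// defined element at i = 0 with Mask[0] = 1 > 0, so s = 1, and the remaining
+// elements are consecutive (2 = s+1, 3 = s+2, 4 = s+3), so the mask matches;
+// getShufflePALIGNRImmediate would then yield (1 - 0) * EltSize = 4 bytes
+// for 32-bit elements.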
+
+/// isVSHUFPSYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPSY.
+static bool isVSHUFPSYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ if (NumElems != 8)
+ return false;
- Unary = Unary && (m < (int)e);
- NeedsUnary = NeedsUnary || (m < s);
+ // VSHUFPSY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0
+ // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+ //
+ // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4,
+ // Y3..Y0, Y3..Y0, X3..X0, X3..X0
+ //
+ int QuarterSize = NumElems/4;
+ int HalfSize = QuarterSize*2;
+ for (int i = 0; i < QuarterSize; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+ return false;
+ for (int i = QuarterSize; i < QuarterSize*2; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+ return false;
- if (NeedsUnary && !Unary)
+ // The mask of the second half must be the same as the first but with
+ // the appropriate offsets. This works in the same way as VPERMILPS
+ // works with masks.
+ for (int i = QuarterSize*2; i < QuarterSize*3; ++i) {
+ if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+ return false;
+ int FstHalfIdx = i-HalfSize;
+ if (Mask[FstHalfIdx] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
return false;
- if (Unary && m != ((s+i) & (e-1)))
+ }
+ for (int i = QuarterSize*3; i < NumElems; ++i) {
+ if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
return false;
- if (!Unary && m != (s+i))
+ int FstHalfIdx = i-HalfSize;
+ if (Mask[FstHalfIdx] < 0)
+ continue;
+ if (!isUndefOrEqual(Mask[i], Mask[FstHalfIdx]+HalfSize))
return false;
+
}
+
return true;
}
-bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
- SmallVector<int, 8> M;
- N->getMask(M);
- return ::isPALIGNRMask(M, N->getValueType(0), true);
+/// getShuffleVSHUFPSYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the VSHUFPSY instruction.
+static unsigned getShuffleVSHUFPSYImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ assert(NumElems == 8 && VT.getSizeInBits() == 256 &&
+ "Only supports v8i32 and v8f32 types");
+
+ int HalfSize = NumElems/2;
+ unsigned Mask = 0;
+ for (int i = 0; i != NumElems ; ++i) {
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ // The mask of the first half must be equal to that of the second one.
+ unsigned Shamt = (i%HalfSize)*2;
+ unsigned Elt = SVOp->getMaskElt(i) % HalfSize;
+ Mask |= Elt << Shamt;
+ }
+
+ return Mask;
+}
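+// For illustration: the v8f32 mask <0, 1, 8, 9, 4, 5, 12, 13> encodes
+// Elt values 0, 1, 0, 1 at shift amounts 0, 2, 4, 6 (both halves of the
+// mask agree), giving an immediate of 0x44.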
+
+/// isVSHUFPDYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// VSHUFPDY. This shuffle doesn't have the same restriction as the PS
+/// version: the mask of the second half isn't bound to the mask of the
+/// first one.
+static bool isVSHUFPDYMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElems = VT.getVectorNumElements();
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ if (NumElems != 4)
+ return false;
+
+ // VSHUFPDY divides the resulting vector into 4 chunks.
+ // The sources are also split into 4 chunks, and each destination
+ // chunk must come from a different source chunk.
+ //
+ // SRC1 => X3 X2 X1 X0
+ // SRC2 => Y3 Y2 Y1 Y0
+ //
+ // DST => Y2..Y3, X2..X3, Y1..Y0, X1..X0
+ //
+ int QuarterSize = NumElems/4;
+ int HalfSize = QuarterSize*2;
+ for (int i = 0; i < QuarterSize; ++i)
+ if (!isUndefOrInRange(Mask[i], 0, HalfSize))
+ return false;
+ for (int i = QuarterSize; i < QuarterSize*2; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems, NumElems+HalfSize))
+ return false;
+ for (int i = QuarterSize*2; i < QuarterSize*3; ++i)
+ if (!isUndefOrInRange(Mask[i], HalfSize, NumElems))
+ return false;
+ for (int i = QuarterSize*3; i < NumElems; ++i)
+ if (!isUndefOrInRange(Mask[i], NumElems+HalfSize, NumElems*2))
+ return false;
+
+ return true;
+}
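+// For illustration, on v4f64 (QuarterSize = 1, HalfSize = 2): the mask
+// <1, 5, 3, 7> is accepted since 1 falls in [0,2), 5 in [4,6), 3 in [2,4)
+// and 7 in [6,8); unlike the PS variant, the two halves are checked
+// independently.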
+
+/// getShuffleVSHUFPDYImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the VSHUFPDY instruction.
+static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ assert(NumElems == 4 && VT.getSizeInBits() == 256 &&
+ "Only supports v4i64 and v4f64 types");
+
+ int HalfSize = NumElems/2;
+ unsigned Mask = 0;
+ for (int i = 0; i != NumElems ; ++i) {
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ int Elt = SVOp->getMaskElt(i) % HalfSize;
+ Mask |= Elt << i;
+ }
+
+ return Mask;
}
/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// SHUFPS and SHUFPD.
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
int NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+
if (NumElems != 2 && NumElems != 4)
return false;
@@ -3204,7 +3428,13 @@ static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
- if (N->getValueType(0).getVectorNumElements() != 4)
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+
+ if (NumElems != 4)
return false;
// Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
@@ -3218,15 +3448,19 @@ bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
- unsigned NumElems = N->getValueType(0).getVectorNumElements();
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if (VT.getSizeInBits() != 128)
+ return false;
if (NumElems != 4)
return false;
return isUndefOrEqual(N->getMaskElt(0), 2) &&
- isUndefOrEqual(N->getMaskElt(1), 3) &&
- isUndefOrEqual(N->getMaskElt(2), 2) &&
- isUndefOrEqual(N->getMaskElt(3), 3);
+ isUndefOrEqual(N->getMaskElt(1), 3) &&
+ isUndefOrEqual(N->getMaskElt(2), 2) &&
+ isUndefOrEqual(N->getMaskElt(3), 3);
}
/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -3273,20 +3507,22 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool V2IsSplat = false) {
int NumElts = VT.getVectorNumElements();
- if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
return false;
- // Handle vector lengths > 128 bits. Define a "section" as a set of
- // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
- // sections.
- unsigned NumSections = VT.getSizeInBits() / 128;
- if (NumSections == 0 ) NumSections = 1; // Handle MMX
- unsigned NumSectionElts = NumElts / NumSections;
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
unsigned Start = 0;
- unsigned End = NumSectionElts;
- for (unsigned s = 0; s < NumSections; ++s) {
- for (unsigned i = Start, j = s * NumSectionElts;
+ unsigned End = NumLaneElts;
+ for (unsigned s = 0; s < NumLanes; ++s) {
+ for (unsigned i = Start, j = s * NumLaneElts;
i != End;
i += 2, ++j) {
int BitI = Mask[i];
@@ -3302,8 +3538,8 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
}
}
// Process the next 128 bits.
- Start += NumSectionElts;
- End += NumSectionElts;
+ Start += NumLaneElts;
+ End += NumLaneElts;
}
return true;
@@ -3320,21 +3556,38 @@ bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
bool V2IsSplat = false) {
int NumElts = VT.getVectorNumElements();
- if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Unsupported vector type for unpckh");
+
+ if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8)
return false;
- for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
- int BitI = Mask[i];
- int BitI1 = Mask[i+1];
- if (!isUndefOrEqual(BitI, j + NumElts/2))
- return false;
- if (V2IsSplat) {
- if (isUndefOrEqual(BitI1, NumElts))
- return false;
- } else {
- if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLaneElts = NumElts/NumLanes;
+
+ unsigned Start = 0;
+ unsigned End = NumLaneElts;
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ for (unsigned i = Start, j = (l*NumLaneElts)+NumLaneElts/2;
+ i != End; i += 2, ++j) {
+ int BitI = Mask[i];
+ int BitI1 = Mask[i+1];
+ if (!isUndefOrEqual(BitI, j))
return false;
+ if (V2IsSplat) {
+ if (isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j+NumElts))
+ return false;
+ }
}
+ // Process the next 128 bits.
+ Start += NumLaneElts;
+ End += NumLaneElts;
}
return true;
}
@@ -3353,16 +3606,21 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
return false;
- // Handle vector lengths > 128 bits. Define a "section" as a set of
- // 128 bits. AVX defines UNPCK* to operate independently on 128-bit
- // sections.
- unsigned NumSections = VT.getSizeInBits() / 128;
- if (NumSections == 0 ) NumSections = 1; // Handle MMX
- unsigned NumSectionElts = NumElems / NumSections;
+ // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
+ // FIXME: Need a better way to get rid of this, there's no latency difference
+ // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
+ // the former later. We should also remove the "_undef" special mask.
+ if (NumElems == 4 && VT.getSizeInBits() == 256)
+ return false;
+
+ // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
+ // independently on 128-bit lanes.
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElems / NumLanes;
- for (unsigned s = 0; s < NumSections; ++s) {
- for (unsigned i = s * NumSectionElts, j = s * NumSectionElts;
- i != NumSectionElts * (s + 1);
+ for (unsigned s = 0; s < NumLanes; ++s) {
+ for (unsigned i = s * NumLaneElts, j = s * NumLaneElts;
+ i != NumLaneElts * (s + 1);
i += 2, ++j) {
int BitI = Mask[i];
int BitI1 = Mask[i+1];
@@ -3433,6 +3691,189 @@ bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
return ::isMOVLMask(M, N->getValueType(0));
}
+/// isVPERM2F128Mask - Match 256-bit shuffles where the elements are considered
+/// as permutations between 128-bit chunks or halves. As an example, take
+/// the shuffle below:
+/// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
+/// The first half comes from the second half of V1 and the second half
+/// from the second half of V2.
+static bool isVPERM2F128Mask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256)
+ return false;
+
+ // The shuffle result is divided into half A and half B. In total the two
+ // sources have 4 halves, namely: C, D, E, F. The final values of A and
+ // B must come from C, D, E or F.
+ int HalfSize = VT.getVectorNumElements()/2;
+ bool MatchA = false, MatchB = false;
+
+ // Check if A comes from one of C, D, E, F.
+ for (int Half = 0; Half < 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ // Check if B comes from one of C, D, E, F.
+ for (int Half = 0; Half < 4; ++Half) {
+ if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
+
+/// getShuffleVPERM2F128Immediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with the VPERM2F128 instruction.
+static unsigned getShuffleVPERM2F128Immediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int HalfSize = VT.getVectorNumElements()/2;
+
+ int FstHalf = 0, SndHalf = 0;
+ for (int i = 0; i < HalfSize; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ FstHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+ for (int i = HalfSize; i < HalfSize*2; ++i) {
+ if (SVOp->getMaskElt(i) > 0) {
+ SndHalf = SVOp->getMaskElt(i)/HalfSize;
+ break;
+ }
+ }
+
+ return (FstHalf | (SndHalf << 4));
+}
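+// For illustration: for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> from
+// the isVPERM2F128Mask comment above, HalfSize = 4, so FstHalf = 4/4 = 1
+// and SndHalf = 12/4 = 3, yielding the immediate 1 | (3 << 4) = 0x31.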
+
+/// isVPERMILPDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
+/// Note that VPERMIL mask matching differs depending on whether the
+/// underlying type is 32 or 64 bits wide. In VPERMILPS the high half of
+/// the mask should point to the same elements as the low half, but
+/// relative to the higher half of the source. In VPERMILPD the two lanes
+/// can be shuffled independently of each other, with the restriction that
+/// lanes can't be crossed.
+static bool isVPERMILPDMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+
+ if (!Subtarget->hasAVX())
+ return false;
+
+ // Only match 256-bit with 64-bit types
+ if (VT.getSizeInBits() != 256 || NumElts != 4)
+ return false;
+
+ // The mask on the high lane is independent of the low. Both can match
+ // any element in inside its own lane, but can't cross.
+ int LaneSize = NumElts/NumLanes;
+ for (int l = 0; l < NumLanes; ++l)
+ for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+ int LaneStart = l*LaneSize;
+ if (!isUndefOrInRange(Mask[i], LaneStart, LaneStart+LaneSize))
+ return false;
+ }
+
+ return true;
+}
+
+/// isVPERMILPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to VPERMILPS*.
+/// Note that VPERMIL mask matching differs depending on whether the
+/// underlying type is 32 or 64 bits wide. In VPERMILPS the high half of
+/// the mask should point to the same elements as the low half, but
+/// relative to the higher half of the source. In VPERMILPD the two lanes
+/// can be shuffled independently of each other, with the restriction that
+/// lanes can't be crossed.
+static bool isVPERMILPSMask(const SmallVectorImpl<int> &Mask, EVT VT,
+ const X86Subtarget *Subtarget) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits()/128;
+
+ if (!Subtarget->hasAVX())
+ return false;
+
+ // Only match 256-bit with 32-bit types
+ if (VT.getSizeInBits() != 256 || NumElts != 8)
+ return false;
+
+ // The mask on the high lane should be the same as the low one. Actually,
+ // they can differ if the corresponding index in one lane is undef while
+ // the other stays in range.
+ int LaneSize = NumElts/NumLanes;
+ for (int i = 0; i < LaneSize; ++i) {
+ int HighElt = i+LaneSize;
+ bool HighValid = isUndefOrInRange(Mask[HighElt], LaneSize, NumElts);
+ bool LowValid = isUndefOrInRange(Mask[i], 0, LaneSize);
+
+ if (!HighValid || !LowValid)
+ return false;
+ if (Mask[i] < 0 || Mask[HighElt] < 0)
+ continue;
+ if (Mask[HighElt]-Mask[i] != LaneSize)
+ return false;
+ }
+
+ return true;
+}
+
+/// getShuffleVPERMILPSImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMILPS* instructions.
+static unsigned getShuffleVPERMILPSImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+ int LaneSize = NumElts/NumLanes;
+
+ // Although the mask is equal for both lanes, scan both of them to handle
+ // the cases where a mask element is undef on the first half but valid on
+ // the second. This covers pathological cases such as:
+ // shuffle <u, 0, 1, 2, 4, 4, 5, 6>, which is completely valid.
+ unsigned Mask = 0;
+ for (int l = 0; l < NumLanes; ++l) {
+ for (int i = 0; i < LaneSize; ++i) {
+ int MaskElt = SVOp->getMaskElt(i+(l*LaneSize));
+ if (MaskElt < 0)
+ continue;
+ if (MaskElt >= LaneSize)
+ MaskElt -= LaneSize;
+ Mask |= MaskElt << (i*2);
+ }
+ }
+
+ return Mask;
+}
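+// For illustration: the v8f32 mask <1, 1, 1, 1, 5, 5, 5, 5> reduces to
+// element 1 in every slot of both lanes (5 - LaneSize = 1), so each of the
+// four 2-bit fields holds 1 and the immediate is 0x55.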
+
+/// getShuffleVPERMILPDImmediate - Return the appropriate immediate to shuffle
+/// the specified VECTOR_SHUFFLE mask with VPERMILPD* instructions.
+static unsigned getShuffleVPERMILPDImmediate(SDNode *N) {
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ EVT VT = SVOp->getValueType(0);
+
+ int NumElts = VT.getVectorNumElements();
+ int NumLanes = VT.getSizeInBits()/128;
+
+ unsigned Mask = 0;
+ int LaneSize = NumElts/NumLanes;
+ for (int l = 0; l < NumLanes; ++l)
+ for (int i = l*LaneSize; i < LaneSize*(l+1); ++i) {
+ int MaskElt = SVOp->getMaskElt(i);
+ if (MaskElt < 0)
+ continue;
+ Mask |= (MaskElt-l*LaneSize) << i;
+ }
+
+ return Mask;
+}
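+// For illustration: the v4f64 mask <1, 0, 3, 2> swaps the elements within
+// each 128-bit lane; the in-lane offsets are 1, 0, 1, 0 at bit positions
+// 0..3, so the immediate is 0b0101 = 0x5.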
+
/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
@@ -3463,58 +3904,92 @@ static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
-bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
- if (N->getValueType(0).getVectorNumElements() != 4)
+/// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
+bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
return false;
- // Expect 1, 1, 3, 3
- for (unsigned i = 0; i < 2; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt >= 0 && Elt != 1)
- return false;
- }
+ // The second vector must be undef
+ if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+ return false;
- bool HasHi = false;
- for (unsigned i = 2; i < 4; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt >= 0 && Elt != 3)
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
+ (VT.getSizeInBits() == 256 && NumElems != 8))
+ return false;
+
+ // "i+1" is the value the indexed mask element must have
+ for (unsigned i = 0; i < NumElems; i += 2)
+ if (!isUndefOrEqual(N->getMaskElt(i), i+1) ||
+ !isUndefOrEqual(N->getMaskElt(i+1), i+1))
return false;
- if (Elt == 3)
- HasHi = true;
- }
- // Don't use movshdup if it can be done with a shufps.
- // FIXME: verify that matching u, u, 3, 3 is what we want.
- return HasHi;
+
+ return true;
}
/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
-bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
- if (N->getValueType(0).getVectorNumElements() != 4)
+/// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
+bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
+ const X86Subtarget *Subtarget) {
+ if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
+ return false;
+
+ // The second vector must be undef
+ if (N->getOperand(1).getOpcode() != ISD::UNDEF)
+ return false;
+
+ EVT VT = N->getValueType(0);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
+ (VT.getSizeInBits() == 256 && NumElems != 8))
return false;
- // Expect 0, 0, 2, 2
- for (unsigned i = 0; i < 2; ++i)
- if (N->getMaskElt(i) > 0)
+ // "i" is the value the indexed mask element must have
+ for (unsigned i = 0; i < NumElems; i += 2)
+ if (!isUndefOrEqual(N->getMaskElt(i), i) ||
+ !isUndefOrEqual(N->getMaskElt(i+1), i))
return false;
- bool HasHi = false;
- for (unsigned i = 2; i < 4; ++i) {
- int Elt = N->getMaskElt(i);
- if (Elt >= 0 && Elt != 2)
+ return true;
+}
+
+/// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to 256-bit
+/// version of MOVDDUP.
+static bool isMOVDDUPYMask(ShuffleVectorSDNode *N,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ int NumElts = VT.getVectorNumElements();
+ bool V2IsUndef = N->getOperand(1).getOpcode() == ISD::UNDEF;
+
+ if (!Subtarget->hasAVX() || VT.getSizeInBits() != 256 ||
+ !V2IsUndef || NumElts != 4)
+ return false;
+
+ for (int i = 0; i != NumElts/2; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), 0))
return false;
- if (Elt == 2)
- HasHi = true;
- }
- // Don't use movsldup if it can be done with a shufps.
- return HasHi;
+ for (int i = NumElts/2; i != NumElts; ++i)
+ if (!isUndefOrEqual(N->getMaskElt(i), NumElts/2))
+ return false;
+ return true;
}
/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
-/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+/// specifies a shuffle of elements that is suitable for input to 128-bit
+/// version of MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
- int e = N->getValueType(0).getVectorNumElements() / 2;
+ EVT VT = N->getValueType(0);
+
+ if (VT.getSizeInBits() != 128)
+ return false;
+ int e = VT.getVectorNumElements() / 2;
for (int i = 0; i < e; ++i)
if (!isUndefOrEqual(N->getMaskElt(i), i))
return false;
@@ -3627,6 +4102,7 @@ unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
if (Val >= 0)
break;
}
+ assert(Val - i > 0 && "PALIGNR imm should be positive");
return (Val - i) * EltSize;
}
@@ -3644,7 +4120,6 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
EVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
return Index / NumElemsPerChunk;
}
@@ -3662,7 +4137,6 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
EVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
-
return Index / NumElemsPerChunk;
}
@@ -3716,7 +4190,10 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
- if (Op->getValueType(0).getVectorNumElements() != 4)
+ EVT VT = Op->getValueType(0);
+ if (VT.getSizeInBits() != 128)
+ return false;
+ if (VT.getVectorNumElements() != 4)
return false;
for (unsigned i = 0, e = 2; i != e; ++i)
if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
@@ -3748,6 +4225,10 @@ static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
ShuffleVectorSDNode *Op) {
+ EVT VT = Op->getValueType(0);
+ if (VT.getSizeInBits() != 128)
+ return false;
+
if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
return false;
// Is V2 is a vector load, don't do this transformation. We will try to use
@@ -3755,7 +4236,7 @@ static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
if (ISD::isNON_EXTLoad(V2))
return false;
- unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+ unsigned NumElems = VT.getVectorNumElements();
if (NumElems != 2 && NumElems != 4)
return false;
@@ -3811,7 +4292,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
/// getZeroVector - Returns a vector of specified type with all zero elements.
///
-static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
+static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
@@ -3819,7 +4300,7 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
if (VT.getSizeInBits() == 128) { // SSE
- if (HasSSE2) { // SSE2
+ if (HasXMMInt) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else { // SSE1
@@ -3838,21 +4319,25 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
}
/// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32> or <8 x i32> bitcasted to
-/// their original type, ensuring they get CSE'd.
+/// Always build ones vectors as <4 x i32>. For 256-bit types, insert two
+/// <4 x i32> into an <8 x i32> appropriately, then bitcast to the
+/// original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
assert((VT.is128BitVector() || VT.is256BitVector())
&& "Expected a 128-bit or 256-bit vector type");
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Cst, Cst, Cst, Cst);
- SDValue Vec;
if (VT.is256BitVector()) {
- SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
- } else
- Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32),
+ Vec, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Vec = Insert128BitVector(InsV, Vec,
+ DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl);
+ }
+
return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}
@@ -3902,7 +4387,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
+/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
SDValue V2) {
unsigned NumElems = VT.getVectorNumElements();
@@ -3915,31 +4400,95 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
-/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
-static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
- EVT PVT = MVT::v4f32;
- EVT VT = SV->getValueType(0);
- DebugLoc dl = SV->getDebugLoc();
- SDValue V1 = SV->getOperand(0);
+// PromoteSplati8i16 - i16 and i8 vector types can't be used directly by
+// a generic shuffle instruction because the target has no such instructions.
+// Generate shuffles which repeat i16 and i8 several times until they can be
+// represented by v4f32 and then be manipulated by target supported shuffles.
+static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
+ EVT VT = V.getValueType();
int NumElems = VT.getVectorNumElements();
- int EltNo = SV->getSplatIndex();
+ DebugLoc dl = V.getDebugLoc();
- // unpack elements to the correct location
while (NumElems > 4) {
if (EltNo < NumElems/2) {
- V1 = getUnpackl(DAG, dl, VT, V1, V1);
+ V = getUnpackl(DAG, dl, VT, V, V);
} else {
- V1 = getUnpackh(DAG, dl, VT, V1, V1);
+ V = getUnpackh(DAG, dl, VT, V, V);
EltNo -= NumElems/2;
}
NumElems >>= 1;
}
+ return V;
+}
+
+/// getLegalSplat - Generate a legal splat with supported x86 shuffles
+static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
+ EVT VT = V.getValueType();
+ DebugLoc dl = V.getDebugLoc();
+ assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+ && "Vector size not supported");
+
+ if (VT.getSizeInBits() == 128) {
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
+ int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
+ V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+ &SplatMask[0]);
+ } else {
+ // To use VPERMILPS to splat scalars, the second half of the indices must
+ // refer to the higher part, which is a duplication of the lower one,
+ // because VPERMILPS can only handle in-lane permutations.
+ int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
+ EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
+
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+ V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+ &SplatMask[0]);
+ }
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, V);
+}
+
+/// PromoteSplat - Splat is promoted to target supported vector shuffles.
+static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
+ EVT SrcVT = SV->getValueType(0);
+ SDValue V1 = SV->getOperand(0);
+ DebugLoc dl = SV->getDebugLoc();
+
+ int EltNo = SV->getSplatIndex();
+ int NumElems = SrcVT.getVectorNumElements();
+ unsigned Size = SrcVT.getSizeInBits();
+
+ assert(((Size == 128 && NumElems > 4) || Size == 256) &&
+ "Unknown how to promote splat for type");
+
+ // Extract the 128-bit part containing the splat element and update
+ // the splat element index when it refers to the higher register.
+ if (Size == 256) {
+ unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
+ V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
+ if (Idx > 0)
+ EltNo -= NumElems/2;
+ }
+
+ // All i16 and i8 vector types can't be used directly by a generic shuffle
+ // instruction because the target has no such instruction. Generate shuffles
+ // which repeat i16 and i8 several times until they fit in i32, and then can
+ // be manipulated by target supported shuffles.
+ EVT EltVT = SrcVT.getVectorElementType();
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
+ V1 = PromoteSplati8i16(V1, DAG, EltNo);
+
+ // Recreate the 256-bit vector and place the same 128-bit vector
+ // into the low and high part. This is necessary because we want
+ // to use VPERM* to shuffle the vectors.
+ if (Size == 256) {
+ SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ V1 = Insert128BitVector(InsV, V1,
+ DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ }
- // Perform the splat.
- int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
- V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1);
- V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
- return DAG.getNode(ISD::BITCAST, dl, VT, V1);
+ return getLegalSplat(DAG, V1, EltNo);
}
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
@@ -3947,11 +4496,11 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
- bool isZero, bool HasSSE2,
- SelectionDAG &DAG) {
+ bool isZero, bool HasXMMInt,
+ SelectionDAG &DAG) {
EVT VT = V2.getValueType();
SDValue V1 = isZero
- ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+ ? getZeroVector(VT, HasXMMInt, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i != NumElems; ++i)
@@ -4005,6 +4554,8 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
break;
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
DecodeUNPCKHPMask(NumElems, ShuffleMask);
break;
case X86ISD::PUNPCKLBW:
@@ -4015,8 +4566,6 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
break;
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
DecodeUNPCKLPMask(VT, ShuffleMask);
@@ -4052,8 +4601,41 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
Depth+1);
}
+ case X86ISD::VPERMILPS:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPSY:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPD:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERMILPDY:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::VPERM2F128:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodeVPERM2F128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(),
+ ShuffleMask);
+ break;
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVLHPD:
+ case X86ISD::MOVLPD:
+ case X86ISD::MOVLPS:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP:
+ case X86ISD::PALIGN:
+ return SDValue(); // Not yet implemented.
default:
- assert("not implemented for target shuffle node");
+ assert(0 && "unknown target shuffle node");
return SDValue();
}
@@ -4205,6 +4787,11 @@ static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
+ // Although the logic below support any bitwidth size, there are no
+ // shift instructions which handle more than 128-bit vectors.
+ if (SVOp->getValueType(0).getSizeInBits() > 128)
+ return false;
+
if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
return true;
@@ -4295,6 +4882,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
+ assert(VT.getSizeInBits() == 128 && "Unknown type for VShift");
EVT ShVT = MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
@@ -4333,42 +4921,52 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
return SDValue();
}
+ // FIXME: 256-bit vector instructions don't require a strict alignment,
+ // improve this code to support it better.
+ unsigned RequiredAlign = VT.getSizeInBits()/8;
SDValue Chain = LD->getChain();
- // Make sure the stack object alignment is at least 16.
+ // Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
- if (DAG.InferPtrAlignment(Ptr) < 16) {
+ if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
if (MFI->isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
// If someone *really* cares about this. That's the way to implement it.
return SDValue();
} else {
- MFI->setObjectAlignment(FI, 16);
+ MFI->setObjectAlignment(FI, RequiredAlign);
}
}
- // (Offset % 16) must be multiple of 4. Then address is then
+ // (Offset % 16 or 32) must be a multiple of 4. The address is then
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
- if ((Offset % 16) & 3)
+ if ((Offset % RequiredAlign) & 3)
return SDValue();
- int64_t StartOffset = Offset & ~15;
+ int64_t StartOffset = Offset & ~(RequiredAlign-1);
if (StartOffset)
Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
- int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
- EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
- SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
+ int NumElems = VT.getVectorNumElements();
+
+ EVT CanonVT = VT.getSizeInBits() == 128 ? MVT::v4i32 : MVT::v8i32;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
+ SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset),
false, false, 0);
- // Canonicalize it to a v4i32 shuffle.
- V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
- return DAG.getNode(ISD::BITCAST, dl, VT,
- DAG.getVectorShuffle(MVT::v4i32, dl, V1,
- DAG.getUNDEF(MVT::v4i32),&Mask[0]));
+
+ // Canonicalize it to a v4i32 or v8i32 shuffle.
+ SmallVector<int, 8> Mask;
+ for (int i = 0; i < NumElems; ++i)
+ Mask.push_back(EltNo);
+
+ V1 = DAG.getNode(ISD::BITCAST, dl, CanonVT, V1);
+ return DAG.getNode(ISD::BITCAST, dl, NVT,
+ DAG.getVectorShuffle(CanonVT, dl, V1,
+ DAG.getUNDEF(CanonVT),&Mask[0]));
}
return SDValue();
@@ -4428,12 +5026,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
LDBase->getPointerInfo(),
LDBase->isVolatile(), LDBase->isNonTemporal(),
LDBase->getAlignment());
- } else if (NumElems == 4 && LastLoadedElt == 1) {
+ } else if (NumElems == 4 && LastLoadedElt == 1 &&
+ DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
- Ops, 2, MVT::i32,
- LDBase->getMemOperand());
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, 2, MVT::i64,
+ LDBase->getPointerInfo(),
+ LDBase->getAlignment(),
+ false/*isVolatile*/, true/*ReadMem*/,
+ false/*WriteMem*/);
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
}
return SDValue();
@@ -4445,47 +5047,26 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
EVT ExtVT = VT.getVectorElementType();
-
unsigned NumElems = Op.getNumOperands();
- // For AVX-length vectors, build the individual 128-bit pieces and
- // use shuffles to put them in place.
- if (VT.getSizeInBits() > 256 &&
- Subtarget->hasAVX() &&
- !ISD::isBuildVectorAllZeros(Op.getNode())) {
- SmallVector<SDValue, 8> V;
- V.resize(NumElems);
- for (unsigned i = 0; i < NumElems; ++i) {
- V[i] = Op.getOperand(i);
- }
-
- EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
-
- // Build the lower subvector.
- SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
- // Build the upper subvector.
- SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
- NumElems/2);
+ // Vectors containing all zeros can be matched by pxor and xorps later
+ if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+ // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
+ // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
+ if (Op.getValueType() == MVT::v4i32 ||
+ Op.getValueType() == MVT::v8i32)
+ return Op;
- return ConcatVectors(Lower, Upper, DAG);
+ return getZeroVector(Op.getValueType(), Subtarget->hasXMMInt(), DAG, dl);
}
- // All zero's:
- // - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
- // All one's:
- // - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX)
- if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
- ISD::isBuildVectorAllOnes(Op.getNode())) {
- // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to
- // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
- // eliminated on x86-32 hosts.
- if (Op.getValueType() == MVT::v4i32 ||
- Op.getValueType() == MVT::v8i32)
+ // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
+ // vectors or broken into v4i32 operations on 256-bit vectors.
+ if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (Op.getValueType() == MVT::v4i32)
return Op;
- if (ISD::isBuildVectorAllOnes(Op.getNode()))
- return getOnesVector(Op.getValueType(), DAG, dl);
- return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+ return getOnesVector(Op.getValueType(), DAG, dl);
}
unsigned EVTBits = ExtVT.getSizeInBits();
@@ -4538,7 +5119,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
@@ -4566,7 +5147,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
- return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
+ return getShuffleVectorZeroOrUndef(Item, 0, true,Subtarget->hasXMMInt(),
DAG);
} else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
@@ -4574,7 +5155,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
EVT MiddleVT = MVT::v4i32;
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
}
}
@@ -4603,7 +5184,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(i == Idx ? 0 : 1);
@@ -4631,6 +5212,27 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
+ // For AVX-length vectors, build the individual 128-bit pieces and use
+ // shuffles to put them in place.
+ if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
+ SmallVector<SDValue, 32> V;
+ for (unsigned i = 0; i < NumElems; ++i)
+ V.push_back(Op.getOperand(i));
+
+ EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+
+ // Build both the lower and upper subvector.
+ SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
+ SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
+ NumElems/2);
+
+ // Recreate the wider vector with the lower and upper part.
+ SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
// Let legalizer expand 2-wide build_vectors.
if (EVTBits == 64) {
if (NumNonZero == 1) {
@@ -4639,7 +5241,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Op.getOperand(Idx));
return getShuffleVectorZeroOrUndef(V2, Idx, true,
- Subtarget->hasSSE2(), DAG);
+ Subtarget->hasXMMInt(), DAG);
}
return SDValue();
}
@@ -4664,7 +5266,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
- V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ V[i] = getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
else
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
@@ -4708,7 +5310,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return LD;
// For SSE 4.1, use insertps to put the high elements into the low element.
- if (getSubtarget()->hasSSE41()) {
+ if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
@@ -4758,13 +5360,12 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue
-X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
- // We support concatenate two MMX registers and place them in a MMX
- // register. This is better than doing a stack convert.
+// LowerMMXCONCAT_VECTORS - We support concatenate two MMX registers and place
+// them in a MMX register. This is better than doing a stack convert.
+static SDValue LowerMMXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
EVT ResVT = Op.getValueType();
- assert(Op.getNumOperands() == 2);
+
assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
int Mask[2];
@@ -4785,6 +5386,42 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
}
+// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
+// to create 256-bit vectors from two other 128-bit ones.
+static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
+ EVT ResVT = Op.getValueType();
+
+ assert(ResVT.getSizeInBits() == 256 && "Value type must be 256-bit wide");
+
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = ResVT.getVectorNumElements();
+
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, ResVT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+}
+
+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+ EVT ResVT = Op.getValueType();
+
+ assert(Op.getNumOperands() == 2);
+ assert((ResVT.getSizeInBits() == 128 || ResVT.getSizeInBits() == 256) &&
+ "Unsupported CONCAT_VECTORS for value type");
+
+ // We support concatenate two MMX registers and place them in a MMX register.
+ // This is better than doing a stack convert.
+ if (ResVT.is128BitVector())
+ return LowerMMXCONCAT_VECTORS(Op, DAG);
+
+ // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+ // from two other 128-bit ones.
+ return LowerAVXCONCAT_VECTORS(Op, DAG);
+}
+
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
@@ -4844,7 +5481,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
@@ -4917,7 +5554,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
@@ -4985,7 +5622,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
@@ -5013,7 +5651,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
- if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
+ if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ (Subtarget->hasSSSE3() || Subtarget->hasAVX()))
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
@@ -5079,7 +5718,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
- if (TLI.getSubtarget()->hasSSSE3()) {
+ if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
@@ -5276,15 +5915,109 @@ static SDValue getVZextMovL(EVT VT, EVT OpVT,
OpVT, SrcOp)));
}
-/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
-/// shuffles.
+/// areShuffleHalvesWithinDisjointLanes - Check whether each half of a vector
+/// shuffle node refers to only one lane in the sources.
+static bool areShuffleHalvesWithinDisjointLanes(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ int HalfSize = NumElems/2;
+ SmallVector<int, 16> M;
+ SVOp->getMask(M);
+ bool MatchA = false, MatchB = false;
+
+ for (int l = 0; l < NumElems*2; l += HalfSize) {
+ if (isUndefOrInRange(M, 0, HalfSize, l, l+HalfSize)) {
+ MatchA = true;
+ break;
+ }
+ }
+
+ for (int l = 0; l < NumElems*2; l += HalfSize) {
+ if (isUndefOrInRange(M, HalfSize, HalfSize, l, l+HalfSize)) {
+ MatchB = true;
+ break;
+ }
+ }
+
+ return MatchA && MatchB;
+}
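+// For illustration, on a v8i32 shuffle: the mask <u, 1, 2, 3, 12, 13, u, 15>
+// keeps the low half of the result within the first lane of the sources
+// (indices 0..3) and the high half within the last lane (indices 12..15),
+// so both halves match and the function returns true.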
+
+/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles
+/// which could not be matched by any known target specific shuffle.
+static SDValue
+LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+ if (areShuffleHalvesWithinDisjointLanes(SVOp)) {
+ // If each half of a vector shuffle node refers to only one lane in the
+ // source vectors, extract each used 128-bit lane and shuffle them using
+ // 128-bit shuffles. Then, concatenate the results. Otherwise leave
+ // the work to the legalizer.
+ DebugLoc dl = SVOp->getDebugLoc();
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ int HalfSize = NumElems/2;
+
+ // Extract the reference for each half
+ int FstVecExtractIdx = 0, SndVecExtractIdx = 0;
+ int FstVecOpNum = 0, SndVecOpNum = 0;
+ for (int i = 0; i < HalfSize; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ FstVecOpNum = Elt/NumElems;
+ FstVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+ break;
+ }
+ for (int i = HalfSize; i < NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ if (SVOp->getMaskElt(i) < 0)
+ continue;
+ SndVecOpNum = Elt/NumElems;
+ SndVecExtractIdx = Elt % NumElems < HalfSize ? 0 : HalfSize;
+ break;
+ }
+
+ // Extract the subvectors
+ SDValue V1 = Extract128BitVector(SVOp->getOperand(FstVecOpNum),
+ DAG.getConstant(FstVecExtractIdx, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(SVOp->getOperand(SndVecOpNum),
+ DAG.getConstant(SndVecExtractIdx, MVT::i32), DAG, dl);
+
+ // Generate 128-bit shuffles
+ SmallVector<int, 16> MaskV1, MaskV2;
+ for (int i = 0; i < HalfSize; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ MaskV1.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ }
+ for (int i = HalfSize; i < NumElems; ++i) {
+ int Elt = SVOp->getMaskElt(i);
+ MaskV2.push_back(Elt < 0 ? Elt : Elt % HalfSize);
+ }
+
+ EVT NVT = V1.getValueType();
+ V1 = DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &MaskV1[0]);
+ V2 = DAG.getVectorShuffle(NVT, dl, V2, DAG.getUNDEF(NVT), &MaskV2[0]);
+
+ // Concatenate the result back
+ SDValue V = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), V1,
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return Insert128BitVector(V, V2, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
+ return SDValue();
+}
+
+/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
+/// 4 elements, and match them with several different shuffle types.
static SDValue
-LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
EVT VT = SVOp->getValueType(0);
+ assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
+
SmallVector<std::pair<int, int>, 8> Locs;
Locs.resize(4);
SmallVector<int, 8> Mask1(4U, -1);
@@ -5542,18 +6275,21 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
static
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
- bool HasSSE2) {
+ bool HasXMMInt) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
assert(VT != MVT::v2i64 && "unsupported shuffle type");
- if (HasSSE2 && VT == MVT::v2f64)
+ if (HasXMMInt && VT == MVT::v2f64)
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
- // v4f32 or v4i32
- return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
+ // v4f32 or v4i32: canonicalized to v4f32 (which is legal for SSE1)
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ getTargetShuffleNode(X86ISD::MOVLHPS, dl, MVT::v4f32,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V1),
+ DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V2), DAG));
}
static
@@ -5572,8 +6308,24 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}
+static inline unsigned getSHUFPOpcode(EVT VT) {
+ switch(VT.getSimpleVT().SimpleTy) {
+ case MVT::v8i32: // Use fp unit for int unpack.
+ case MVT::v8f32:
+ case MVT::v4i32: // Use fp unit for int unpack.
+ case MVT::v4f32: return X86ISD::SHUFPS;
+ case MVT::v4i64: // Use fp unit for int unpack.
+ case MVT::v4f64:
+ case MVT::v2i64: // Use fp unit for int unpack.
+ case MVT::v2f64: return X86ISD::SHUFPD;
+ default:
+ llvm_unreachable("Unknown type for shufp*");
+ }
+ return 0;
+}
+
static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
@@ -5602,7 +6354,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
CanFoldLoad = false;
if (CanFoldLoad) {
- if (HasSSE2 && NumElems == 2)
+ if (HasXMMInt && NumElems == 2)
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
@@ -5616,28 +6368,30 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
// this is horrible, but will stay like this until we move all shuffle
// matching to x86 specific nodes. Note that for the 1st condition all
// types are matched with movsd.
- if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
- return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
- else if (HasSSE2)
+ if (HasXMMInt) {
+ // FIXME: isMOVLMask should be checked and matched before getMOVLP,
+ // as to remove this logic from here, as much as possible
+ if (NumElems == 2 || !X86::isMOVLMask(SVOp))
+ return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
-
+ }
assert(VT != MVT::v4i32 && "unsupported shuffle type");
// Invert the operand order and use SHUFPS to match it.
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V2, V1,
X86::getShuffleSHUFImmediate(SVOp), DAG);
}
-static inline unsigned getUNPCKLOpcode(EVT VT, const X86Subtarget *Subtarget) {
+static inline unsigned getUNPCKLOpcode(EVT VT) {
switch(VT.getSimpleVT().SimpleTy) {
case MVT::v4i32: return X86ISD::PUNPCKLDQ;
case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
- case MVT::v4f32:
- return Subtarget->hasAVX() ? X86ISD::VUNPCKLPS : X86ISD::UNPCKLPS;
- case MVT::v2f64:
- return Subtarget->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD;
+ case MVT::v4f32: return X86ISD::UNPCKLPS;
+ case MVT::v2f64: return X86ISD::UNPCKLPD;
+ case MVT::v8i32: // Use fp unit for int unpack.
case MVT::v8f32: return X86ISD::VUNPCKLPSY;
+ case MVT::v4i64: // Use fp unit for int unpack.
case MVT::v4f64: return X86ISD::VUNPCKLPDY;
case MVT::v16i8: return X86ISD::PUNPCKLBW;
case MVT::v8i16: return X86ISD::PUNPCKLWD;
@@ -5653,6 +6407,10 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
case MVT::v4f32: return X86ISD::UNPCKHPS;
case MVT::v2f64: return X86ISD::UNPCKHPD;
+ case MVT::v8i32: // Use fp unit for int unpack.
+ case MVT::v8f32: return X86ISD::VUNPCKHPSY;
+ case MVT::v4i64: // Use fp unit for int unpack.
+ case MVT::v4f64: return X86ISD::VUNPCKHPDY;
case MVT::v16i8: return X86ISD::PUNPCKHBW;
case MVT::v8i16: return X86ISD::PUNPCKHWD;
default:
@@ -5661,6 +6419,68 @@ static inline unsigned getUNPCKHOpcode(EVT VT) {
return 0;
}
+static inline unsigned getVPERMILOpcode(EVT VT) {
+ switch(VT.getSimpleVT().SimpleTy) {
+ case MVT::v4i32:
+ case MVT::v4f32: return X86ISD::VPERMILPS;
+ case MVT::v2i64:
+ case MVT::v2f64: return X86ISD::VPERMILPD;
+ case MVT::v8i32:
+ case MVT::v8f32: return X86ISD::VPERMILPSY;
+ case MVT::v4i64:
+ case MVT::v4f64: return X86ISD::VPERMILPDY;
+ default:
+ llvm_unreachable("Unknown type for vpermil");
+ }
+ return 0;
+}
+
+/// isVectorBroadcast - Check if the node chain is suitable to be xformed to
+/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming
+/// from a 32- or 64-bit scalar. Update Op to the desired load to be folded.
+static bool isVectorBroadcast(SDValue &Op) {
+ EVT VT = Op.getValueType();
+ bool Is256 = VT.getSizeInBits() == 256;
+
+ assert((VT.getSizeInBits() == 128 || Is256) &&
+ "Unsupported type for vbroadcast node");
+
+ SDValue V = Op;
+ if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (Is256 && !(V.hasOneUse() &&
+ V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(0).getOpcode() == ISD::UNDEF))
+ return false;
+
+ if (Is256)
+ V = V.getOperand(1);
+
+ if (!V.hasOneUse())
+ return false;
+
+ // Check the source scalar_to_vector type. 256-bit broadcasts are
+ // supported for 32/64-bit sizes, while 128-bit ones are only supported
+ // for 32-bit scalars.
+ if (V.getOpcode() != ISD::SCALAR_TO_VECTOR)
+ return false;
+
+ unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits();
+ if (ScalarSize != 32 && ScalarSize != 64)
+ return false;
+ if (!Is256 && ScalarSize == 64)
+ return false;
+
+ V = V.getOperand(0);
+ if (!MayFoldLoad(V))
+ return false;
+
+ // Return the load node
+ Op = V;
+ return true;
+}
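
A minimal standalone sketch of the splat semantics VBROADCAST provides, under the assumption of a 32-bit scalar and an 8-lane destination; this is plain C++ for illustration, not the SelectionDAG code above.

    #include <array>
    #include <cstdint>

    // Broadcast: one scalar loaded from memory is replicated into every
    // lane of the destination vector, which is what folding the load into
    // a vbroadcast buys over a separate load + shuffle.
    static std::array<uint32_t, 8> broadcast32x8(const uint32_t *mem) {
      std::array<uint32_t, 8> v;
      for (uint32_t &lane : v)
        lane = *mem; // every lane observes the same loaded scalar
      return v;
    }
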
+
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
@@ -5672,23 +6492,29 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
SDValue V2 = Op.getOperand(1);
if (isZeroShuffle(SVOp))
- return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+ return getZeroVector(VT, Subtarget->hasXMMInt(), DAG, dl);
// Handle splat operations
if (SVOp->isSplat()) {
- // Special case, this is the only place now where it's
- // allowed to return a vector_shuffle operation without
- // using a target specific node, because *hopefully* it
- // will be optimized away by the dag combiner.
- if (VT.getVectorNumElements() <= 4 &&
- CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
+ unsigned NumElem = VT.getVectorNumElements();
+ int Size = VT.getSizeInBits();
+ // Special case, this is the only place now where it's allowed to return
+ // a vector_shuffle operation without using a target specific node, because
+ // *hopefully* it will be optimized away by the dag combiner. FIXME: should
+ // this be moved to DAGCombine instead?
+ if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
return Op;
- // Handle splats by matching through known masks
- if (VT.getVectorNumElements() <= 4)
+ // Use vbroadcast whenever the splat comes from a foldable load
+ if (Subtarget->hasAVX() && isVectorBroadcast(V1))
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
+
+ // Handle splats by matching through known shuffle masks
+ if ((Size == 128 && NumElem <= 4) ||
+ (Size == 256 && NumElem < 8))
return SDValue();
- // Canonicalize all of the remaining to v4f32.
+ // All remaining splats are promoted to target-supported vector shuffles.
return PromoteSplat(SVOp, DAG);
}
@@ -5698,7 +6524,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
- } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+ } else if ((VT == MVT::v4i32 ||
+ (VT == MVT::v4f32 && Subtarget->hasXMMInt()))) {
// FIXME: Figure out a cleaner way to do this.
// Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
@@ -5731,9 +6558,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
bool V2IsSplat = false;
- bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
- bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
- bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
+ bool HasXMMInt = Subtarget->hasXMMInt();
MachineFunction &MF = DAG.getMachineFunction();
bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
@@ -5765,21 +6590,20 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
// unpckh_undef). Only use pshufd if speed is more important than size.
if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()), dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
- if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
- RelaxedMayFoldVectorLoad(V1))
+ if (X86::isMOVDDUPMask(SVOp) &&
+ (Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ V2IsUndef && RelaxedMayFoldVectorLoad(V1))
return getMOVDDup(Op, dl, V1, DAG);
if (X86::isMOVHLPS_v_undef_Mask(SVOp))
return getMOVHighToLow(Op, dl, DAG);
// Used to match splats
- if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
+ if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
(VT == MVT::v2f64 || VT == MVT::v2i64))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
@@ -5792,24 +6616,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
- if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
+ if (HasXMMInt && (VT == MVT::v4f32 || VT == MVT::v4i32))
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
- return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
- TargetMask, DAG);
-
- if (VT == MVT::v4f32)
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
- TargetMask, DAG);
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V1,
+ TargetMask, DAG);
}
// Check if this can be converted into a logical shift.
bool isLeft = false;
unsigned ShAmt = 0;
SDValue ShVal;
- bool isShift = getSubtarget()->hasSSE2() &&
- isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+ bool isShift = getSubtarget()->hasXMMInt() &&
+ isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
@@ -5824,7 +6643,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (ISD::isBuildVectorAllZeros(V1.getNode()))
return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
if (!X86::isMOVLPMask(SVOp)) {
- if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
+ if (HasXMMInt && (VT == MVT::v2i64 || VT == MVT::v2f64))
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
if (VT == MVT::v4i32 || VT == MVT::v4f32)
@@ -5834,19 +6653,19 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// FIXME: fold these into legal mask.
if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
- return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+ return getMOVLowToHigh(Op, dl, DAG, HasXMMInt);
if (X86::isMOVHLPSMask(SVOp))
return getMOVHighToLow(Op, dl, DAG);
- if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+ if (X86::isMOVSHDUPMask(SVOp, Subtarget))
return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
- if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+ if (X86::isMOVSLDUPMask(SVOp, Subtarget))
return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
if (X86::isMOVLPMask(SVOp))
- return getMOVLP(Op, dl, DAG, HasSSE2);
+ return getMOVLP(Op, dl, DAG, HasXMMInt);
if (ShouldXformToMOVHLPS(SVOp) ||
ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
@@ -5887,8 +6706,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
if (X86::isUNPCKLMask(SVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
- dl, VT, V1, V2, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
if (X86::isUNPCKHMask(SVOp))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
@@ -5915,8 +6733,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
if (X86::isUNPCKLMask(NewSVOp))
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
- dl, VT, V2, V1, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
if (X86::isUNPCKHMask(NewSVOp))
return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
@@ -5932,18 +6749,15 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
SmallVector<int, 16> M;
SVOp->getMask(M);
- if (isPALIGNRMask(M, VT, HasSSSE3))
+ if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()))
return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
X86::getShufflePALIGNRImmediate(SVOp),
DAG);
if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
SVOp->getSplatIndex() == 0 && V2IsUndef) {
- if (VT == MVT::v2f64) {
- X86ISD::NodeType Opcode =
- getSubtarget()->hasAVX() ? X86ISD::VUNPCKLPD : X86ISD::UNPCKLPD;
- return getTargetShuffleNode(Opcode, dl, VT, V1, V1, DAG);
- }
+ if (VT == MVT::v2f64)
+ return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG);
if (VT == MVT::v2i64)
return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG);
}
@@ -5958,23 +6772,54 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
X86::getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT)) {
- unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
- if (VT == MVT::v4f32 || VT == MVT::v4i32)
- return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
- TargetMask, DAG);
- if (VT == MVT::v2f64 || VT == MVT::v2i64)
- return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
- TargetMask, DAG);
- }
+ if (isSHUFPMask(M, VT))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ X86::getShuffleSHUFImmediate(SVOp), DAG);
if (X86::isUNPCKL_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKLOpcode(VT, getSubtarget()),
- dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
if (X86::isUNPCKH_v_undef_Mask(SVOp))
- if (VT != MVT::v2i64 && VT != MVT::v2f64)
- return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+ return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Generate target-specific nodes for 128- or 256-bit shuffles only
+ // supported in the AVX instruction set.
+ //
+
+ // Handle VMOVDDUPY permutations
+ if (isMOVDDUPYMask(SVOp, Subtarget))
+ return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
+
+ // Handle VPERMILPS* permutations
+ if (isVPERMILPSMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+ getShuffleVPERMILPSImmediate(SVOp), DAG);
+
+ // Handle VPERMILPD* permutations
+ if (isVPERMILPDMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
+ getShuffleVPERMILPDImmediate(SVOp), DAG);
+
+ // Handle VPERM2F128 permutations
+ if (isVPERM2F128Mask(M, VT, Subtarget))
+ return getTargetShuffleNode(X86ISD::VPERM2F128, dl, VT, V1, V2,
+ getShuffleVPERM2F128Immediate(SVOp), DAG);
+
+ // Handle VSHUFPSY permutations
+ if (isVSHUFPSYMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ getShuffleVSHUFPSYImmediate(SVOp), DAG);
+
+ // Handle VSHUFPDY permutations
+ if (isVSHUFPDYMask(M, VT, Subtarget))
+ return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2,
+ getShuffleVSHUFPDYImmediate(SVOp), DAG);
+
+ //===--------------------------------------------------------------------===//
+ // Since no target-specific shuffle was selected for this generic one,
+ // lower it into other known shuffles. FIXME: this isn't true yet, but
+ // this is the plan.
+ //
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
if (VT == MVT::v8i16) {
@@ -5989,9 +6834,14 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return NewOp;
}
- // Handle all 4 wide cases with a number of shuffles.
- if (NumElems == 4)
- return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+ // Handle all 128-bit wide vectors with 4 elements, and match them with
+ // several different shuffle types.
+ if (NumElems == 4 && VT.getSizeInBits() == 128)
+ return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);
+
+ // Handle general 256-bit shuffles
+ if (VT.is256BitVector())
+ return LowerVECTOR_SHUFFLE_256(SVOp, DAG);
return SDValue();
}
@@ -6001,6 +6851,10 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
+
+ if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
+ return SDValue();
+
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
@@ -6060,36 +6914,26 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
EVT VecVT = Vec.getValueType();
- // If this is a 256-bit vector result, first extract the 128-bit
- // vector and then extract from the 128-bit vector.
- if (VecVT.getSizeInBits() > 128) {
+ // If this is a 256-bit vector result, first extract the 128-bit vector and
+ // then extract the element from the 128-bit vector.
+ if (VecVT.getSizeInBits() == 256) {
DebugLoc dl = Op.getNode()->getDebugLoc();
unsigned NumElems = VecVT.getVectorNumElements();
SDValue Idx = Op.getOperand(1);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
-
- unsigned ExtractNumElems = NumElems / (VecVT.getSizeInBits() / 128);
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Get the 128-bit vector.
- bool Upper = IdxVal >= ExtractNumElems;
- Vec = Extract128BitVector(Vec, Idx, DAG, dl);
-
- // Extract from it.
- SDValue ScaledIdx = Idx;
- if (Upper)
- ScaledIdx = DAG.getNode(ISD::SUB, dl, Idx.getValueType(), Idx,
- DAG.getConstant(ExtractNumElems,
- Idx.getValueType()));
+ bool Upper = IdxVal >= NumElems/2;
+ Vec = Extract128BitVector(Vec,
+ DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);
+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
- ScaledIdx);
+ Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
}
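
The hunk above reduces a 256-bit element extract to a 128-bit one: pick the half containing the index, then rebase the index against that half. A small standalone C++ sketch of the same arithmetic (the 8 x i32 shape is an assumption for illustration):

    #include <array>
    #include <cassert>
    #include <cstdint>

    static uint32_t extractElt256(const std::array<uint32_t, 8> &vec,
                                  unsigned idx) {
      const unsigned NumElems = 8;
      assert(idx < NumElems && "index out of range");
      bool upper = idx >= NumElems / 2;         // which 128-bit half
      unsigned base = upper ? NumElems / 2 : 0; // Extract128BitVector offset
      std::array<uint32_t, 4> half;
      for (unsigned i = 0; i != 4; ++i)
        half[i] = vec[base + i];
      return half[idx - base];                  // idx rebased into the half
    }
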
assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");
- if (Subtarget->hasSSE41()) {
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
if (Res.getNode())
return Res;
@@ -6120,7 +6964,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return Op;
// SHUFPS the element to the lowest double word, then movss.
- int Mask[4] = { Idx, -1, -1, -1 };
+ int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
EVT VVT = Op.getOperand(0).getValueType();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
@@ -6159,6 +7003,9 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
+ if (VT.getSizeInBits() == 256)
+ return SDValue();
+
if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
isa<ConstantSDNode>(N2)) {
unsigned Opc;
@@ -6206,35 +7053,28 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
SDValue N1 = Op.getOperand(1);
SDValue N2 = Op.getOperand(2);
- // If this is a 256-bit vector result, first insert into a 128-bit
- // vector and then insert into the 256-bit vector.
- if (VT.getSizeInBits() > 128) {
+ // If this is a 256-bit vector result, first extract the 128-bit vector,
+ // insert the element into the extracted half and then place it back.
+ if (VT.getSizeInBits() == 256) {
if (!isa<ConstantSDNode>(N2))
return SDValue();
- // Get the 128-bit vector.
+ // Get the desired 128-bit vector half.
unsigned NumElems = VT.getVectorNumElements();
unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
- bool Upper = IdxVal >= NumElems / 2;
-
- SDValue SubN0 = Extract128BitVector(N0, N2, DAG, dl);
+ bool Upper = IdxVal >= NumElems/2;
+ SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
+ SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);
- // Insert into it.
- SDValue ScaledN2 = N2;
- if (Upper)
- ScaledN2 = DAG.getNode(ISD::SUB, dl, N2.getValueType(), N2,
- DAG.getConstant(NumElems /
- (VT.getSizeInBits() / 128),
- N2.getValueType()));
- Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubN0.getValueType(), SubN0,
- N1, ScaledN2);
+ // Insert the element into the desired half.
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
+ N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);
- // Insert the 128-bit vector
- // FIXME: Why UNDEF?
- return Insert128BitVector(N0, Op, N2, DAG, dl);
+ // Insert the changed part back to the 256-bit vector
+ return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
}
- if (Subtarget->hasSSE41())
+ if (Subtarget->hasSSE41() || Subtarget->hasAVX())
return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
if (EltVT == MVT::i8)
@@ -6405,12 +7245,17 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
CodeModel::Model M = getTargetMachine().getCodeModel();
if (Subtarget->isPICStyleRIPRel() &&
- (M == CodeModel::Small || M == CodeModel::Kernel))
+ (M == CodeModel::Small || M == CodeModel::Kernel)) {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetELF())
+ OpFlag = X86II::MO_GOTPCREL;
WrapperKind = X86ISD::WrapperRIP;
- else if (Subtarget->isPICStyleGOT())
- OpFlag = X86II::MO_GOTOFF;
- else if (Subtarget->isPICStyleStubPIC())
- OpFlag = X86II::MO_PIC_BASE_OFFSET;
+ } else if (Subtarget->isPICStyleGOT()) {
+ OpFlag = X86II::MO_GOT;
+ } else if (Subtarget->isPICStyleStubPIC()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+ } else if (Subtarget->isPICStyleStubNoDynamic()) {
+ OpFlag = X86II::MO_DARWIN_NONLAZY;
+ }
SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
@@ -6427,6 +7272,12 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
Result);
}
+ // For symbols that require a load from a stub to get the address, emit the
+ // load.
+ if (isGlobalStubReference(OpFlag))
+ Result = DAG.getLoad(getPointerTy(), DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(), false, false, 0);
+
return Result;
}
@@ -6676,7 +7527,8 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
// And our return value (tls address) is in the standard call return value
// location.
unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
- return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
+ return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy(),
+ Chain.getValue(1));
}
assert(false &&
@@ -6922,9 +7774,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
// Load the 32-bit value into an XMM register.
SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
- Op.getOperand(0),
- DAG.getIntPtrConstant(0)));
+ Op.getOperand(0));
+
+ // Zero out the upper parts of the register.
+ Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget->hasXMMInt(),
+ DAG);
Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
@@ -7513,6 +8367,9 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
}
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
+
assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -7563,6 +8420,39 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}
+// Lower256IntVSETCC - Break a 256-bit integer VSETCC into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(VT.getSizeInBits() == 256 && Op.getOpcode() == ISD::SETCC &&
+ "Unsupported value type for operation");
+
+ int NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue CC = Op.getOperand(2);
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+ // Issue the operation on the smaller types and concatenate the result back
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+}
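
A standalone sketch of the split-and-concatenate strategy Lower256IntVSETCC uses (and which Lower256IntArith further down reuses): run the operation on each 128-bit half and glue the results back together. The v8i32 greater-than compare is an illustrative stand-in for the general case.

    #include <array>
    #include <cstdint>

    using V8 = std::array<uint32_t, 8>;
    using V4 = std::array<uint32_t, 4>;

    static V4 cmpGT128(const V4 &a, const V4 &b) { // 128-bit PCMPGTD stand-in
      V4 r;
      for (unsigned i = 0; i != 4; ++i)
        r[i] = (int32_t)a[i] > (int32_t)b[i] ? ~0u : 0u;
      return r;
    }

    static V8 cmpGT256(const V8 &a, const V8 &b) {
      V4 lo1, hi1, lo2, hi2;
      for (unsigned i = 0; i != 4; ++i) {        // Extract128BitVector twice
        lo1[i] = a[i]; hi1[i] = a[i + 4];
        lo2[i] = b[i]; hi2[i] = b[i + 4];
      }
      V4 rlo = cmpGT128(lo1, lo2), rhi = cmpGT128(hi1, hi2);
      V8 r;                                      // CONCAT_VECTORS
      for (unsigned i = 0; i != 4; ++i) { r[i] = rlo[i]; r[i + 4] = rhi[i]; }
      return r;
    }
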
+
+
SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue Cond;
SDValue Op0 = Op.getOperand(0);
@@ -7575,11 +8465,21 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
if (isFP) {
unsigned SSECC = 8;
- EVT VT0 = Op0.getValueType();
- assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
- unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+ EVT EltVT = Op0.getValueType().getVectorElementType();
+ assert(EltVT == MVT::f32 || EltVT == MVT::f64);
+
+ unsigned Opc = EltVT == MVT::f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
bool Swap = false;
+ // SSE Condition code mapping:
+ // 0 - EQ
+ // 1 - LT
+ // 2 - LE
+ // 3 - UNORD
+ // 4 - NEQ
+ // 5 - NLT
+ // 6 - NLE
+ // 7 - ORD
switch (SetCCOpcode) {
default: break;
case ISD::SETOEQ:
@@ -7624,6 +8524,10 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
}
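
The SSE condition-code table in the comment above maps directly onto the cmpps/cmppd immediate. A tiny lookup sketch; the enum is an illustrative stand-in for the ISD predicates handled by the switch, not an LLVM type.

    enum FCmpPred { FCMP_OEQ, FCMP_OLT, FCMP_OLE, FCMP_UNO,
                    FCMP_UNE, FCMP_UGE, FCMP_UGT, FCMP_ORD };

    static unsigned cmppsImmediate(FCmpPred P) {
      switch (P) {
      case FCMP_OEQ: return 0; // EQ
      case FCMP_OLT: return 1; // LT
      case FCMP_OLE: return 2; // LE
      case FCMP_UNO: return 3; // UNORD
      case FCMP_UNE: return 4; // NEQ
      case FCMP_UGE: return 5; // NLT: !(a < b), also true on unordered
      case FCMP_UGT: return 6; // NLE: !(a <= b), also true on unordered
      case FCMP_ORD: return 7; // ORD
      }
      return ~0u; // unreachable
    }
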
+ // Break 256-bit integer vector compare into smaller ones.
+ if (!isFP && VT.getSizeInBits() == 256)
+ return Lower256IntVSETCC(Op, DAG);
+
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
@@ -7654,6 +8558,13 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Swap)
std::swap(Op0, Op1);
+ // Check that the operation in question is available (most are plain SSE2,
+ // but PCMPGTQ and PCMPEQQ have different requirements).
+ if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX())
+ return SDValue();
+ if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX())
+ return SDValue();
+
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations.
if (FlipSigns) {
@@ -8014,9 +8925,11 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) &&
- "This should be used only on Windows targets");
- assert(!Subtarget->isTargetEnvMacho());
+ assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows() ||
+ EnableSegmentedStacks) &&
+ "This should be used only on Windows targets or when segmented stacks "
+ "are being used");
+ assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
DebugLoc dl = Op.getDebugLoc();
// Get the inputs.
@@ -8024,23 +8937,49 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue Size = Op.getOperand(1);
// FIXME: Ensure alignment here
- SDValue Flag;
+ bool Is64Bit = Subtarget->is64Bit();
+ EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
- EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
- unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
+ if (EnableSegmentedStacks) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
- Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
- Flag = Chain.getValue(1);
+ if (Is64Bit) {
+ // The 64-bit implementation of segmented stacks needs to clobber both r10
+ // and r11. This makes it impossible to use it along with nested parameters.
+ const Function *F = MF.getFunction();
+
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; I++)
+ if (I->hasNestAttr())
+ report_fatal_error("Cannot use segmented stacks with functions that "
+ "have nested arguments.");
+ }
+
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32);
+ unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ SDValue Ops1[2] = { Value, Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ } else {
+ SDValue Flag;
+ unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
- SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag);
+ Flag = Chain.getValue(1);
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
- Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
- Flag = Chain.getValue(1);
+ Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
+ Flag = Chain.getValue(1);
- Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+ Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
- SDValue Ops1[2] = { Chain.getValue(0), Chain };
- return DAG.getMergeValues(Ops1, 2, dl);
+ SDValue Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getMergeValues(Ops1, 2, dl);
+ }
}
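
Under segmented stacks, the allocation above becomes a limit check plus a runtime fallback. A scalar sketch of that decision; the TLS slot and libgcc routine are passed in as parameters because both names here are hypothetical stand-ins (the real limit lives at fs:0x70 / gs:0x30 and the fallback is __morestack_allocate_stack_space):

    #include <cstddef>
    #include <cstdint>

    static void *segAlloca(uintptr_t sp, size_t size, uintptr_t stackLimit,
                           void *(*morestackAllocate)(size_t)) {
      uintptr_t bumped = sp - size;    // SUB32rr/SUB64rr on a copy of SP
      if (bumped > stackLimit)         // CMP against the TLS stack limit
        return (void *)bumped;         // bumpMBB: just move the stack pointer
      return morestackAllocate(size);  // mallocMBB: grow through the runtime
    }
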
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
@@ -8118,7 +9057,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
EVT ArgVT = Op.getNode()->getValueType(0);
- const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
uint8_t ArgMode;
@@ -8292,6 +9231,19 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
DAG.getConstant(X86CC, MVT::i8), Cond);
return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
}
+ // Arithmetic intrinsics.
+ case Intrinsic::x86_sse3_hadd_ps:
+ case Intrinsic::x86_sse3_hadd_pd:
+ case Intrinsic::x86_avx_hadd_ps_256:
+ case Intrinsic::x86_avx_hadd_pd_256:
+ return DAG.getNode(X86ISD::FHADD, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::x86_sse3_hsub_ps:
+ case Intrinsic::x86_sse3_hsub_pd:
+ case Intrinsic::x86_avx_hsub_ps_256:
+ case Intrinsic::x86_avx_hsub_pd_256:
+ return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
@@ -8535,8 +9487,13 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}
-SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
+ return Op.getOperand(0);
+}
+
+SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
+ SelectionDAG &DAG) const {
SDValue Root = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
@@ -8552,8 +9509,8 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
- const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
- const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);
+ const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
+ const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
@@ -8600,9 +9557,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 22),
false, false, 0);
- SDValue Ops[] =
- { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
} else {
const Function *Func =
cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -8619,7 +9574,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
NestReg = X86::ECX;
// Check that ECX wasn't needed by an 'inreg' parameter.
- const FunctionType *FTy = Func->getFunctionType();
+ FunctionType *FTy = Func->getFunctionType();
const AttrListPtr &Attrs = Func->getAttributes();
if (!Attrs.isEmpty() && !Func->isVarArg()) {
@@ -8657,7 +9612,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
- const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
+ const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
OutChains[0] = DAG.getStore(Root, dl,
DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr),
@@ -8682,9 +9637,7 @@ SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
MachinePointerInfo(TrmpAddr, 6),
false, false, 1);
- SDValue Ops[] =
- { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
}
}
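
For reference, the 64-bit branch of LowerINIT_TRAMPOLINE above emits a short stub whose byte layout can be reconstructed from the MOV64ri/JMP64r/REX_WB constants and the store offsets; the helper below is a sketch of that inferred layout, not code from this file.

    #include <cstdint>
    #include <cstring>

    // 49 BB <imm64>  movabs r11, <nested function>
    // 49 BA <imm64>  movabs r10, <nest argument>
    // 49 FF E3       jmp    r11
    static void writeTrampoline64(uint8_t *trmp, uint64_t fptr,
                                  uint64_t nest) {
      trmp[0] = 0x49; trmp[1] = 0xBB;   // REX_WB, MOV64ri | N86R11
      std::memcpy(trmp + 2, &fptr, 8);
      trmp[10] = 0x49; trmp[11] = 0xBA; // REX_WB, MOV64ri | N86R10
      std::memcpy(trmp + 12, &nest, 8);
      trmp[20] = 0x49; trmp[21] = 0xFF; // REX_WB, JMP64r
      trmp[22] = 0xE3;                  // ModRM for jmp through r11
    }
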
@@ -8822,8 +9775,58 @@ SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
+// Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
+// ones, and then concatenate the result back.
+static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ assert(VT.getSizeInBits() == 256 && VT.isInteger() &&
+ "Unsupported value type for operation");
+
+ int NumElems = VT.getVectorNumElements();
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+ SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+ // Extract the LHS vectors
+ SDValue LHS = Op.getOperand(0);
+ SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+ SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+ // Extract the RHS vectors
+ SDValue RHS = Op.getOperand(1);
+ SDValue RHS1 = Extract128BitVector(RHS, Idx0, DAG, dl);
+ SDValue RHS2 = Extract128BitVector(RHS, Idx1, DAG, dl);
+
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
+}
+
+SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType().getSizeInBits() == 256 &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getValueType().getSizeInBits() == 256 &&
+ Op.getValueType().isInteger() &&
+ "Only handle AVX 256-bit vector integer operation");
+ return Lower256IntArith(Op, DAG);
+}
+
+SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+
+ // Decompose 256-bit ops into smaller 128-bit ops.
+ if (VT.getSizeInBits() == 256)
+ return Lower256IntArith(Op, DAG);
+
assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
DebugLoc dl = Op.getDebugLoc();
@@ -8872,11 +9875,51 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
-
LLVMContext *Context = DAG.getContext();
- // Must have SSE2.
- if (!Subtarget->hasSSE2()) return SDValue();
+ if (!Subtarget->hasXMMInt())
+ return SDValue();
+
+ // Decompose 256-bit shifts into smaller 128-bit shifts.
+ if (VT.getSizeInBits() == 256) {
+ int NumElems = VT.getVectorNumElements();
+ MVT EltVT = VT.getVectorElementType().getSimpleVT();
+ EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+ // Extract the two vectors
+ SDValue V1 = Extract128BitVector(R, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue V2 = Extract128BitVector(R, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+
+ // Recreate the shift amount vectors
+ SDValue Amt1, Amt2;
+ if (Amt.getOpcode() == ISD::BUILD_VECTOR) {
+ // Constant shift amount
+ SmallVector<SDValue, 4> Amt1Csts;
+ SmallVector<SDValue, 4> Amt2Csts;
+ for (int i = 0; i < NumElems/2; ++i)
+ Amt1Csts.push_back(Amt->getOperand(i));
+ for (int i = NumElems/2; i < NumElems; ++i)
+ Amt2Csts.push_back(Amt->getOperand(i));
+
+ Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt1Csts[0], NumElems/2);
+ Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
+ &Amt2Csts[0], NumElems/2);
+ } else {
+ // Variable shift amount
+ Amt1 = Extract128BitVector(Amt, DAG.getConstant(0, MVT::i32), DAG, dl);
+ Amt2 = Extract128BitVector(Amt, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ }
+
+ // Issue new vector shifts for the smaller types
+ V1 = DAG.getNode(Op.getOpcode(), dl, NewVT, V1, Amt1);
+ V2 = DAG.getNode(Op.getOpcode(), dl, NewVT, V2, Amt2);
+
+ // Concatenate the result back
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, V1, V2);
+ }
// Optimize shl/srl/sra with constant shift amount.
if (isSplatVector(Amt.getNode())) {
@@ -8927,9 +9970,6 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
}
// Lower SHL with variable shift amount.
- // Cannot lower SHL without SSE2 or later.
- if (!Subtarget->hasSSE2()) return SDValue();
-
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
@@ -8971,7 +10011,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(4, MVT::i32));
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
@@ -8986,13 +10026,13 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
DAG.getConstant(2, MVT::i32));
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT, R, M, Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op, R, M);
// a += a
Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
// return pblendv(r, r+r, a);
- R = DAG.getNode(X86ISD::PBLENDVB, dl, VT,
- R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
+ R = DAG.getNode(ISD::VSELECT, dl, VT, Op,
+ R, DAG.getNode(ISD::ADD, dl, VT, R, R));
return R;
}
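
The VSELECT ladder above implements a variable per-byte left shift: apply shifts of 4, 2, and 1 in turn, keeping each partial result only in lanes whose corresponding shift-amount bit is set (the vector code moves that bit into the lane's sign bit so the blend can test it). A scalar sketch of one lane:

    #include <cstdint>

    static uint8_t shlByteVar(uint8_t r, uint8_t amt) {
      uint8_t a = (uint8_t)(amt << 5);      // amount bit 2 -> sign bit
      if (a & 0x80) r = (uint8_t)(r << 4);  // first blend step
      a = (uint8_t)(a + a);                 // a += a
      if (a & 0x80) r = (uint8_t)(r << 2);  // second blend step
      a = (uint8_t)(a + a);
      if (a & 0x80) r = (uint8_t)(r + r);   // final step: r + r == r << 1
      return r;
    }
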
return SDValue();
@@ -9057,8 +10097,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(X86::COND_O, MVT::i32),
SDValue(Sum.getNode(), 2));
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
- return Sum;
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
}
@@ -9071,8 +10110,7 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(Cond, MVT::i32),
SDValue(Sum.getNode(), 1));
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
- return Sum;
+ return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const{
@@ -9080,8 +10118,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
SDNode* Node = Op.getNode();
EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
EVT VT = Node->getValueType(0);
-
- if (Subtarget->hasSSE2() && VT.isVector()) {
+ if (Subtarget->hasXMMInt() && VT.isVector()) {
unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
ExtraVT.getScalarType().getSizeInBits();
SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
@@ -9091,11 +10128,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
switch (VT.getSimpleVT().SimpleTy) {
default:
return SDValue();
- case MVT::v2i64: {
- SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_q;
- SRAIntrinsicsID = 0;
- break;
- }
case MVT::v4i32: {
SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
@@ -9115,12 +10147,9 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
// In case of 1 bit sext, no need to shr
if (ExtraVT.getScalarType().getSizeInBits() == 1) return Tmp1;
- if (SRAIntrinsicsID) {
- Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(SRAIntrinsicsID, MVT::i32),
- Tmp1, ShAmt);
- }
- return Tmp1;
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(SRAIntrinsicsID, MVT::i32),
+ Tmp1, ShAmt);
}
return SDValue();
@@ -9132,7 +10161,7 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
// Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
// There isn't any reason to disable it if the target processor supports it.
- if (!Subtarget->hasSSE2() && !Subtarget->is64Bit()) {
+ if (!Subtarget->hasXMMInt() && !Subtarget->is64Bit()) {
SDValue Chain = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, MVT::i32);
SDValue Ops[] = {
@@ -9172,6 +10201,45 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}
+SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc dl = Op.getDebugLoc();
+ AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
+ SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+ // The only fence that needs an instruction is a sequentially-consistent
+ // cross-thread fence.
+ if (FenceOrdering == SequentiallyConsistent && FenceScope == CrossThread) {
+ // Use mfence if we have SSE2 or we're on x86-64 (even if we asked for
+ // no-sse2). There isn't any reason to disable it if the target processor
+ // supports it.
+ if (Subtarget->hasXMMInt() || Subtarget->is64Bit())
+ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Zero = DAG.getConstant(0, MVT::i32);
+ SDValue Ops[] = {
+ DAG.getRegister(X86::ESP, MVT::i32), // Base
+ DAG.getTargetConstant(1, MVT::i8), // Scale
+ DAG.getRegister(0, MVT::i32), // Index
+ DAG.getTargetConstant(0, MVT::i32), // Disp
+ DAG.getRegister(0, MVT::i32), // Segment.
+ Zero,
+ Chain
+ };
+ SDNode *Res =
+ DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
+ array_lengthof(Ops));
+ return SDValue(Res, 0);
+ }
+
+ // MEMBARRIER is a compiler barrier; it codegens to a no-op.
+ return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
+}
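
At the source level (assuming the usual C++11 atomics mapping), only a sequentially-consistent cross-thread fence reaches the mfence path above; weaker fences are compiler-only barriers on x86. A small illustrative example:

    #include <atomic>

    void publish(std::atomic<int> &flag, int &data) {
      data = 42;
      std::atomic_thread_fence(std::memory_order_seq_cst); // lowers to mfence
      flag.store(1, std::memory_order_relaxed);
    }
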
+
+
SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
EVT T = Op.getValueType();
DebugLoc DL = Op.getDebugLoc();
@@ -9227,7 +10295,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
SelectionDAG &DAG) const {
EVT SrcVT = Op.getOperand(0).getValueType();
EVT DstVT = Op.getValueType();
- assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
+ assert(Subtarget->is64Bit() && !Subtarget->hasXMMInt() &&
Subtarget->hasMMX() && "Unexpected custom BITCAST");
assert((DstVT == MVT::i64 ||
(DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
@@ -9255,7 +10323,34 @@ SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
Node->getOperand(0),
Node->getOperand(1), negOp,
cast<AtomicSDNode>(Node)->getSrcValue(),
- cast<AtomicSDNode>(Node)->getAlignment());
+ cast<AtomicSDNode>(Node)->getAlignment(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+}
+
+static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert seq_cst store -> xchg
+ // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
+ // FIXME: On 32-bit, store -> fist or movq would be more efficient
+ // (The only way to get a 16-byte store is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
+ if (cast<AtomicSDNode>(Node)->getOrdering() == SequentiallyConsistent ||
+ !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ return Swap.getValue(1);
+ }
+ // Other atomic stores have a simple pattern.
+ return Op;
}
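
An illustrative view of the cases LowerATOMIC_STORE distinguishes, again assuming the standard C++11 mapping: a seq_cst store becomes an xchg, and a store wider than the legal integer width (i64 on 32-bit x86) is routed through ATOMIC_SWAP so it can be expanded to a cmpxchg8b loop.

    #include <atomic>

    void stores(std::atomic<int> &a, std::atomic<long long> &w) {
      a.store(1, std::memory_order_seq_cst); // seq_cst -> xchg
      a.store(2, std::memory_order_release); // plain store pattern
      w.store(3, std::memory_order_relaxed); // wide on 32-bit x86 -> swap
    }
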
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
@@ -9291,8 +10386,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG);
case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
+ case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
@@ -9318,7 +10415,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::VSETCC: return LowerVSETCC(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
@@ -9332,11 +10428,12 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
- case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
- case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, DAG);
@@ -9352,15 +10449,38 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+ case ISD::ADD: return LowerADD(Op, DAG);
+ case ISD::SUB: return LowerSUB(Op, DAG);
}
}
+static void ReplaceATOMIC_LOAD(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) {
+ DebugLoc dl = Node->getDebugLoc();
+ EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
+
+ // Convert wide load -> cmpxchg8b/cmpxchg16b
+ // FIXME: On 32-bit, load -> fild or movq would be more efficient
+ // (The only way to get a 16-byte load is cmpxchg16b)
+ // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
+ Node->getOperand(0),
+ Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(0));
+ Results.push_back(Swap.getValue(1));
+}
+
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG, unsigned NewOp) const {
- EVT T = Node->getValueType(0);
DebugLoc dl = Node->getDebugLoc();
- assert (T == MVT::i64 && "Only know how to expand i64 atomics");
+ assert (Node->getValueType(0) == MVT::i64 &&
+ "Only know how to expand i64 atomics");
SDValue Chain = Node->getOperand(0);
SDValue In1 = Node->getOperand(1);
@@ -9423,37 +10543,48 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::ATOMIC_CMP_SWAP: {
EVT T = N->getValueType(0);
- assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
+ assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
+ bool Regs64bit = T == MVT::i128;
+ EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
SDValue cpInL, cpInH;
- cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
- DAG.getConstant(0, MVT::i32));
- cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
- DAG.getConstant(1, MVT::i32));
- cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
- cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
- cpInL.getValue(1));
+ cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(0, HalfT));
+ cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
+ DAG.getConstant(1, HalfT));
+ cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ cpInL, SDValue());
+ cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ cpInH, cpInL.getValue(1));
SDValue swapInL, swapInH;
- swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
- DAG.getConstant(0, MVT::i32));
- swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
- DAG.getConstant(1, MVT::i32));
- swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
- cpInH.getValue(1));
- swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
- swapInL.getValue(1));
+ swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(0, HalfT));
+ swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
+ DAG.getConstant(1, HalfT));
+ swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl,
+ Regs64bit ? X86::RBX : X86::EBX,
+ swapInL, cpInH.getValue(1));
+ swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl,
+ Regs64bit ? X86::RCX : X86::ECX,
+ swapInH, swapInL.getValue(1));
SDValue Ops[] = { swapInH.getValue(0),
N->getOperand(1),
swapInH.getValue(1) };
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys,
+ unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
+ X86ISD::LCMPXCHG8_DAG;
+ SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
Ops, 3, T, MMO);
- SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
- MVT::i32, Result.getValue(1));
- SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
- MVT::i32, cpOutL.getValue(2));
+ SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
+ Regs64bit ? X86::RAX : X86::EAX,
+ HalfT, Result.getValue(1));
+ SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
+ Regs64bit ? X86::RDX : X86::EDX,
+ HalfT, cpOutL.getValue(2));
SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
Results.push_back(cpOutH.getValue(1));
return;
}
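
The cmpxchg-pair expansion above feeds the value through register pairs (EDX:EAX/ECX:EBX, or RDX:RAX/RCX:RBX for cmpxchg16b) by splitting it with EXTRACT_ELEMENT 0 and 1. A sketch of that split for the i128 case; unsigned __int128 is a GCC/Clang extension used purely for illustration.

    #include <cstdint>

    struct Halves { uint64_t lo, hi; }; // lo -> RAX/RBX, hi -> RDX/RCX

    static Halves split128(unsigned __int128 v) {
      Halves h;
      h.lo = (uint64_t)v;           // EXTRACT_ELEMENT ..., 0
      h.hi = (uint64_t)(v >> 64);   // EXTRACT_ELEMENT ..., 1
      return h;
    }
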
@@ -9478,6 +10609,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_SWAP:
ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
return;
+ case ISD::ATOMIC_LOAD:
+ ReplaceATOMIC_LOAD(N, Results, DAG);
}
}
@@ -9527,11 +10660,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PSIGNB: return "X86ISD::PSIGNB";
case X86ISD::PSIGNW: return "X86ISD::PSIGNW";
case X86ISD::PSIGND: return "X86ISD::PSIGND";
- case X86ISD::PBLENDVB: return "X86ISD::PBLENDVB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::FHADD: return "X86ISD::FHADD";
+ case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
@@ -9570,6 +10704,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
+ case X86ISD::ANDN: return "X86ISD::ANDN";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
@@ -9596,9 +10731,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::MOVSS: return "X86ISD::MOVSS";
case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS";
case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD";
- case X86ISD::VUNPCKLPS: return "X86ISD::VUNPCKLPS";
- case X86ISD::VUNPCKLPD: return "X86ISD::VUNPCKLPD";
- case X86ISD::VUNPCKLPSY: return "X86ISD::VUNPCKLPSY";
case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY";
case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS";
case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD";
@@ -9610,16 +10742,24 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD";
case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ";
case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ";
+ case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
+ case X86ISD::VPERMILPS: return "X86ISD::VPERMILPS";
+ case X86ISD::VPERMILPSY: return "X86ISD::VPERMILPSY";
+ case X86ISD::VPERMILPD: return "X86ISD::VPERMILPD";
+ case X86ISD::VPERMILPDY: return "X86ISD::VPERMILPDY";
+ case X86ISD::VPERM2F128: return "X86ISD::VPERM2F128";
case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
+ case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
+ case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
}
}
// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const {
+ Type *Ty) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
Reloc::Model R = getTargetMachine().getRelocationModel();
@@ -9671,7 +10811,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
}
-bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
@@ -9691,7 +10831,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
return true;
}
-bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}
@@ -9715,7 +10855,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
EVT VT) const {
// Very little shuffling can be done for 64-bit vectors right now.
if (VT.getSizeInBits() == 64)
- return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
+ return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX());
// FIXME: pshufb, blends, shifts.
return (VT.getVectorNumElements() == 2 ||
@@ -9725,7 +10865,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
isPSHUFDMask(M, VT) ||
isPSHUFHWMask(M, VT) ||
isPSHUFLWMask(M, VT) ||
- isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
+ isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) ||
isUNPCKLMask(M, VT) ||
isUNPCKHMask(M, VT) ||
isUNPCKL_v_undef_Mask(M, VT) ||
@@ -10158,7 +11298,9 @@ X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
- BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+ BuildMI(*BB, MI, dl,
+ TII->get(Subtarget->hasAVX() ? X86::VMOVAPSrr : X86::MOVAPSrr),
+ MI->getOperand(0).getReg())
.addReg(X86::XMM0);
MI->eraseFromParent();
@@ -10513,6 +11655,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MBB->addSuccessor(EndMBB);
}
+ unsigned MOVOpc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
// In the XMM save block, save all the XMM argument registers.
for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
@@ -10521,7 +11664,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
/*Size=*/16, /*Align=*/16);
- BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
+ BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
.addReg(/*IndexReg=*/0)
@@ -10565,17 +11708,9 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
- const MachineFunction *MF = BB->getParent();
- const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
- BitVector ReservedRegs = TRI->getReservedRegs(*MF);
-
- for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
- const MachineOperand &MO = MI->getOperand(I);
- if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
- unsigned Reg = MO.getReg();
- if (Reg != X86::EFLAGS) continue;
- copy0MBB->addLiveIn(Reg);
- sinkMBB->addLiveIn(Reg);
+ if (!MI->killsRegister(X86::EFLAGS)) {
+ copy0MBB->addLiveIn(X86::EFLAGS);
+ sinkMBB->addLiveIn(X86::EFLAGS);
}
// Transfer the remainder of BB and its successor edges to sinkMBB.
@@ -10611,6 +11746,119 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
}
MachineBasicBlock *
+X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
+ bool Is64Bit) const {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineFunction *MF = BB->getParent();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+
+ assert(EnableSegmentedStacks);
+
+ unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
+ unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
+
+ // BB:
+ // ... [Till the alloca]
+ // If stacklet is not large enough, jump to mallocMBB
+ //
+ // bumpMBB:
+ // Allocate by subtracting from RSP
+ // Jump to continueMBB
+ //
+ // mallocMBB:
+ // Allocate by call to runtime
+ //
+ // continueMBB:
+ // ...
+ // [rest of original BB]
+ //
+
+ MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetRegisterClass *AddrRegClass =
+ getRegClassFor(Is64Bit ? MVT::i64:MVT::i32);
+
+ unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
+ tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
+ sizeVReg = MI->getOperand(1).getReg(),
+ physSPReg = Is64Bit ? X86::RSP : X86::ESP;
+
+ MachineFunction::iterator MBBIter = BB;
+ ++MBBIter;
+
+ MF->insert(MBBIter, bumpMBB);
+ MF->insert(MBBIter, mallocMBB);
+ MF->insert(MBBIter, continueMBB);
+
+ continueMBB->splice(continueMBB->begin(), BB, llvm::next
+ (MachineBasicBlock::iterator(MI)), BB->end());
+ continueMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add code to the main basic block to check if the stack limit has been hit,
+ // and if so, jump to mallocMBB otherwise to bumpMBB.
+ BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::SUB64rr:X86::SUB32rr), tmpSPVReg)
+ .addReg(tmpSPVReg).addReg(sizeVReg);
+ BuildMI(BB, DL, TII->get(Is64Bit ? X86::CMP64mr:X86::CMP32mr))
+ .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg)
+ .addReg(tmpSPVReg);
+ BuildMI(BB, DL, TII->get(X86::JG_4)).addMBB(mallocMBB);
+
+ // bumpMBB simply decreases the stack pointer, since we know the current
+ // stacklet has enough space.
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
+ .addReg(tmpSPVReg);
+ BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
+ .addReg(tmpSPVReg);
+ BuildMI(bumpMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Calls into a routine in libgcc to allocate more space from the heap.
+ if (Is64Bit) {
+ BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
+ .addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space").addReg(X86::RDI);
+ } else {
+ BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
+ .addImm(12);
+ BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
+ BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
+ .addExternalSymbol("__morestack_allocate_stack_space");
+ }
+
+ if (!Is64Bit)
+ BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
+ .addImm(16);
+
+ BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
+ .addReg(Is64Bit ? X86::RAX : X86::EAX);
+ BuildMI(mallocMBB, DL, TII->get(X86::JMP_4)).addMBB(continueMBB);
+
+ // Set up the CFG correctly.
+ BB->addSuccessor(bumpMBB);
+ BB->addSuccessor(mallocMBB);
+ mallocMBB->addSuccessor(continueMBB);
+ bumpMBB->addSuccessor(continueMBB);
+
+ // Take care of the PHI nodes.
+ BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(mallocPtrVReg).addMBB(mallocMBB)
+ .addReg(bumpSPPtrVReg).addMBB(bumpMBB);
+
+ // Delete the original pseudo instruction.
+ MI->eraseFromParent();
+
+ // And we're done.
+ return continueMBB;
+}
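
Taken together, the blocks built above amount to the following check. This is a minimal C-style sketch of the lowered control flow, not the compiler code itself; read_sp, write_sp, and stacklet_limit are hypothetical helpers, with stacklet_limit standing for the TLS slot at %fs:0x70 (64-bit) or %gs:0x30 (32-bit):

    // Scalar model of the SEG_ALLOCA lowering (signed compare, as in JG above).
    char *seg_alloca(size_t Size) {
      char *NewSP = read_sp() - Size;       // tmpSPVReg = SP - sizeVReg
      if (NewSP >= stacklet_limit()) {      // CMP tls[TlsOffset], NewSP
        write_sp(NewSP);                    // bumpMBB: just bump the SP
        return NewSP;
      }
      return __morestack_allocate_stack_space(Size);  // mallocMBB
    }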
+
+MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
@@ -10718,11 +11966,11 @@ MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {
- default: assert(false && "Unexpected instr type to insert");
+ default: assert(0 && "Unexpected instr type to insert");
case X86::TAILJMPd64:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
- assert(!"TAILJMP64 would not be touched here.");
+ assert(0 && "TAILJMP64 would not be touched here.");
case X86::TCRETURNdi64:
case X86::TCRETURNri64:
case X86::TCRETURNmi64:
@@ -10745,6 +11993,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return BB;
case X86::WIN_ALLOCA:
return EmitLoweredWinAlloca(MI, BB);
+ case X86::SEG_ALLOCA_32:
+ return EmitLoweredSegAlloca(MI, BB, false);
+ case X86::SEG_ALLOCA_64:
+ return EmitLoweredSegAlloca(MI, BB, true);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
@@ -10754,6 +12006,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::CMOV_V4F32:
case X86::CMOV_V2F64:
case X86::CMOV_V2I64:
+ case X86::CMOV_V8F32:
+ case X86::CMOV_V4F64:
+ case X86::CMOV_V4I64:
case X86::CMOV_GR16:
case X86::CMOV_GR32:
case X86::CMOV_RFP32:
@@ -11074,6 +12329,33 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
Mask.getBitWidth() - 1);
break;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntId = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ unsigned NumLoBits = 0;
+ switch (IntId) {
+ default: break;
+ case Intrinsic::x86_sse_movmsk_ps:
+ case Intrinsic::x86_avx_movmsk_ps_256:
+ case Intrinsic::x86_sse2_movmsk_pd:
+ case Intrinsic::x86_avx_movmsk_pd_256:
+ case Intrinsic::x86_mmx_pmovmskb:
+ case Intrinsic::x86_sse2_pmovmskb_128: {
+ // High bits of movmskp{s|d}, pmovmskb are known zero.
+ switch (IntId) {
+ case Intrinsic::x86_sse_movmsk_ps: NumLoBits = 4; break;
+ case Intrinsic::x86_avx_movmsk_ps_256: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_movmsk_pd: NumLoBits = 2; break;
+ case Intrinsic::x86_avx_movmsk_pd_256: NumLoBits = 4; break;
+ case Intrinsic::x86_mmx_pmovmskb: NumLoBits = 8; break;
+ case Intrinsic::x86_sse2_pmovmskb_128: NumLoBits = 16; break;
+ }
+ KnownZero = APInt::getHighBitsSet(Mask.getBitWidth(),
+ Mask.getBitWidth() - NumLoBits);
+ break;
+ }
+ }
+ break;
+ }
}
}
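
For instance, movmskps on a 128-bit vector yields only a 4-bit mask, so with a 32-bit result the upper 28 bits are provably zero. A sketch of what the case above computes for that intrinsic:

    // x86_sse_movmsk_ps: NumLoBits == 4, so for a 32-bit mask width
    APInt Zero = APInt::getHighBitsSet(/*numBits=*/32, /*hiBitsSet=*/32 - 4);
    // Zero == 0xFFFFFFF0: only the low four result bits can be nonzero.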
@@ -11102,23 +12384,132 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
-/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
-/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
-/// if the load addresses are consecutive, non-overlapping, and in the right
-/// order.
+/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the
+/// same as extracting the high 128-bit part of a 256-bit vector and then
+/// inserting the result into the low part of a new 256-bit vector.
+static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ for (int i = 0, j = NumElems/2; i < NumElems/2; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
+/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the
+/// same as extracting the low 128-bit part of a 256-bit vector and then
+/// inserting the result into the high part of a new 256-bit vector.
+static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ for (int i = NumElems/2, j = 0; i < NumElems; ++i, ++j)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), j) ||
+ SVOp->getMaskElt(j) >= 0)
+ return false;
+
+ return true;
+}
+
+/// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors.
+static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ DebugLoc dl = N->getDebugLoc();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ EVT VT = SVOp->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+
+ if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+ V2.getOpcode() == ISD::CONCAT_VECTORS) {
+ //
+ //                  0,0,0,...
+ //                     |
+ //    V      UNDEF    BUILD_VECTOR    UNDEF
+ //     \      /           \           /
+ //  CONCAT_VECTOR      CONCAT_VECTOR
+ //          \                /
+ //           \              /
+ //            RESULT: V + zero extended
+ //
+ if (V2.getOperand(0).getOpcode() != ISD::BUILD_VECTOR ||
+ V2.getOperand(1).getOpcode() != ISD::UNDEF ||
+ V1.getOperand(1).getOpcode() != ISD::UNDEF)
+ return SDValue();
+
+ if (!ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()))
+ return SDValue();
+
+ // To match the shuffle mask, the first half of the mask should
+ // be exactly the first vector, and all the rest a splat with the
+ // first element of the second one.
+ for (int i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(SVOp->getMaskElt(i), i) ||
+ !isUndefOrEqual(SVOp->getMaskElt(i+NumElems/2), NumElems))
+ return SDValue();
+
+ // Emit a zeroed vector and insert the desired subvector on its
+ // first half.
+ SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+ SDValue InsV = Insert128BitVector(Zeros, V1.getOperand(0),
+ DAG.getConstant(0, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Combine some shuffles into subvector extracts and inserts:
+ //
+
+ // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
+ if (isShuffleHigh128VectorInsertLow(SVOp)) {
+ SDValue V = Extract128BitVector(V1, DAG.getConstant(NumElems/2, MVT::i32),
+ DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+ V, DAG.getConstant(0, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
+ if (isShuffleLow128VectorInsertHigh(SVOp)) {
+ SDValue V = Extract128BitVector(V1, DAG.getConstant(0, MVT::i32), DAG, dl);
+ SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT),
+ V, DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
+ return DCI.CombineTo(N, InsV);
+ }
+
+ return SDValue();
+}
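
As a concrete reading of the two mask checks used above, here is a scalar model for v8f32 (NumElems == 8): mask <4, 5, 6, 7, u, u, u, u> moves the high 128-bit half into the low half, which the combine re-expresses as an extract followed by an insert into UNDEF:

    // Scalar model of the isShuffleHigh128VectorInsertLow pattern on v8f32.
    void high_into_low(float Out[8], const float In[8]) {
      for (int i = 0; i != 4; ++i)
        Out[i] = In[4 + i];   // low 128 bits <- high 128 bits
      // Out[4..7] stay undef; the rewrite inserts into an UNDEF vector.
    }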
+
+/// PerformShuffleCombine - Performs several different shuffle combines.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
- if (VT.getSizeInBits() != 128)
- return SDValue();
-
// Don't create instructions with illegal types after legalize types has run.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
return SDValue();
+ // Combine 256-bit vector shuffles. This is only profitable when in AVX mode
+ if (Subtarget->hasAVX() && VT.getSizeInBits() == 256 &&
+ N->getOpcode() == ISD::VECTOR_SHUFFLE)
+ return PerformShuffleCombine256(N, DAG, DCI);
+
+ // Only handle 128-bit wide vectors from here on.
+ if (VT.getSizeInBits() != 128)
+ return SDValue();
+
+ // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
+ // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
+ // consecutive, non-overlapping, and in the right order.
SmallVector<SDValue, 16> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
@@ -11209,7 +12600,8 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
+/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
@@ -11217,14 +12609,16 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Get the LHS/RHS of the select.
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
+ EVT VT = LHS.getValueType();
// If we have SSE[12] support, try to form min/max nodes. SSE min/max
// instructions match the semantics of the common C idiom x<y?x:y but not
// x<=y?x:y, because of how they handle negative zero (which can be
// ignored in unsafe-math mode).
- if (Subtarget->hasSSE2() &&
- (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
- Cond.getOpcode() == ISD::SETCC) {
+ if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
+ VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ (Subtarget->hasXMMInt() ||
+ (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
unsigned Opcode = 0;
@@ -11267,7 +12661,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
if (!UnsafeFPMath &&
- !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS))
+ !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
break;
Opcode = X86ISD::FMAX;
break;
@@ -11680,7 +13074,7 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
// all elements are shifted by the same amount. We can't do this in legalize
// because the constant vector is typically transformed to a constant pool
// so we have no knowledge of the shift amount.
- if (!Subtarget->hasSSE2())
+ if (!Subtarget->hasXMMInt())
return SDValue();
if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
@@ -11796,7 +13190,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
// we're requiring SSE2 for both.
- if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
+ if (Subtarget->hasXMMInt() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue CMP0 = N0->getOperand(1);
@@ -11864,6 +13258,36 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// CanFoldXORWithAllOnes - Test whether the XOR operand is an AllOnes vector
+/// so it can be folded inside ANDNP.
+static bool CanFoldXORWithAllOnes(const SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ // Match direct AllOnes for 128 and 256-bit vectors
+ if (ISD::isBuildVectorAllOnes(N))
+ return true;
+
+ // Look through a bit convert.
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ // Sometimes the operand may come from an insert_subvector building a 256-bit
+ // allones vector.
+ if (VT.getSizeInBits() == 256 &&
+ N->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue V1 = N->getOperand(0);
+ SDValue V2 = N->getOperand(1);
+
+ if (V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V1.getOperand(0).getOpcode() == ISD::UNDEF &&
+ ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
+ ISD::isBuildVectorAllOnes(V2.getNode()))
+ return true;
+ }
+
+ return false;
+}
+
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -11874,11 +13298,28 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
+ EVT VT = N->getValueType(0);
+
+ // Create ANDN instructions
+ if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ DebugLoc DL = N->getDebugLoc();
+
+ // Check LHS for not
+ if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
+ return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
+ // Check RHS for not
+ if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
+ return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
+
+ return SDValue();
+ }
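
The BMI path above targets the usual C idiom for clearing bits through a complement; a small illustrative example (assuming a BMI-capable target):

    #include <cstdint>
    // ~x & y appears in the DAG as (and (xor x, -1), y); the combine
    // folds it into a single X86ISD::ANDN node, i.e. one ANDN instruction.
    uint32_t clear_masked(uint32_t x, uint32_t y) { return ~x & y; }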
+
// Want to form ANDNP nodes:
// 1) In the hopes of then easily combining them with OR and AND nodes
// to form PBLEND/PSIGN.
// 2) To match ANDN packed intrinsics
- EVT VT = N->getValueType(0);
if (VT != MVT::v2i64 && VT != MVT::v4i64)
return SDValue();
@@ -11888,12 +13329,14 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
// Check LHS for vnot
if (N0.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
// Check RHS for vnot
if (N1.getOpcode() == ISD::XOR &&
- ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
+ CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
return SDValue();
@@ -11917,7 +13360,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
// look for psign/blend
- if (Subtarget->hasSSSE3()) {
+ if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) {
if (VT == MVT::v2i64) {
// Canonicalize pandn to RHS
if (N0.getOpcode() == X86ISD::ANDNP)
@@ -11990,13 +13433,13 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
}
}
// PBLENDVB only available on SSE 4.1
- if (!Subtarget->hasSSE41())
+ if (!(Subtarget->hasSSE41() || Subtarget->hasAVX()))
return SDValue();
X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
- Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask);
+ Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y);
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
}
}
@@ -12057,24 +13500,211 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// PerformLOADCombine - Do target-specific dag combines on LOAD nodes.
+static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ EVT RegVT = Ld->getValueType(0);
+ EVT MemVT = Ld->getMemoryVT();
+ DebugLoc dl = Ld->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ ISD::LoadExtType Ext = Ld->getExtensionType();
+
+ // If this is a vector EXT Load then attempt to optimize it using a
+ // shuffle. We need SSE4 for the shuffles.
+ // TODO: It is possible to support ZExt by zeroing the undef values
+ // during the shuffle phase or after the shuffle.
+ if (RegVT.isVector() && Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+ assert(MemVT != RegVT && "Cannot extend to the same type");
+ assert(MemVT.isVector() && "Must load a vector from memory");
+
+ unsigned NumElems = RegVT.getVectorNumElements();
+ unsigned RegSz = RegVT.getSizeInBits();
+ unsigned MemSz = MemVT.getSizeInBits();
+ assert(RegSz > MemSz && "Register size must be greater than the mem size");
+ // All sizes must be a power of two
+ if (!isPowerOf2_32(RegSz * MemSz * NumElems)) return SDValue();
+
+ // Attempt to load the original value using a single load op.
+ // Find a scalar type which is equal to the loaded word size.
+ MVT SclrLoadTy = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() == MemSz) {
+ SclrLoadTy = Tp;
+ break;
+ }
+ }
+
+ // Proceed if a load word is found.
+ if (SclrLoadTy.getSizeInBits() != MemSz) return SDValue();
+
+ EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
+ RegSz/SclrLoadTy.getSizeInBits());
+
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ RegSz/MemVT.getScalarType().getSizeInBits());
+ // Can't shuffle using an illegal type.
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+ // Perform a single load.
+ SDValue ScalarLoad = DAG.getLoad(SclrLoadTy, dl, Ld->getChain(),
+ Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->getAlignment());
+
+ // Insert the word loaded into a vector.
+ SDValue ScalarInVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ LoadUnitVecVT, ScalarLoad);
+
+ // Bitcast the loaded value to a vector of the original element type, in
+ // the size of the target vector type.
+ SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, ScalarInVector);
+ unsigned SizeRatio = RegSz/MemSz;
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; i++) ShuffleVec[i*SizeRatio] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(SlicedVec.getValueType()),
+ ShuffleVec.data());
+
+ // Bitcast to the requested type.
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+ // Replace the original load with the new sequence
+ // and return the new chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Shuff);
+ return SDValue(ScalarLoad.getNode(), 1);
+ }
+
+ return SDValue();
+}
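
A rough scalar model of this rewrite for one case, an EXTLOAD of <4 x i8> widened to <4 x i32> (MemSz == 32, RegSz == 128, SizeRatio == 4). The function name is illustrative, the byte order assumes little-endian x86, and the zeroed lanes stand in for what are really undef lanes:

    #include <cstdint>
    #include <cstring>
    void extload_v4i8_to_v4i32(uint8_t Wide[16], const uint8_t *Src) {
      uint32_t Word;                   // SclrLoadTy == i32
      std::memcpy(&Word, Src, 4);      // the single scalar load
      std::memset(Wide, 0, 16);        // WideVecVT == v16i8
      for (int i = 0; i != 4; ++i)
        Wide[i * 4] = uint8_t(Word >> (8 * i));  // ShuffleVec[i*SizeRatio] = i
    }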
+
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT VT = St->getValue().getValueType();
+ EVT StVT = St->getMemoryVT();
+ DebugLoc dl = St->getDebugLoc();
+ SDValue StoredVal = St->getOperand(1);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // If we are saving a concatenation of two XMM registers, perform two stores.
+ // This is better on Sandy Bridge because one 256-bit mem op is done via two
+ // 128-bit ones. If in the future the cost becomes only one memory access, the
+ // first version would be better.
+ if (VT.getSizeInBits() == 256 &&
+ StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
+ StoredVal.getNumOperands() == 2) {
+
+ SDValue Value0 = StoredVal.getOperand(0);
+ SDValue Value1 = StoredVal.getOperand(1);
+
+ SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
+ SDValue Ptr0 = St->getBasePtr();
+ SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride);
+
+ SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
+ }
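
For example, storing a <8 x float> that was built by concatenating two 128-bit values becomes the following pair of stores; the intrinsics are only an illustration, since the combine operates on the DAG:

    #include <immintrin.h>
    void store_two_halves(float *Ptr, __m128 Lo, __m128 Hi) {
      _mm_storeu_ps(Ptr,     Lo);   // Ch0: Value0 -> Ptr0
      _mm_storeu_ps(Ptr + 4, Hi);   // Ch1: Value1 -> Ptr0 + Stride (16 bytes)
    }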
+
+ // Optimize trunc store (of multiple scalars) to shuffle and store.
+ // First, pack all of the elements in one place. Next, store to memory
+ // in fewer chunks.
+ if (St->isTruncatingStore() && VT.isVector()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElems = VT.getVectorNumElements();
+ assert(StVT != VT && "Cannot truncate to the same type");
+ unsigned FromSz = VT.getVectorElementType().getSizeInBits();
+ unsigned ToSz = StVT.getVectorElementType().getSizeInBits();
+
+ // From, To sizes and ElemCount must be pow of two
+ if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
+ // We are going to use the original vector elt for storing.
+ // Accumulated smaller vector elements must be a multiple of the store size.
+ if (0 != (NumElems * FromSz) % ToSz) return SDValue();
+
+ unsigned SizeRatio = FromSz / ToSz;
+
+ assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
+
+ // Create a type on which we perform the shuffle
+ EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StVT.getScalarType(), NumElems*SizeRatio);
+
+ assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SDValue WideVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, St->getValue());
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i < NumElems; i++ ) ShuffleVec[i] = i * SizeRatio;
+
+ // Can't shuffle using an illegal type
+ if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
+ DAG.getUNDEF(WideVec.getValueType()),
+ ShuffleVec.data());
+ // At this point all of the data is stored at the bottom of the
+ // register. We now need to save it to mem.
+
+ // Find the largest store unit
+ MVT StoreType = MVT::i8;
+ for (unsigned tp = MVT::FIRST_INTEGER_VALUETYPE;
+ tp < MVT::LAST_INTEGER_VALUETYPE; ++tp) {
+ MVT Tp = (MVT::SimpleValueType)tp;
+ if (TLI.isTypeLegal(Tp) && StoreType.getSizeInBits() < NumElems * ToSz)
+ StoreType = Tp;
+ }
+
+ // Bitcast the original vector into a vector of store-size units
+ EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
+ StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
+ assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+ SDValue ShuffWide = DAG.getNode(ISD::BITCAST, dl, StoreVecVT, Shuff);
+ SmallVector<SDValue, 8> Chains;
+ SDValue Increment = DAG.getConstant(StoreType.getSizeInBits()/8,
+ TLI.getPointerTy());
+ SDValue Ptr = St->getBasePtr();
+
+ // Perform one or more big stores into memory.
+ for (unsigned i = 0; i < (ToSz*NumElems)/StoreType.getSizeInBits() ; i++) {
+ SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ StoreType, ShuffWide,
+ DAG.getIntPtrConstant(i));
+ SDValue Ch = DAG.getStore(St->getChain(), dl, SubVec, Ptr,
+ St->getPointerInfo(), St->isVolatile(),
+ St->isNonTemporal(), St->getAlignment());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ Chains.push_back(Ch);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
+ Chains.size());
+ }
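
A scalar model of one instance, a truncating store of <4 x i32> as <4 x i8> (FromSz == 32, ToSz == 8, SizeRatio == 4): the shuffle packs the four low bytes together so a single i32-sized store replaces four i8 stores. Names are illustrative:

    #include <cstdint>
    #include <cstring>
    void truncstore_v4i32_as_v4i8(uint8_t *Dst, const uint32_t Val[4]) {
      uint8_t Packed[4];
      for (int i = 0; i != 4; ++i)
        Packed[i] = uint8_t(Val[i]);   // ShuffleVec[i] = i * SizeRatio
      std::memcpy(Dst, Packed, 4);     // one store of StoreType == i32
    }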
+
+
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
// places to insert EMMS. This qualifies as a quick hack.
// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
- StoreSDNode *St = cast<StoreSDNode>(N);
- EVT VT = St->getValue().getValueType();
if (VT.getSizeInBits() != 64)
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
- && Subtarget->hasSSE2();
+ && Subtarget->hasXMMInt();
if ((VT.isVector() ||
(VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
isa<LoadSDNode>(St->getValue()) &&
@@ -12172,6 +13802,150 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// isHorizontalBinOp - Return 'true' if this vector operation is "horizontal"
+/// and return the operands for the horizontal operation in LHS and RHS. A
+/// horizontal operation performs the binary operation on successive elements
+/// of its first operand, then on successive elements of its second operand,
+/// returning the resulting values in a vector. For example, if
+/// A = < float a0, float a1, float a2, float a3 >
+/// and
+/// B = < float b0, float b1, float b2, float b3 >
+/// then the result of doing a horizontal operation on A and B is
+/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
+/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
+/// A horizontal-op B, for some already available A and B, and if so then LHS is
+/// set to A, RHS to B, and the routine returns 'true'.
+/// Note that the binary operation should have the property that if one of the
+/// operands is UNDEF then the result is UNDEF.
+static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool isCommutative) {
+ // Look for the following pattern: if
+ // A = < float a0, float a1, float a2, float a3 >
+ // B = < float b0, float b1, float b2, float b3 >
+ // and
+ // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
+ // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
+ // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
+ // which is A horizontal-op B.
+
+ // At least one of the operands should be a vector shuffle.
+ if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ EVT VT = LHS.getValueType();
+ unsigned N = VT.getVectorNumElements();
+
+ // View LHS in the form
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // If LHS is not a shuffle then pretend it is the shuffle
+ // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
+ // NOTE: in what follows a default initialized SDValue represents an UNDEF of
+ // type VT.
+ SDValue A, B;
+ SmallVector<int, 8> LMask(N);
+ if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (LHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ A = LHS.getOperand(0);
+ if (LHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ B = LHS.getOperand(1);
+ cast<ShuffleVectorSDNode>(LHS.getNode())->getMask(LMask);
+ } else {
+ if (LHS.getOpcode() != ISD::UNDEF)
+ A = LHS;
+ for (unsigned i = 0; i != N; ++i)
+ LMask[i] = i;
+ }
+
+ // Likewise, view RHS in the form
+ // RHS = VECTOR_SHUFFLE C, D, RMask
+ SDValue C, D;
+ SmallVector<int, 8> RMask(N);
+ if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (RHS.getOperand(0).getOpcode() != ISD::UNDEF)
+ C = RHS.getOperand(0);
+ if (RHS.getOperand(1).getOpcode() != ISD::UNDEF)
+ D = RHS.getOperand(1);
+ cast<ShuffleVectorSDNode>(RHS.getNode())->getMask(RMask);
+ } else {
+ if (RHS.getOpcode() != ISD::UNDEF)
+ C = RHS;
+ for (unsigned i = 0; i != N; ++i)
+ RMask[i] = i;
+ }
+
+ // Check that the shuffles are both shuffling the same vectors.
+ if (!(A == C && B == D) && !(A == D && B == C))
+ return false;
+
+ // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
+ if (!A.getNode() && !B.getNode())
+ return false;
+
+ // If A and B occur in reverse order in RHS, then "swap" them (which means
+ // rewriting the mask).
+ if (A != C)
+ for (unsigned i = 0; i != N; ++i) {
+ unsigned Idx = RMask[i];
+ if (Idx < N)
+ RMask[i] += N;
+ else if (Idx < 2*N)
+ RMask[i] -= N;
+ }
+
+ // At this point LHS and RHS are equivalent to
+ // LHS = VECTOR_SHUFFLE A, B, LMask
+ // RHS = VECTOR_SHUFFLE A, B, RMask
+ // Check that the masks correspond to performing a horizontal operation.
+ for (unsigned i = 0; i != N; ++i) {
+ unsigned LIdx = LMask[i], RIdx = RMask[i];
+
+ // Ignore any UNDEF components.
+ if (LIdx >= 2*N || RIdx >= 2*N || (!A.getNode() && (LIdx < N || RIdx < N))
+ || (!B.getNode() && (LIdx >= N || RIdx >= N)))
+ continue;
+
+ // Check that successive elements are being operated on. If not, this is
+ // not a horizontal operation.
+ if (!(LIdx == 2*i && RIdx == 2*i + 1) &&
+ !(isCommutative && LIdx == 2*i + 1 && RIdx == 2*i))
+ return false;
+ }
+
+ LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
+ RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
+ return true;
+}
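
Writing out the doc comment's definition for v4f32 makes the pattern concrete; this scalar model is what the FHADD node emitted below computes (it matches HADDPS):

    void fhadd_v4f32(float R[4], const float A[4], const float B[4]) {
      R[0] = A[0] + A[1];   // successive elements of the first operand
      R[1] = A[2] + A[3];
      R[2] = B[0] + B[1];   // then of the second operand
      R[3] = B[2] + B[3];
    }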
+
+/// PerformFADDCombine - Do target-specific dag combines on floating point adds.
+static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal adds from adds of shuffles.
+ if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ isHorizontalBinOp(LHS, RHS, true))
+ return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
+
+/// PerformFSUBCombine - Do target-specific dag combines on floating point subs.
+static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ // Try to synthesize horizontal subs from subs of shuffles.
+ if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
+ (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+ isHorizontalBinOp(LHS, RHS, false))
+ return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+ return SDValue();
+}
+
/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
@@ -12326,7 +14100,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
// (add Y, (setne X, 0)) -> sbb -1, Y
// (sub (sete X, 0), Y) -> sbb 0, Y
// (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) {
+static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
DebugLoc DL = N->getDebugLoc();
// Look through ZExts.
@@ -12362,6 +14136,31 @@ static SDValue OptimizeConditonalInDecrement(SDNode *N, SelectionDAG &DAG) {
DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // X86 can't encode an immediate LHS of a sub. See if we can push the
+ // negation into a preceding instruction.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
+ // If the RHS of the sub is a XOR with one use and a constant, invert the
+ // immediate. Then add one to the LHS of the sub so we can turn
+ // X-Y -> X+~Y+1, saving one register.
+ if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
+ isa<ConstantSDNode>(Op1.getOperand(1))) {
+ APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
+ EVT VT = Op0.getValueType();
+ SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+ Op1.getOperand(0),
+ DAG.getConstant(~XorC, VT));
+ return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+ DAG.getConstant(C->getAPIntValue()+1, VT));
+ }
+ }
+
+ return OptimizeConditionalInDecrement(N, DAG);
+}
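
A worked instance of the identity behind this rewrite, with C == 5 and XorC == 3: since ~(x ^ k) == x ^ ~k and -v == ~v + 1 in two's complement, 5 - (x ^ 3) == (x ^ ~3) + 6 for every x, and the immediate-LHS subtract disappears:

    #include <cstdint>
    uint32_t before(uint32_t x) { return 5u - (x ^ 3u); }
    uint32_t after (uint32_t x) { return (x ^ ~3u) + 6u; }  // NewXor plus C + 1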
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12369,10 +14168,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
default: break;
case ISD::EXTRACT_VECTOR_ELT:
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
+ case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
- case ISD::ADD:
- case ISD::SUB: return OptimizeConditonalInDecrement(N, DAG);
+ case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG);
+ case ISD::SUB: return PerformSubCombine(N, DAG);
case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
case ISD::SHL:
@@ -12380,8 +14180,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
case ISD::AND: return PerformAndCombine(N, DAG, DCI, Subtarget);
case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget);
+ case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
+ case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
+ case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
@@ -12398,14 +14201,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PUNPCKHQDQ:
case X86ISD::UNPCKHPS:
case X86ISD::UNPCKHPD:
+ case X86ISD::VUNPCKHPSY:
+ case X86ISD::VUNPCKHPDY:
case X86ISD::PUNPCKLBW:
case X86ISD::PUNPCKLWD:
case X86ISD::PUNPCKLDQ:
case X86ISD::PUNPCKLQDQ:
case X86ISD::UNPCKLPS:
case X86ISD::UNPCKLPD:
- case X86ISD::VUNPCKLPS:
- case X86ISD::VUNPCKLPD:
case X86ISD::VUNPCKLPSY:
case X86ISD::VUNPCKLPDY:
case X86ISD::MOVHLPS:
@@ -12415,7 +14218,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PSHUFLW:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
- case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
+ case X86ISD::VPERMILPS:
+ case X86ISD::VPERMILPSY:
+ case X86ISD::VPERMILPD:
+ case X86ISD::VPERMILPDY:
+ case X86ISD::VPERM2F128:
+ case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
}
return SDValue();
@@ -12551,7 +14359,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces[1] == "${0:q}")) {
// No need to check constraints, nothing other than the equivalent of
// "=r,0" would be valid here.
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12572,7 +14380,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
AsmPieces[3] == "~{fpsr}") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12603,7 +14411,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces[1] == "~{dirflag}" &&
AsmPieces[2] == "~{flags}" &&
AsmPieces[3] == "~{fpsr}") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12629,7 +14437,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
SplitString(AsmPieces[2], Words, " \t,");
if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
Words[2] == "%edx") {
- const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
if (!Ty || Ty->getBitWidth() % 16 != 0)
return false;
return IntrinsicLowering::LowerToByteSwap(CI);
@@ -12700,7 +14508,7 @@ TargetLowering::ConstraintWeight
// but allow it at the lowest weight.
if (CallOperandVal == NULL)
return CW_Default;
- const Type *type = CallOperandVal->getType();
+ Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index b603678..342a5e6 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -175,8 +175,14 @@ namespace llvm {
/// PSIGNB/W/D - Copy integer sign.
PSIGNB, PSIGNW, PSIGND,
- /// PBLENDVB - Variable blend
- PBLENDVB,
+ /// BLEND family of opcodes
+ BLENDV,
+
+ /// FHADD - Floating point horizontal add.
+ FHADD,
+
+ /// FHSUB - Floating point horizontal sub.
+ FHSUB,
/// FMAX, FMIN - Floating point max and min.
///
@@ -222,6 +228,8 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
+ ANDN, // ANDN - Bitwise AND NOT with FLAGS results.
+
UMUL, // LOW, HI, FLAGS = umul LHS, RHS
// MUL_IMM - X86 specific multiply by immediate.
@@ -257,12 +265,12 @@ namespace llvm {
MOVSS,
UNPCKLPS,
UNPCKLPD,
- VUNPCKLPS,
- VUNPCKLPD,
VUNPCKLPSY,
VUNPCKLPDY,
UNPCKHPS,
UNPCKHPD,
+ VUNPCKHPSY,
+ VUNPCKHPDY,
PUNPCKLBW,
PUNPCKLWD,
PUNPCKLDQ,
@@ -271,6 +279,12 @@ namespace llvm {
PUNPCKHWD,
PUNPCKHDQ,
PUNPCKHQDQ,
+ VPERMILPS,
+ VPERMILPSY,
+ VPERMILPD,
+ VPERMILPDY,
+ VPERM2F128,
+ VBROADCAST,
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded
@@ -280,6 +294,11 @@ namespace llvm {
// WIN_ALLOCA - Windows's _chkstk call to do stack probing.
WIN_ALLOCA,
+ // SEG_ALLOCA - For allocating variable amounts of stack space when using
+ // segmented stacks. Checks if the current stacklet has enough space, and
+ // falls back to heap allocation if not.
+ SEG_ALLOCA,
+
// Memory barrier
MEMBARRIER,
MFENCE,
@@ -297,9 +316,10 @@ namespace llvm {
ATOMNAND64_DAG,
ATOMSWAP64_DAG,
- // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
+ // LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
LCMPXCHG_DAG,
LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
// VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
@@ -407,20 +427,16 @@ namespace llvm {
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
- bool isMOVSHDUPMask(ShuffleVectorSDNode *N);
+ bool isMOVSHDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget);
/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
- bool isMOVSLDUPMask(ShuffleVectorSDNode *N);
+ bool isMOVSLDUPMask(ShuffleVectorSDNode *N, const X86Subtarget *Subtarget);
/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool isMOVDDUPMask(ShuffleVectorSDNode *N);
- /// isPALIGNRMask - Return true if the specified VECTOR_SHUFFLE operand
- /// specifies a shuffle of elements that is suitable for input to PALIGNR.
- bool isPALIGNRMask(ShuffleVectorSDNode *N);
-
/// isVEXTRACTF128Index - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for input to VEXTRACTF128.
@@ -505,7 +521,7 @@ namespace llvm {
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
- virtual unsigned getByValTypeAlignment(const Type *Ty) const;
+ virtual unsigned getByValTypeAlignment(Type *Ty) const;
/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
@@ -564,8 +580,8 @@ namespace llvm {
/// DAG node.
virtual const char *getTargetNodeName(unsigned Opcode) const;
- /// getSetCCResultType - Return the ISD::SETCC ValueType
- virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+ /// getSetCCResultType - Return the value type to use for ISD::SETCC.
+ virtual EVT getSetCCResultType(EVT VT) const;
/// computeMaskedBitsForTargetNode - Determine which of the bits specified
/// in Mask are known to be either zero or one and return them in the
@@ -617,12 +633,12 @@ namespace llvm {
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
- virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+ virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
/// isTruncateFree - Return true if it's free to truncate a value of
/// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
/// register EAX to i16 by referencing its sub-register AX.
- virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
/// isZExtFree - Return true if any actual instruction that defines a
@@ -633,7 +649,7 @@ namespace llvm {
/// does not necessarily apply to truncate instructions. e.g. on x86-64,
/// all instructions that define 32-bit values implicit zero-extend the
/// result out to 64 bits.
- virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+ virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
/// isNarrowingProfitable - Return true if it's profitable to narrow
@@ -813,11 +829,14 @@ namespace llvm {
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
@@ -825,6 +844,7 @@ namespace llvm {
SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
// Utility functions to help LowerVECTOR_SHUFFLE
@@ -931,6 +951,10 @@ namespace llvm {
MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ bool Is64Bit) const;
+
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI,
MachineBasicBlock *BB) const;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 9f7a4b0..74b647a 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -650,6 +650,15 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
let isCodeGenOnly = 1;
}
+// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
+class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+ : ITy<opcode, MRMSrcReg, typeinfo, (outs),
+ (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
+ mnemonic, "{$src2, $src1|$src1, $src2}", []> {
+ // The disassembler should know about this, but not the asmparser.
+ let isCodeGenOnly = 1;
+}
+
// BinOpRM - Instructions like "add reg, reg, [mem]".
class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
dag outlist, list<dag> pattern>
@@ -857,11 +866,10 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
// BinOpAI - Instructions like "add %eax, %eax, imm".
class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- Register areg>
+ Register areg, string operands>
: ITy<opcode, RawFrm, typeinfo,
(outs), (ins typeinfo.ImmOperand:$src),
- mnemonic, !strconcat("{$src, %", areg.AsmName, "|%",
- areg.AsmName, ", $src}"), []> {
+ mnemonic, operands, []> {
let ImmT = typeinfo.ImmEncoding;
let Uses = [areg];
let Defs = [areg];
@@ -926,10 +934,14 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def #NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
def #NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
- def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL>;
- def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX>;
- def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX>;
- def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX>;
+ def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
}
}
@@ -993,10 +1005,14 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def #NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
def #NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
- def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL>;
- def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX>;
- def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX>;
- def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX>;
+ def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
}
}
@@ -1017,10 +1033,10 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def #NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
} // isCommutable
- def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
- def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
- def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
- def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+ def #NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+ def #NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+ def #NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+ def #NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
def #NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
def #NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
@@ -1056,10 +1072,14 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
def #NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
def #NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
- def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL>;
- def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX>;
- def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX>;
- def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX>;
+ def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
}
}
@@ -1117,9 +1137,37 @@ let Defs = [EFLAGS] in {
def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
- def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL>;
- def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX>;
- def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX>;
- def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX>;
-}
+ def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+
+ // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+ // register class is constrained to GR8_NOREX.
+ let isPseudo = 1 in
+ def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+ "", []>;
+}
+//===----------------------------------------------------------------------===//
+// ANDN Instruction
+//
+multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
+ PatFrag ld_frag> {
+ def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))]>;
+ def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+ !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst, EFLAGS,
+ (X86andn_flag RC:$src1, (ld_frag addr:$src2)))]>;
+}
+
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32>, T8, VEX_4V;
+ defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W;
+}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index adcc747..da28690 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -106,6 +106,26 @@ let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
"# dynamic stack allocation",
[(X86WinAlloca)]>;
+
+// When using segmented stacks these are lowered into instructions which first
+// check if the current stacklet has enough free memory. If it does, memory is
+// allocated by bumping the stack pointer. Otherwise memory is allocated from
+// the heap.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP, EAX] in
+def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR32:$dst,
+ (X86SegAlloca GR32:$size))]>,
+ Requires<[In32BitMode]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP, RAX] in
+def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca for segmented stacks",
+ [(set GR64:$dst,
+ (X86SegAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
+
}
@@ -329,18 +349,11 @@ def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions
-let Constraints = "$src1 = $dst" in {
-
-// Conditional moves
-let Uses = [EFLAGS] in {
-
// X86 doesn't have 8-bit conditional moves. Use a customInserter to
// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
// however that requires promoting the operands, and can induce additional
-// i8 register pressure. Note that CMOV_GR8 is conservatively considered to
-// clobber EFLAGS, because if one of the operands is zero, the expansion
-// could involve an xor.
-let usesCustomInserter = 1, Constraints = "", Defs = [EFLAGS] in {
+// i8 register pressure.
+let usesCustomInserter = 1, Uses = [EFLAGS] in {
def CMOV_GR8 : I<0, Pseudo,
(outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
"#CMOV_GR8 PSEUDO!",
@@ -380,10 +393,7 @@ def CMOV_RFP80 : I<0, Pseudo,
(X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
EFLAGS))]>;
} // Predicates = [NoCMov]
-} // UsesCustomInserter = 1, Constraints = "", Defs = [EFLAGS]
-} // Uses = [EFLAGS]
-
-} // Constraints = "$src1 = $dst" in
+} // UsesCustomInserter = 1, Uses = [EFLAGS]
//===----------------------------------------------------------------------===//
@@ -532,7 +542,7 @@ def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
let hasSideEffects = 1 in
def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
"#MEMBARRIER",
- [(X86MemBarrier)]>, Requires<[HasSSE2]>;
+ [(X86MemBarrier)]>;
// TODO: Get this to fold the constant into the instruction.
let hasSideEffects = 1, Defs = [ESP], isCodeGenOnly = 1 in
@@ -630,8 +640,8 @@ def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
-defm LOCK_AND : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM4m, "and">;
-defm LOCK_XOR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM6m, "xor">;
+defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
+defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
// Optimized codegen when the non-memory output is not used.
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
@@ -665,12 +675,20 @@ def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
// Atomic compare and swap.
let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- isCodeGenOnly = 1 in {
+ isCodeGenOnly = 1 in
def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
"lock\n\t"
"cmpxchg8b\t$ptr",
[(X86cas8 addr:$ptr)]>, TB, LOCK;
-}
+
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ isCodeGenOnly = 1 in
+def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
+ "lock\n\t"
+ "cmpxchg16b\t$ptr",
+ [(X86cas16 addr:$ptr)]>, TB, LOCK,
+ Requires<[HasCmpxchg16b]>;
+
let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in {
def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
"lock\n\t"
@@ -695,7 +713,7 @@ def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in {
def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
"lock\n\t"
- "cmpxchgq\t$swap,$ptr",
+ "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}",
[(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
}
@@ -718,11 +736,37 @@ def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
TB, LOCK;
def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr),
"lock\n\t"
- "xadd\t$val, $ptr",
+ "xadd{q}\t{$val, $ptr|$ptr, $val}",
[(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
TB, LOCK;
}
+def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR8:$dst, (atomic_load_8 addr:$src))]>;
+def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
+def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
+def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
+ "#ACQUIRE_MOV PSEUDO!",
+ [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+
+def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_8 addr:$dst, GR8 :$src)]>;
+def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_16 addr:$dst, GR16:$src)]>;
+def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_32 addr:$dst, GR32:$src)]>;
+def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
+ "#RELEASE_MOV PSEUDO!",
+ [(atomic_store_64 addr:$dst, GR64:$src)]>;
+
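(These pseudos exist because x86's strong, TSO-like memory model lets an
acquire load and a release store be emitted as ordinary MOVs, with no fence.
A minimal C++ sketch of the operations they match:)

    #include <atomic>

    std::atomic<int> Flag;

    int ReadAcquire() {
      return Flag.load(std::memory_order_acquire);   // a plain MOV load on x86
    }

    void WriteRelease(int V) {
      Flag.store(V, std::memory_order_release);      // a plain MOV store on x86
    }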
//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions.
//===----------------------------------------------------------------------===//
@@ -759,6 +803,24 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in {
[(set VR128:$dst,
(v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
EFLAGS)))]>;
+ def CMOV_V8F32 : I<0, Pseudo,
+ (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+ "#CMOV_V8F32 PSEUDO!",
+ [(set VR256:$dst,
+ (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V4F64 : I<0, Pseudo,
+ (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+ "#CMOV_V4F64 PSEUDO!",
+ [(set VR256:$dst,
+ (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+ EFLAGS)))]>;
+ def CMOV_V4I64 : I<0, Pseudo,
+ (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
+ "#CMOV_V4I64 PSEUDO!",
+ [(set VR256:$dst,
+ (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
+ EFLAGS)))]>;
}
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 2e1d523..e62e6b7 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -76,12 +76,12 @@ def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
// except that they use GR32_NOREX for the output operand register class
// instead of GR32. This allows them to operate on h registers on x86-64.
def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
- (outs GR32_NOREX:$dst), (ins GR8:$src),
+ (outs GR32_NOREX:$dst), (ins GR8_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
[]>, TB;
let mayLoad = 1 in
def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
- (outs GR32_NOREX:$dst), (ins i8mem:$src),
+ (outs GR32_NOREX:$dst), (ins i8mem_NOREX:$src),
"movz{bl|x}\t{$src, $dst|$dst, $src}",
[]>, TB;
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 6d89bcc..0a1590b 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -113,6 +113,7 @@ class VEX_W { bit hasVEX_WPrefix = 1; }
class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
class VEX_L { bit hasVEX_L = 1; }
+class VEX_LIG { bit ignoresVEX_L = 1; }
class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; }
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
@@ -150,6 +151,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit hasVEX_i8ImmReg = 0; // Does this inst require the last source register
// to be encoded in a immediate field?
bit hasVEX_L = 0; // Does this inst use large (256-bit) registers?
+ bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit?
bit has3DNow0F0FOpcode = 0; // Wacky 3DNow! encoding?
// TSFlags layout should be kept in sync with X86InstrInfo.h.
@@ -169,7 +171,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{35} = hasVEX_4VPrefix;
let TSFlags{36} = hasVEX_i8ImmReg;
let TSFlags{37} = hasVEX_L;
- let TSFlags{38} = has3DNow0F0FOpcode;
+ let TSFlags{38} = ignoresVEX_L;
+ let TSFlags{39} = has3DNow0F0FOpcode;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>
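(Because the new bit shifts has3DNow0F0FOpcode from bit 38 to bit 39, anything
that unpacks TSFlags has to move in lockstep — per the comment above, the
authoritative layout lives in X86InstrInfo.h. Hypothetical decode helpers
mirroring the lets above, not LLVM API:)

    #include <cstdint>

    // Hypothetical accessors for the TSFlags bits assigned above.
    bool IgnoresVEX_L(uint64_t TSFlags)   { return (TSFlags >> 38) & 1; }
    bool Has3DNow0F0F(uint64_t TSFlags)   { return (TSFlags >> 39) & 1; }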
@@ -501,6 +504,9 @@ class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern>
: PDI<o, F, outs, ins, asm, pattern>, REX_W;
+class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern>
+ : VPDI<o, F, outs, ins, asm, pattern>, VEX_W;
// MMX Instruction templates
//
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index b00109c..af919fb 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -39,6 +39,8 @@ def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>;
+def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
+def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>;
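(The new X86fhadd and X86fhsub nodes model the SSE3/AVX horizontal add and
subtract instructions, haddps and friends. For orientation, a scalar C++
sketch of the v4f32 haddps semantics:)

    // r = haddps(a, b): pairwise sums, low half from a, high half from b.
    void haddps(const float a[4], const float b[4], float r[4]) {
      r[0] = a[0] + a[1];
      r[1] = a[2] + a[3];
      r[2] = b[0] + b[1];
      r[3] = b[2] + b[3];
    }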
@@ -49,18 +51,15 @@ def X86pshufb : SDNode<"X86ISD::PSHUFB",
def X86andnp : SDNode<"X86ISD::ANDNP",
SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
-def X86psignb : SDNode<"X86ISD::PSIGNB",
+def X86psignb : SDNode<"X86ISD::PSIGNB",
SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
-def X86psignw : SDNode<"X86ISD::PSIGNW",
+def X86psignw : SDNode<"X86ISD::PSIGNW",
SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
-def X86psignd : SDNode<"X86ISD::PSIGND",
+def X86psignd : SDNode<"X86ISD::PSIGND",
SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
-def X86pblendv : SDNode<"X86ISD::PBLENDVB",
- SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>>;
def X86pextrb : SDNode<"X86ISD::PEXTRB",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
def X86pextrw : SDNode<"X86ISD::PEXTRW",
@@ -109,6 +108,8 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+
def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -133,12 +134,15 @@ def X86Movhlpd : SDNode<"X86ISD::MOVHLPD", SDTShuff2Op>;
def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
-def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>;
-def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>;
+def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>;
+def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>;
def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>;
def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>;
-def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>;
-def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>;
+
+def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>;
+def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>;
+def X86Unpckhpsy : SDNode<"X86ISD::VUNPCKHPSY", SDTShuff2Op>;
+def X86Unpckhpdy : SDNode<"X86ISD::VUNPCKHPDY", SDTShuff2Op>;
def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>;
def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>;
@@ -150,6 +154,15 @@ def X86Punpckhwd : SDNode<"X86ISD::PUNPCKHWD", SDTShuff2Op>;
def X86Punpckhdq : SDNode<"X86ISD::PUNPCKHDQ", SDTShuff2Op>;
def X86Punpckhqdq : SDNode<"X86ISD::PUNPCKHQDQ", SDTShuff2Op>;
+def X86VPermilps : SDNode<"X86ISD::VPERMILPS", SDTShuff2OpI>;
+def X86VPermilpsy : SDNode<"X86ISD::VPERMILPSY", SDTShuff2OpI>;
+def X86VPermilpd : SDNode<"X86ISD::VPERMILPD", SDTShuff2OpI>;
+def X86VPermilpdy : SDNode<"X86ISD::VPERMILPDY", SDTShuff2OpI>;
+
+def X86VPerm2f128 : SDNode<"X86ISD::VPERM2F128", SDTShuff3OpI>;
+
+def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+
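(X86VBroadcast models the AVX vbroadcast family: replicate one scalar,
typically loaded from memory, into every vector lane — SDTVBroadcast only
constrains the result to be a vector. A scalar C++ sketch for a v8f32
broadcast:)

    // r = vbroadcastss(p): one scalar load splatted into all eight lanes.
    void vbroadcastss(const float *p, float r[8]) {
      for (int i = 0; i < 8; ++i)
        r[i] = *p;
    }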
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
@@ -193,17 +206,28 @@ def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
def loadv8i32 : PatFrag<(ops node:$ptr), (v8i32 (load node:$ptr))>;
def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
-// Like 'store', but always requires vector alignment.
+// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAlignment() >= 16;
}]>;
-// Like 'load', but always requires vector alignment.
+// Like 'store', but always requires 256-bit vector alignment.
+def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 32;
+}]>;
+
+// Like 'load', but always requires 128-bit vector alignment.
def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;
+// Like 'load', but always requires 256-bit vector alignment.
+def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 32;
+}]>;
+
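(The predicates compare the alignment recorded on the load/store node against
the vector width in bytes: 16 for 128-bit accesses, 32 for the new 256-bit AVX
accesses. The runtime analogue of that compile-time check, as a hypothetical
C++ helper:)

    #include <cstdint>

    // True if Ptr meets the 32-byte alignment an aligned 256-bit access needs.
    bool IsAligned256(const void *Ptr) {
      return (reinterpret_cast<uintptr_t>(Ptr) & 31) == 0;
    }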
def alignedloadfsf32 : PatFrag<(ops node:$ptr),
(f32 (alignedload node:$ptr))>;
def alignedloadfsf64 : PatFrag<(ops node:$ptr),
@@ -221,13 +245,13 @@ def alignedloadv2i64 : PatFrag<(ops node:$ptr),
// 256-bit aligned load pattern fragments
def alignedloadv8f32 : PatFrag<(ops node:$ptr),
- (v8f32 (alignedload node:$ptr))>;
+ (v8f32 (alignedload256 node:$ptr))>;
def alignedloadv4f64 : PatFrag<(ops node:$ptr),
- (v4f64 (alignedload node:$ptr))>;
+ (v4f64 (alignedload256 node:$ptr))>;
def alignedloadv8i32 : PatFrag<(ops node:$ptr),
- (v8i32 (alignedload node:$ptr))>;
+ (v8i32 (alignedload256 node:$ptr))>;
def alignedloadv4i64 : PatFrag<(ops node:$ptr),
- (v4i64 (alignedload node:$ptr))>;
+ (v4i64 (alignedload256 node:$ptr))>;
// Like 'load', but uses special alignment checks suitable for use in
// memory operands in most SSE instructions, which are required to
@@ -356,7 +380,7 @@ def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N));
}]>;
-// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
+// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
// VINSERTF128 imm.
def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
@@ -398,16 +422,6 @@ def movl : PatFrag<(ops node:$lhs, node:$rhs),
return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
}]>;
-def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
@@ -418,16 +432,6 @@ def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
}]>;
-def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
(vector_shuffle node:$lhs, node:$rhs), [{
return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
@@ -448,11 +452,6 @@ def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
}], SHUFFLE_get_pshuflw_imm>;
-def palign : PatFrag<(ops node:$lhs, node:$rhs),
- (vector_shuffle node:$lhs, node:$rhs), [{
- return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
-}], SHUFFLE_get_palign_imm>;
-
def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
node:$index), [{
@@ -465,3 +464,4 @@ def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index), [{
return X86::isVINSERTF128Index(N);
}], INSERT_get_vinsertf128_imm>;
+
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 55b5835..3a02de0 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -53,6 +53,36 @@ ReMatPICStubLoad("remat-pic-stub-load",
cl::desc("Re-materialize load from stub in PIC mode"),
cl::init(false), cl::Hidden);
+enum {
+ // Select which memory operand is being unfolded.
+ // (stored in bits 0 - 7)
+ TB_INDEX_0 = 0,
+ TB_INDEX_1 = 1,
+ TB_INDEX_2 = 2,
+ TB_INDEX_MASK = 0xff,
+
+ // Minimum alignment required for load/store.
+ // Used for RegOp->MemOp conversion.
+ // (stored in bits 8 - 15)
+ TB_ALIGN_SHIFT = 8,
+ TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
+ TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
+ TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
+ TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT,
+
+ // Do not insert the reverse map (MemOp -> RegOp) into the table.
+ // This may be needed because there is a many -> one mapping.
+ TB_NO_REVERSE = 1 << 16,
+
+ // Do not insert the forward map (RegOp -> MemOp) into the table.
+ // This is needed for Native Client, which prohibits branch
+ // instructions from using a memory operand.
+ TB_NO_FORWARD = 1 << 17,
+
+ TB_FOLDED_LOAD = 1 << 18,
+ TB_FOLDED_STORE = 1 << 19
+};
+
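(The flag word packs what the fold tables used to spread over extra columns:
the unfold index in bits 0-7, the minimum alignment in bytes in bits 8-15, and
behaviour bits above those. Hypothetical accessors mirroring the layout,
assuming the TB_* enum above is in scope — not LLVM API:)

    unsigned GetFoldIndex(unsigned Flags) { return Flags & TB_INDEX_MASK; }
    unsigned GetMinAlign(unsigned Flags) {
      return (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;  // e.g. 0, 16, or 32
    }
    bool CanFold(unsigned Flags)   { return (Flags & TB_NO_FORWARD) == 0; }
    bool CanUnfold(unsigned Flags) { return (Flags & TB_NO_REVERSE) == 0; }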
X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
: X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
? X86::ADJCALLSTACKDOWN64
@@ -61,655 +91,829 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32)),
TM(tm), RI(tm, *this) {
- enum {
- TB_NOT_REVERSABLE = 1U << 31,
- TB_FLAGS = TB_NOT_REVERSABLE
- };
- static const unsigned OpTbl2Addr[][2] = {
- { X86::ADC32ri, X86::ADC32mi },
- { X86::ADC32ri8, X86::ADC32mi8 },
- { X86::ADC32rr, X86::ADC32mr },
- { X86::ADC64ri32, X86::ADC64mi32 },
- { X86::ADC64ri8, X86::ADC64mi8 },
- { X86::ADC64rr, X86::ADC64mr },
- { X86::ADD16ri, X86::ADD16mi },
- { X86::ADD16ri8, X86::ADD16mi8 },
- { X86::ADD16ri_DB, X86::ADD16mi | TB_NOT_REVERSABLE },
- { X86::ADD16ri8_DB, X86::ADD16mi8 | TB_NOT_REVERSABLE },
- { X86::ADD16rr, X86::ADD16mr },
- { X86::ADD16rr_DB, X86::ADD16mr | TB_NOT_REVERSABLE },
- { X86::ADD32ri, X86::ADD32mi },
- { X86::ADD32ri8, X86::ADD32mi8 },
- { X86::ADD32ri_DB, X86::ADD32mi | TB_NOT_REVERSABLE },
- { X86::ADD32ri8_DB, X86::ADD32mi8 | TB_NOT_REVERSABLE },
- { X86::ADD32rr, X86::ADD32mr },
- { X86::ADD32rr_DB, X86::ADD32mr | TB_NOT_REVERSABLE },
- { X86::ADD64ri32, X86::ADD64mi32 },
- { X86::ADD64ri8, X86::ADD64mi8 },
- { X86::ADD64ri32_DB,X86::ADD64mi32 | TB_NOT_REVERSABLE },
- { X86::ADD64ri8_DB, X86::ADD64mi8 | TB_NOT_REVERSABLE },
- { X86::ADD64rr, X86::ADD64mr },
- { X86::ADD64rr_DB, X86::ADD64mr | TB_NOT_REVERSABLE },
- { X86::ADD8ri, X86::ADD8mi },
- { X86::ADD8rr, X86::ADD8mr },
- { X86::AND16ri, X86::AND16mi },
- { X86::AND16ri8, X86::AND16mi8 },
- { X86::AND16rr, X86::AND16mr },
- { X86::AND32ri, X86::AND32mi },
- { X86::AND32ri8, X86::AND32mi8 },
- { X86::AND32rr, X86::AND32mr },
- { X86::AND64ri32, X86::AND64mi32 },
- { X86::AND64ri8, X86::AND64mi8 },
- { X86::AND64rr, X86::AND64mr },
- { X86::AND8ri, X86::AND8mi },
- { X86::AND8rr, X86::AND8mr },
- { X86::DEC16r, X86::DEC16m },
- { X86::DEC32r, X86::DEC32m },
- { X86::DEC64_16r, X86::DEC64_16m },
- { X86::DEC64_32r, X86::DEC64_32m },
- { X86::DEC64r, X86::DEC64m },
- { X86::DEC8r, X86::DEC8m },
- { X86::INC16r, X86::INC16m },
- { X86::INC32r, X86::INC32m },
- { X86::INC64_16r, X86::INC64_16m },
- { X86::INC64_32r, X86::INC64_32m },
- { X86::INC64r, X86::INC64m },
- { X86::INC8r, X86::INC8m },
- { X86::NEG16r, X86::NEG16m },
- { X86::NEG32r, X86::NEG32m },
- { X86::NEG64r, X86::NEG64m },
- { X86::NEG8r, X86::NEG8m },
- { X86::NOT16r, X86::NOT16m },
- { X86::NOT32r, X86::NOT32m },
- { X86::NOT64r, X86::NOT64m },
- { X86::NOT8r, X86::NOT8m },
- { X86::OR16ri, X86::OR16mi },
- { X86::OR16ri8, X86::OR16mi8 },
- { X86::OR16rr, X86::OR16mr },
- { X86::OR32ri, X86::OR32mi },
- { X86::OR32ri8, X86::OR32mi8 },
- { X86::OR32rr, X86::OR32mr },
- { X86::OR64ri32, X86::OR64mi32 },
- { X86::OR64ri8, X86::OR64mi8 },
- { X86::OR64rr, X86::OR64mr },
- { X86::OR8ri, X86::OR8mi },
- { X86::OR8rr, X86::OR8mr },
- { X86::ROL16r1, X86::ROL16m1 },
- { X86::ROL16rCL, X86::ROL16mCL },
- { X86::ROL16ri, X86::ROL16mi },
- { X86::ROL32r1, X86::ROL32m1 },
- { X86::ROL32rCL, X86::ROL32mCL },
- { X86::ROL32ri, X86::ROL32mi },
- { X86::ROL64r1, X86::ROL64m1 },
- { X86::ROL64rCL, X86::ROL64mCL },
- { X86::ROL64ri, X86::ROL64mi },
- { X86::ROL8r1, X86::ROL8m1 },
- { X86::ROL8rCL, X86::ROL8mCL },
- { X86::ROL8ri, X86::ROL8mi },
- { X86::ROR16r1, X86::ROR16m1 },
- { X86::ROR16rCL, X86::ROR16mCL },
- { X86::ROR16ri, X86::ROR16mi },
- { X86::ROR32r1, X86::ROR32m1 },
- { X86::ROR32rCL, X86::ROR32mCL },
- { X86::ROR32ri, X86::ROR32mi },
- { X86::ROR64r1, X86::ROR64m1 },
- { X86::ROR64rCL, X86::ROR64mCL },
- { X86::ROR64ri, X86::ROR64mi },
- { X86::ROR8r1, X86::ROR8m1 },
- { X86::ROR8rCL, X86::ROR8mCL },
- { X86::ROR8ri, X86::ROR8mi },
- { X86::SAR16r1, X86::SAR16m1 },
- { X86::SAR16rCL, X86::SAR16mCL },
- { X86::SAR16ri, X86::SAR16mi },
- { X86::SAR32r1, X86::SAR32m1 },
- { X86::SAR32rCL, X86::SAR32mCL },
- { X86::SAR32ri, X86::SAR32mi },
- { X86::SAR64r1, X86::SAR64m1 },
- { X86::SAR64rCL, X86::SAR64mCL },
- { X86::SAR64ri, X86::SAR64mi },
- { X86::SAR8r1, X86::SAR8m1 },
- { X86::SAR8rCL, X86::SAR8mCL },
- { X86::SAR8ri, X86::SAR8mi },
- { X86::SBB32ri, X86::SBB32mi },
- { X86::SBB32ri8, X86::SBB32mi8 },
- { X86::SBB32rr, X86::SBB32mr },
- { X86::SBB64ri32, X86::SBB64mi32 },
- { X86::SBB64ri8, X86::SBB64mi8 },
- { X86::SBB64rr, X86::SBB64mr },
- { X86::SHL16rCL, X86::SHL16mCL },
- { X86::SHL16ri, X86::SHL16mi },
- { X86::SHL32rCL, X86::SHL32mCL },
- { X86::SHL32ri, X86::SHL32mi },
- { X86::SHL64rCL, X86::SHL64mCL },
- { X86::SHL64ri, X86::SHL64mi },
- { X86::SHL8rCL, X86::SHL8mCL },
- { X86::SHL8ri, X86::SHL8mi },
- { X86::SHLD16rrCL, X86::SHLD16mrCL },
- { X86::SHLD16rri8, X86::SHLD16mri8 },
- { X86::SHLD32rrCL, X86::SHLD32mrCL },
- { X86::SHLD32rri8, X86::SHLD32mri8 },
- { X86::SHLD64rrCL, X86::SHLD64mrCL },
- { X86::SHLD64rri8, X86::SHLD64mri8 },
- { X86::SHR16r1, X86::SHR16m1 },
- { X86::SHR16rCL, X86::SHR16mCL },
- { X86::SHR16ri, X86::SHR16mi },
- { X86::SHR32r1, X86::SHR32m1 },
- { X86::SHR32rCL, X86::SHR32mCL },
- { X86::SHR32ri, X86::SHR32mi },
- { X86::SHR64r1, X86::SHR64m1 },
- { X86::SHR64rCL, X86::SHR64mCL },
- { X86::SHR64ri, X86::SHR64mi },
- { X86::SHR8r1, X86::SHR8m1 },
- { X86::SHR8rCL, X86::SHR8mCL },
- { X86::SHR8ri, X86::SHR8mi },
- { X86::SHRD16rrCL, X86::SHRD16mrCL },
- { X86::SHRD16rri8, X86::SHRD16mri8 },
- { X86::SHRD32rrCL, X86::SHRD32mrCL },
- { X86::SHRD32rri8, X86::SHRD32mri8 },
- { X86::SHRD64rrCL, X86::SHRD64mrCL },
- { X86::SHRD64rri8, X86::SHRD64mri8 },
- { X86::SUB16ri, X86::SUB16mi },
- { X86::SUB16ri8, X86::SUB16mi8 },
- { X86::SUB16rr, X86::SUB16mr },
- { X86::SUB32ri, X86::SUB32mi },
- { X86::SUB32ri8, X86::SUB32mi8 },
- { X86::SUB32rr, X86::SUB32mr },
- { X86::SUB64ri32, X86::SUB64mi32 },
- { X86::SUB64ri8, X86::SUB64mi8 },
- { X86::SUB64rr, X86::SUB64mr },
- { X86::SUB8ri, X86::SUB8mi },
- { X86::SUB8rr, X86::SUB8mr },
- { X86::XOR16ri, X86::XOR16mi },
- { X86::XOR16ri8, X86::XOR16mi8 },
- { X86::XOR16rr, X86::XOR16mr },
- { X86::XOR32ri, X86::XOR32mi },
- { X86::XOR32ri8, X86::XOR32mi8 },
- { X86::XOR32rr, X86::XOR32mr },
- { X86::XOR64ri32, X86::XOR64mi32 },
- { X86::XOR64ri8, X86::XOR64mi8 },
- { X86::XOR64rr, X86::XOR64mr },
- { X86::XOR8ri, X86::XOR8mi },
- { X86::XOR8rr, X86::XOR8mr }
+ static const unsigned OpTbl2Addr[][3] = {
+ { X86::ADC32ri, X86::ADC32mi, 0 },
+ { X86::ADC32ri8, X86::ADC32mi8, 0 },
+ { X86::ADC32rr, X86::ADC32mr, 0 },
+ { X86::ADC64ri32, X86::ADC64mi32, 0 },
+ { X86::ADC64ri8, X86::ADC64mi8, 0 },
+ { X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADD16ri, X86::ADD16mi, 0 },
+ { X86::ADD16ri8, X86::ADD16mi8, 0 },
+ { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
+ { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
+ { X86::ADD16rr, X86::ADD16mr, 0 },
+ { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
+ { X86::ADD32ri, X86::ADD32mi, 0 },
+ { X86::ADD32ri8, X86::ADD32mi8, 0 },
+ { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
+ { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32mr, 0 },
+ { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
+ { X86::ADD64ri32, X86::ADD64mi32, 0 },
+ { X86::ADD64ri8, X86::ADD64mi8, 0 },
+ { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
+ { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64mr, 0 },
+ { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
+ { X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8rr, X86::ADD8mr, 0 },
+ { X86::AND16ri, X86::AND16mi, 0 },
+ { X86::AND16ri8, X86::AND16mi8, 0 },
+ { X86::AND16rr, X86::AND16mr, 0 },
+ { X86::AND32ri, X86::AND32mi, 0 },
+ { X86::AND32ri8, X86::AND32mi8, 0 },
+ { X86::AND32rr, X86::AND32mr, 0 },
+ { X86::AND64ri32, X86::AND64mi32, 0 },
+ { X86::AND64ri8, X86::AND64mi8, 0 },
+ { X86::AND64rr, X86::AND64mr, 0 },
+ { X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8rr, X86::AND8mr, 0 },
+ { X86::DEC16r, X86::DEC16m, 0 },
+ { X86::DEC32r, X86::DEC32m, 0 },
+ { X86::DEC64_16r, X86::DEC64_16m, 0 },
+ { X86::DEC64_32r, X86::DEC64_32m, 0 },
+ { X86::DEC64r, X86::DEC64m, 0 },
+ { X86::DEC8r, X86::DEC8m, 0 },
+ { X86::INC16r, X86::INC16m, 0 },
+ { X86::INC32r, X86::INC32m, 0 },
+ { X86::INC64_16r, X86::INC64_16m, 0 },
+ { X86::INC64_32r, X86::INC64_32m, 0 },
+ { X86::INC64r, X86::INC64m, 0 },
+ { X86::INC8r, X86::INC8m, 0 },
+ { X86::NEG16r, X86::NEG16m, 0 },
+ { X86::NEG32r, X86::NEG32m, 0 },
+ { X86::NEG64r, X86::NEG64m, 0 },
+ { X86::NEG8r, X86::NEG8m, 0 },
+ { X86::NOT16r, X86::NOT16m, 0 },
+ { X86::NOT32r, X86::NOT32m, 0 },
+ { X86::NOT64r, X86::NOT64m, 0 },
+ { X86::NOT8r, X86::NOT8m, 0 },
+ { X86::OR16ri, X86::OR16mi, 0 },
+ { X86::OR16ri8, X86::OR16mi8, 0 },
+ { X86::OR16rr, X86::OR16mr, 0 },
+ { X86::OR32ri, X86::OR32mi, 0 },
+ { X86::OR32ri8, X86::OR32mi8, 0 },
+ { X86::OR32rr, X86::OR32mr, 0 },
+ { X86::OR64ri32, X86::OR64mi32, 0 },
+ { X86::OR64ri8, X86::OR64mi8, 0 },
+ { X86::OR64rr, X86::OR64mr, 0 },
+ { X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8rr, X86::OR8mr, 0 },
+ { X86::ROL16r1, X86::ROL16m1, 0 },
+ { X86::ROL16rCL, X86::ROL16mCL, 0 },
+ { X86::ROL16ri, X86::ROL16mi, 0 },
+ { X86::ROL32r1, X86::ROL32m1, 0 },
+ { X86::ROL32rCL, X86::ROL32mCL, 0 },
+ { X86::ROL32ri, X86::ROL32mi, 0 },
+ { X86::ROL64r1, X86::ROL64m1, 0 },
+ { X86::ROL64rCL, X86::ROL64mCL, 0 },
+ { X86::ROL64ri, X86::ROL64mi, 0 },
+ { X86::ROL8r1, X86::ROL8m1, 0 },
+ { X86::ROL8rCL, X86::ROL8mCL, 0 },
+ { X86::ROL8ri, X86::ROL8mi, 0 },
+ { X86::ROR16r1, X86::ROR16m1, 0 },
+ { X86::ROR16rCL, X86::ROR16mCL, 0 },
+ { X86::ROR16ri, X86::ROR16mi, 0 },
+ { X86::ROR32r1, X86::ROR32m1, 0 },
+ { X86::ROR32rCL, X86::ROR32mCL, 0 },
+ { X86::ROR32ri, X86::ROR32mi, 0 },
+ { X86::ROR64r1, X86::ROR64m1, 0 },
+ { X86::ROR64rCL, X86::ROR64mCL, 0 },
+ { X86::ROR64ri, X86::ROR64mi, 0 },
+ { X86::ROR8r1, X86::ROR8m1, 0 },
+ { X86::ROR8rCL, X86::ROR8mCL, 0 },
+ { X86::ROR8ri, X86::ROR8mi, 0 },
+ { X86::SAR16r1, X86::SAR16m1, 0 },
+ { X86::SAR16rCL, X86::SAR16mCL, 0 },
+ { X86::SAR16ri, X86::SAR16mi, 0 },
+ { X86::SAR32r1, X86::SAR32m1, 0 },
+ { X86::SAR32rCL, X86::SAR32mCL, 0 },
+ { X86::SAR32ri, X86::SAR32mi, 0 },
+ { X86::SAR64r1, X86::SAR64m1, 0 },
+ { X86::SAR64rCL, X86::SAR64mCL, 0 },
+ { X86::SAR64ri, X86::SAR64mi, 0 },
+ { X86::SAR8r1, X86::SAR8m1, 0 },
+ { X86::SAR8rCL, X86::SAR8mCL, 0 },
+ { X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB32ri, X86::SBB32mi, 0 },
+ { X86::SBB32ri8, X86::SBB32mi8, 0 },
+ { X86::SBB32rr, X86::SBB32mr, 0 },
+ { X86::SBB64ri32, X86::SBB64mi32, 0 },
+ { X86::SBB64ri8, X86::SBB64mi8, 0 },
+ { X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SHL16rCL, X86::SHL16mCL, 0 },
+ { X86::SHL16ri, X86::SHL16mi, 0 },
+ { X86::SHL32rCL, X86::SHL32mCL, 0 },
+ { X86::SHL32ri, X86::SHL32mi, 0 },
+ { X86::SHL64rCL, X86::SHL64mCL, 0 },
+ { X86::SHL64ri, X86::SHL64mi, 0 },
+ { X86::SHL8rCL, X86::SHL8mCL, 0 },
+ { X86::SHL8ri, X86::SHL8mi, 0 },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
+ { X86::SHLD16rri8, X86::SHLD16mri8, 0 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
+ { X86::SHLD32rri8, X86::SHLD32mri8, 0 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
+ { X86::SHLD64rri8, X86::SHLD64mri8, 0 },
+ { X86::SHR16r1, X86::SHR16m1, 0 },
+ { X86::SHR16rCL, X86::SHR16mCL, 0 },
+ { X86::SHR16ri, X86::SHR16mi, 0 },
+ { X86::SHR32r1, X86::SHR32m1, 0 },
+ { X86::SHR32rCL, X86::SHR32mCL, 0 },
+ { X86::SHR32ri, X86::SHR32mi, 0 },
+ { X86::SHR64r1, X86::SHR64m1, 0 },
+ { X86::SHR64rCL, X86::SHR64mCL, 0 },
+ { X86::SHR64ri, X86::SHR64mi, 0 },
+ { X86::SHR8r1, X86::SHR8m1, 0 },
+ { X86::SHR8rCL, X86::SHR8mCL, 0 },
+ { X86::SHR8ri, X86::SHR8mi, 0 },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
+ { X86::SHRD16rri8, X86::SHRD16mri8, 0 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
+ { X86::SHRD32rri8, X86::SHRD32mri8, 0 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
+ { X86::SHRD64rri8, X86::SHRD64mri8, 0 },
+ { X86::SUB16ri, X86::SUB16mi, 0 },
+ { X86::SUB16ri8, X86::SUB16mi8, 0 },
+ { X86::SUB16rr, X86::SUB16mr, 0 },
+ { X86::SUB32ri, X86::SUB32mi, 0 },
+ { X86::SUB32ri8, X86::SUB32mi8, 0 },
+ { X86::SUB32rr, X86::SUB32mr, 0 },
+ { X86::SUB64ri32, X86::SUB64mi32, 0 },
+ { X86::SUB64ri8, X86::SUB64mi8, 0 },
+ { X86::SUB64rr, X86::SUB64mr, 0 },
+ { X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8rr, X86::SUB8mr, 0 },
+ { X86::XOR16ri, X86::XOR16mi, 0 },
+ { X86::XOR16ri8, X86::XOR16mi8, 0 },
+ { X86::XOR16rr, X86::XOR16mr, 0 },
+ { X86::XOR32ri, X86::XOR32mi, 0 },
+ { X86::XOR32ri8, X86::XOR32mi8, 0 },
+ { X86::XOR32rr, X86::XOR32mr, 0 },
+ { X86::XOR64ri32, X86::XOR64mi32, 0 },
+ { X86::XOR64ri8, X86::XOR64mi8, 0 },
+ { X86::XOR64rr, X86::XOR64mr, 0 },
+ { X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8rr, X86::XOR8mr, 0 }
};
for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
unsigned RegOp = OpTbl2Addr[i][0];
- unsigned MemOp = OpTbl2Addr[i][1] & ~TB_FLAGS;
- assert(!RegOp2MemOpTable2Addr.count(RegOp) && "Duplicated entries?");
- RegOp2MemOpTable2Addr[RegOp] = std::make_pair(MemOp, 0U);
-
- // If this is not a reversible operation (because there is a many->one)
- // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
- if (OpTbl2Addr[i][1] & TB_NOT_REVERSABLE)
- continue;
-
- // Index 0, folded load and store, no alignment requirement.
- unsigned AuxInfo = 0 | (1 << 4) | (1 << 5);
-
- assert(!MemOp2RegOpTable.count(MemOp) &&
- "Duplicated entries in unfolding maps?");
- MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
- }
-
- // If the third value is 1, then it's folding either a load or a store.
- static const unsigned OpTbl0[][4] = {
- { X86::BT16ri8, X86::BT16mi8, 1, 0 },
- { X86::BT32ri8, X86::BT32mi8, 1, 0 },
- { X86::BT64ri8, X86::BT64mi8, 1, 0 },
- { X86::CALL32r, X86::CALL32m, 1, 0 },
- { X86::CALL64r, X86::CALL64m, 1, 0 },
- { X86::WINCALL64r, X86::WINCALL64m, 1, 0 },
- { X86::CMP16ri, X86::CMP16mi, 1, 0 },
- { X86::CMP16ri8, X86::CMP16mi8, 1, 0 },
- { X86::CMP16rr, X86::CMP16mr, 1, 0 },
- { X86::CMP32ri, X86::CMP32mi, 1, 0 },
- { X86::CMP32ri8, X86::CMP32mi8, 1, 0 },
- { X86::CMP32rr, X86::CMP32mr, 1, 0 },
- { X86::CMP64ri32, X86::CMP64mi32, 1, 0 },
- { X86::CMP64ri8, X86::CMP64mi8, 1, 0 },
- { X86::CMP64rr, X86::CMP64mr, 1, 0 },
- { X86::CMP8ri, X86::CMP8mi, 1, 0 },
- { X86::CMP8rr, X86::CMP8mr, 1, 0 },
- { X86::DIV16r, X86::DIV16m, 1, 0 },
- { X86::DIV32r, X86::DIV32m, 1, 0 },
- { X86::DIV64r, X86::DIV64m, 1, 0 },
- { X86::DIV8r, X86::DIV8m, 1, 0 },
- { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0, 16 },
- { X86::FsMOVAPDrr, X86::MOVSDmr | TB_NOT_REVERSABLE , 0, 0 },
- { X86::FsMOVAPSrr, X86::MOVSSmr | TB_NOT_REVERSABLE , 0, 0 },
- { X86::IDIV16r, X86::IDIV16m, 1, 0 },
- { X86::IDIV32r, X86::IDIV32m, 1, 0 },
- { X86::IDIV64r, X86::IDIV64m, 1, 0 },
- { X86::IDIV8r, X86::IDIV8m, 1, 0 },
- { X86::IMUL16r, X86::IMUL16m, 1, 0 },
- { X86::IMUL32r, X86::IMUL32m, 1, 0 },
- { X86::IMUL64r, X86::IMUL64m, 1, 0 },
- { X86::IMUL8r, X86::IMUL8m, 1, 0 },
- { X86::JMP32r, X86::JMP32m, 1, 0 },
- { X86::JMP64r, X86::JMP64m, 1, 0 },
- { X86::MOV16ri, X86::MOV16mi, 0, 0 },
- { X86::MOV16rr, X86::MOV16mr, 0, 0 },
- { X86::MOV32ri, X86::MOV32mi, 0, 0 },
- { X86::MOV32rr, X86::MOV32mr, 0, 0 },
- { X86::MOV64ri32, X86::MOV64mi32, 0, 0 },
- { X86::MOV64rr, X86::MOV64mr, 0, 0 },
- { X86::MOV8ri, X86::MOV8mi, 0, 0 },
- { X86::MOV8rr, X86::MOV8mr, 0, 0 },
- { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0, 0 },
- { X86::MOVAPDrr, X86::MOVAPDmr, 0, 16 },
- { X86::MOVAPSrr, X86::MOVAPSmr, 0, 16 },
- { X86::MOVDQArr, X86::MOVDQAmr, 0, 16 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYmr, 0, 32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYmr, 0, 32 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYmr, 0, 32 },
- { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
- { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
- { X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
- { X86::MOVSS2DIrr, X86::MOVSS2DImr, 0, 0 },
- { X86::MOVUPDrr, X86::MOVUPDmr, 0, 0 },
- { X86::MOVUPSrr, X86::MOVUPSmr, 0, 0 },
- { X86::VMOVUPDYrr, X86::VMOVUPDYmr, 0, 0 },
- { X86::VMOVUPSYrr, X86::VMOVUPSYmr, 0, 0 },
- { X86::MUL16r, X86::MUL16m, 1, 0 },
- { X86::MUL32r, X86::MUL32m, 1, 0 },
- { X86::MUL64r, X86::MUL64m, 1, 0 },
- { X86::MUL8r, X86::MUL8m, 1, 0 },
- { X86::SETAEr, X86::SETAEm, 0, 0 },
- { X86::SETAr, X86::SETAm, 0, 0 },
- { X86::SETBEr, X86::SETBEm, 0, 0 },
- { X86::SETBr, X86::SETBm, 0, 0 },
- { X86::SETEr, X86::SETEm, 0, 0 },
- { X86::SETGEr, X86::SETGEm, 0, 0 },
- { X86::SETGr, X86::SETGm, 0, 0 },
- { X86::SETLEr, X86::SETLEm, 0, 0 },
- { X86::SETLr, X86::SETLm, 0, 0 },
- { X86::SETNEr, X86::SETNEm, 0, 0 },
- { X86::SETNOr, X86::SETNOm, 0, 0 },
- { X86::SETNPr, X86::SETNPm, 0, 0 },
- { X86::SETNSr, X86::SETNSm, 0, 0 },
- { X86::SETOr, X86::SETOm, 0, 0 },
- { X86::SETPr, X86::SETPm, 0, 0 },
- { X86::SETSr, X86::SETSm, 0, 0 },
- { X86::TAILJMPr, X86::TAILJMPm, 1, 0 },
- { X86::TAILJMPr64, X86::TAILJMPm64, 1, 0 },
- { X86::TEST16ri, X86::TEST16mi, 1, 0 },
- { X86::TEST32ri, X86::TEST32mi, 1, 0 },
- { X86::TEST64ri32, X86::TEST64mi32, 1, 0 },
- { X86::TEST8ri, X86::TEST8mi, 1, 0 }
+ unsigned MemOp = OpTbl2Addr[i][1];
+ unsigned Flags = OpTbl2Addr[i][2];
+ AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 0, folded load and store, no alignment requirement.
+ Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
+ }
+
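(AddTableEntry replaces the insert-and-assert boilerplate each loop used to
carry. Its body is not part of this hunk; presumably it honours the two
suppression bits roughly as in this hypothetical C++ sketch:)

    #include <map>
    #include <utility>

    typedef std::map<unsigned, std::pair<unsigned, unsigned> > FoldTable;

    // Hypothetical stand-in for the real helper (assumes the TB_* enum above).
    void AddTableEntrySketch(FoldTable &R2M, FoldTable &M2R,
                             unsigned RegOp, unsigned MemOp, unsigned Flags) {
      if ((Flags & TB_NO_FORWARD) == 0)
        R2M[RegOp] = std::make_pair(MemOp, Flags);  // folding: reg -> mem form
      if ((Flags & TB_NO_REVERSE) == 0)
        M2R[MemOp] = std::make_pair(RegOp, Flags);  // unfolding: mem -> reg form
    }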
+ static const unsigned OpTbl0[][3] = {
+ { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
+ { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
+ { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
+ { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
+ { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
+ { X86::WINCALL64r, X86::WINCALL64m, TB_FOLDED_LOAD },
+ { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
+ { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
+ { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
+ { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
+ { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
+ { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
+ { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
+ { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
+ { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
+ { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
+ { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
+ { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
+ { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
+ { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
+ { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
+ { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
+ { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
+ { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
+ { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
+ { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
+ { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
+ { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
+ { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
+ { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
+ { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
+ { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
+ { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
+ { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
+ { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
+ { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
+ { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
+ { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
+ { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
+ { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
+ { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
+ { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
+ { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
+ { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
+ { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
+ { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
+ { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
+ { X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
+ { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
+ { X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
+ { X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
+ { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
+ { X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
+ { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
+ { X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
+ { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
+ { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
+ { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
+ { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
+ { X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
+ { X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
+ { X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
+ { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
+ { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
+ { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+ // AVX 128-bit versions of foldable instructions
+ { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
+ { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
+ { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
+ { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
+ { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
+ { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
+ // AVX 256-bit foldable instructions
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }
};
for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
unsigned RegOp = OpTbl0[i][0];
- unsigned MemOp = OpTbl0[i][1] & ~TB_FLAGS;
- unsigned FoldedLoad = OpTbl0[i][2];
- unsigned Align = OpTbl0[i][3];
- assert(!RegOp2MemOpTable0.count(RegOp) && "Duplicated entries?");
- RegOp2MemOpTable0[RegOp] = std::make_pair(MemOp, Align);
-
- // If this is not a reversible operation (because there is a many->one)
- // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
- if (OpTbl0[i][1] & TB_NOT_REVERSABLE)
- continue;
-
- // Index 0, folded load or store.
- unsigned AuxInfo = 0 | (FoldedLoad << 4) | ((FoldedLoad^1) << 5);
- assert(!MemOp2RegOpTable.count(MemOp) && "Duplicated entries?");
- MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
+ unsigned MemOp = OpTbl0[i][1];
+ unsigned Flags = OpTbl0[i][2];
+ AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
+ RegOp, MemOp, TB_INDEX_0 | Flags);
}
static const unsigned OpTbl1[][3] = {
- { X86::CMP16rr, X86::CMP16rm, 0 },
- { X86::CMP32rr, X86::CMP32rm, 0 },
- { X86::CMP64rr, X86::CMP64rm, 0 },
- { X86::CMP8rr, X86::CMP8rm, 0 },
- { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
- { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
- { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
- { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
- { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
- { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
- { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
- { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
- { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
- { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
- { X86::FsMOVAPDrr, X86::MOVSDrm | TB_NOT_REVERSABLE , 0 },
- { X86::FsMOVAPSrr, X86::MOVSSrm | TB_NOT_REVERSABLE , 0 },
- { X86::IMUL16rri, X86::IMUL16rmi, 0 },
- { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
- { X86::IMUL32rri, X86::IMUL32rmi, 0 },
- { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
- { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
- { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
- { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
- { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
- { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, 16 },
- { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, 16 },
- { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, 16 },
- { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, 16 },
- { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, 16 },
- { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 },
- { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
- { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
- { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
- { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
- { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
- { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
- { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
- { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
- { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm, 0 },
- { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm, 0 },
- { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, 16 },
- { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, 16 },
- { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
- { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
- { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
- { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
- { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
- { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
- { X86::MOV16rr, X86::MOV16rm, 0 },
- { X86::MOV32rr, X86::MOV32rm, 0 },
- { X86::MOV64rr, X86::MOV64rm, 0 },
- { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
- { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
- { X86::MOV8rr, X86::MOV8rm, 0 },
- { X86::MOVAPDrr, X86::MOVAPDrm, 16 },
- { X86::MOVAPSrr, X86::MOVAPSrm, 16 },
- { X86::VMOVAPDYrr, X86::VMOVAPDYrm, 32 },
- { X86::VMOVAPSYrr, X86::VMOVAPSYrm, 32 },
- { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
- { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
- { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
- { X86::MOVDQArr, X86::MOVDQArm, 16 },
- { X86::VMOVDQAYrr, X86::VMOVDQAYrm, 16 },
- { X86::MOVSHDUPrr, X86::MOVSHDUPrm, 16 },
- { X86::MOVSLDUPrr, X86::MOVSLDUPrm, 16 },
- { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
- { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
- { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
- { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
- { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
- { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
- { X86::MOVUPDrr, X86::MOVUPDrm, 16 },
- { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
- { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
- { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
- { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
- { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
- { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
- { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
- { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
- { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
- { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
- { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
- { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 },
- { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
- { X86::PSHUFDri, X86::PSHUFDmi, 16 },
- { X86::PSHUFHWri, X86::PSHUFHWmi, 16 },
- { X86::PSHUFLWri, X86::PSHUFLWmi, 16 },
- { X86::RCPPSr, X86::RCPPSm, 16 },
- { X86::RCPPSr_Int, X86::RCPPSm_Int, 16 },
- { X86::RSQRTPSr, X86::RSQRTPSm, 16 },
- { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, 16 },
- { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
- { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
- { X86::SQRTPDr, X86::SQRTPDm, 16 },
- { X86::SQRTPDr_Int, X86::SQRTPDm_Int, 16 },
- { X86::SQRTPSr, X86::SQRTPSm, 16 },
- { X86::SQRTPSr_Int, X86::SQRTPSm_Int, 16 },
- { X86::SQRTSDr, X86::SQRTSDm, 0 },
- { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
- { X86::SQRTSSr, X86::SQRTSSm, 0 },
- { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
- { X86::TEST16rr, X86::TEST16rm, 0 },
- { X86::TEST32rr, X86::TEST32rm, 0 },
- { X86::TEST64rr, X86::TEST64rm, 0 },
- { X86::TEST8rr, X86::TEST8rm, 0 },
+ { X86::CMP16rr, X86::CMP16rm, 0 },
+ { X86::CMP32rr, X86::CMP32rm, 0 },
+ { X86::CMP64rr, X86::CMP64rm, 0 },
+ { X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
+ { X86::FsMOVAPDrr, X86::MOVSDrm, TB_NO_REVERSE },
+ { X86::FsMOVAPSrr, X86::MOVSSrm, TB_NO_REVERSE },
+ { X86::IMUL16rri, X86::IMUL16rmi, 0 },
+ { X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
+ { X86::IMUL32rri, X86::IMUL32rmi, 0 },
+ { X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
+ { X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 },
+ { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm, TB_ALIGN_16 },
+ { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm, TB_ALIGN_16 },
+ { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm, TB_ALIGN_16 },
+ { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm, 0 },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 },
+ { X86::MOV16rr, X86::MOV16rm, 0 },
+ { X86::MOV32rr, X86::MOV32rm, 0 },
+ { X86::MOV64rr, X86::MOV64rm, 0 },
+ { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
+ { X86::MOV8rr, X86::MOV8rm, 0 },
+ { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
+ { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
+ { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
+ { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 },
+ { X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
+ { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
+ { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
+ { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16, 0 },
+ { X86::MOVZX64rr32, X86::MOVZX64rm32, 0 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8, 0 },
+ { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
+ { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
+ { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
+ { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
+ { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 },
+ { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
+ { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 },
+ { X86::RSQRTSSr, X86::RSQRTSSm, 0 },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
+ { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
+ { X86::SQRTPDr_Int, X86::SQRTPDm_Int, TB_ALIGN_16 },
+ { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
+ { X86::SQRTPSr_Int, X86::SQRTPSm_Int, TB_ALIGN_16 },
+ { X86::SQRTSDr, X86::SQRTSDm, 0 },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
+ { X86::SQRTSSr, X86::SQRTSSm, 0 },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 },
+ { X86::TEST16rr, X86::TEST16rm, 0 },
+ { X86::TEST32rr, X86::TEST32rm, 0 },
+ { X86::TEST64rr, X86::TEST64rm, 0 },
+ { X86::TEST8rr, X86::TEST8rm, 0 },
// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
- { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
- { X86::UCOMISSrr, X86::UCOMISSrm, 0 }
+ { X86::UCOMISDrr, X86::UCOMISDrm, 0 },
+ { X86::UCOMISSrr, X86::UCOMISSrm, 0 },
+ // AVX 128-bit versions of foldable instructions
+ { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 },
+ { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 },
+ { X86::Int_VCVTDQ2PDrr, X86::Int_VCVTDQ2PDrm, TB_ALIGN_16 },
+ { X86::Int_VCVTDQ2PSrr, X86::Int_VCVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::Int_VCVTPD2DQrr, X86::Int_VCVTPD2DQrm, TB_ALIGN_16 },
+ { X86::Int_VCVTPD2PSrr, X86::Int_VCVTPD2PSrm, TB_ALIGN_16 },
+ { X86::Int_VCVTPS2DQrr, X86::Int_VCVTPS2DQrm, TB_ALIGN_16 },
+ { X86::Int_VCVTPS2PDrr, X86::Int_VCVTPS2PDrm, 0 },
+ { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 },
+ { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 },
+ { X86::FsVMOVAPDrr, X86::VMOVSDrm, TB_NO_REVERSE },
+ { X86::FsVMOVAPSrr, X86::VMOVSSrm, TB_NO_REVERSE },
+ { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
+ { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
+ { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
+ { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
+ { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 },
+ { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
+ { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
+ { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
+ { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 },
+ { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, TB_ALIGN_16 },
+ { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
+ { X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 },
+ { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
+ { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, TB_ALIGN_16 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, TB_ALIGN_16 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, TB_ALIGN_16 },
+ { X86::VRCPPSr, X86::VRCPPSm, TB_ALIGN_16 },
+ { X86::VRCPPSr_Int, X86::VRCPPSm_Int, TB_ALIGN_16 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, TB_ALIGN_16 },
+ { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, TB_ALIGN_16 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, TB_ALIGN_16 },
+ { X86::VSQRTPDr_Int, X86::VSQRTPDm_Int, TB_ALIGN_16 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, TB_ALIGN_16 },
+ { X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 },
+ { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
+ { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
+ // AVX 256-bit foldable instructions
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_16 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
unsigned RegOp = OpTbl1[i][0];
- unsigned MemOp = OpTbl1[i][1] & ~TB_FLAGS;
- unsigned Align = OpTbl1[i][2];
- assert(!RegOp2MemOpTable1.count(RegOp) && "Duplicate entries");
- RegOp2MemOpTable1[RegOp] = std::make_pair(MemOp, Align);
-
- // If this is not a reversible operation (because there is a many->one)
- // mapping, don't insert the reverse of the operation into MemOp2RegOpTable.
- if (OpTbl1[i][1] & TB_NOT_REVERSABLE)
- continue;
-
- // Index 1, folded load
- unsigned AuxInfo = 1 | (1 << 4);
- assert(!MemOp2RegOpTable.count(MemOp) && "Duplicate entries");
- MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
+ unsigned MemOp = OpTbl1[i][1];
+ unsigned Flags = OpTbl1[i][2];
+ AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 1, folded load
+ Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
}
static const unsigned OpTbl2[][3] = {
- { X86::ADC32rr, X86::ADC32rm, 0 },
- { X86::ADC64rr, X86::ADC64rm, 0 },
- { X86::ADD16rr, X86::ADD16rm, 0 },
- { X86::ADD16rr_DB, X86::ADD16rm | TB_NOT_REVERSABLE, 0 },
- { X86::ADD32rr, X86::ADD32rm, 0 },
- { X86::ADD32rr_DB, X86::ADD32rm | TB_NOT_REVERSABLE, 0 },
- { X86::ADD64rr, X86::ADD64rm, 0 },
- { X86::ADD64rr_DB, X86::ADD64rm | TB_NOT_REVERSABLE, 0 },
- { X86::ADD8rr, X86::ADD8rm, 0 },
- { X86::ADDPDrr, X86::ADDPDrm, 16 },
- { X86::ADDPSrr, X86::ADDPSrm, 16 },
- { X86::ADDSDrr, X86::ADDSDrm, 0 },
- { X86::ADDSSrr, X86::ADDSSrm, 0 },
- { X86::ADDSUBPDrr, X86::ADDSUBPDrm, 16 },
- { X86::ADDSUBPSrr, X86::ADDSUBPSrm, 16 },
- { X86::AND16rr, X86::AND16rm, 0 },
- { X86::AND32rr, X86::AND32rm, 0 },
- { X86::AND64rr, X86::AND64rm, 0 },
- { X86::AND8rr, X86::AND8rm, 0 },
- { X86::ANDNPDrr, X86::ANDNPDrm, 16 },
- { X86::ANDNPSrr, X86::ANDNPSrm, 16 },
- { X86::ANDPDrr, X86::ANDPDrm, 16 },
- { X86::ANDPSrr, X86::ANDPSrm, 16 },
- { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
- { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
- { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
- { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
- { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
- { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
- { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
- { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
- { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
- { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
- { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
- { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
- { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
- { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
- { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
- { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
- { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
- { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
- { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
- { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
- { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
- { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
- { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
- { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
- { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
- { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
- { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
- { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
- { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
- { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
- { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
- { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
- { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
- { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
- { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
- { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
- { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
- { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
- { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
- { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
- { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
- { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
- { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
- { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
- { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
- { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
- { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
- { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
- { X86::CMPPDrri, X86::CMPPDrmi, 16 },
- { X86::CMPPSrri, X86::CMPPSrmi, 16 },
- { X86::CMPSDrr, X86::CMPSDrm, 0 },
- { X86::CMPSSrr, X86::CMPSSrm, 0 },
- { X86::DIVPDrr, X86::DIVPDrm, 16 },
- { X86::DIVPSrr, X86::DIVPSrm, 16 },
- { X86::DIVSDrr, X86::DIVSDrm, 0 },
- { X86::DIVSSrr, X86::DIVSSrm, 0 },
- { X86::FsANDNPDrr, X86::FsANDNPDrm, 16 },
- { X86::FsANDNPSrr, X86::FsANDNPSrm, 16 },
- { X86::FsANDPDrr, X86::FsANDPDrm, 16 },
- { X86::FsANDPSrr, X86::FsANDPSrm, 16 },
- { X86::FsORPDrr, X86::FsORPDrm, 16 },
- { X86::FsORPSrr, X86::FsORPSrm, 16 },
- { X86::FsXORPDrr, X86::FsXORPDrm, 16 },
- { X86::FsXORPSrr, X86::FsXORPSrm, 16 },
- { X86::HADDPDrr, X86::HADDPDrm, 16 },
- { X86::HADDPSrr, X86::HADDPSrm, 16 },
- { X86::HSUBPDrr, X86::HSUBPDrm, 16 },
- { X86::HSUBPSrr, X86::HSUBPSrm, 16 },
- { X86::IMUL16rr, X86::IMUL16rm, 0 },
- { X86::IMUL32rr, X86::IMUL32rm, 0 },
- { X86::IMUL64rr, X86::IMUL64rm, 0 },
- { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
- { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
- { X86::MAXPDrr, X86::MAXPDrm, 16 },
- { X86::MAXPDrr_Int, X86::MAXPDrm_Int, 16 },
- { X86::MAXPSrr, X86::MAXPSrm, 16 },
- { X86::MAXPSrr_Int, X86::MAXPSrm_Int, 16 },
- { X86::MAXSDrr, X86::MAXSDrm, 0 },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
- { X86::MAXSSrr, X86::MAXSSrm, 0 },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
- { X86::MINPDrr, X86::MINPDrm, 16 },
- { X86::MINPDrr_Int, X86::MINPDrm_Int, 16 },
- { X86::MINPSrr, X86::MINPSrm, 16 },
- { X86::MINPSrr_Int, X86::MINPSrm_Int, 16 },
- { X86::MINSDrr, X86::MINSDrm, 0 },
- { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
- { X86::MINSSrr, X86::MINSSrm, 0 },
- { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
- { X86::MULPDrr, X86::MULPDrm, 16 },
- { X86::MULPSrr, X86::MULPSrm, 16 },
- { X86::MULSDrr, X86::MULSDrm, 0 },
- { X86::MULSSrr, X86::MULSSrm, 0 },
- { X86::OR16rr, X86::OR16rm, 0 },
- { X86::OR32rr, X86::OR32rm, 0 },
- { X86::OR64rr, X86::OR64rm, 0 },
- { X86::OR8rr, X86::OR8rm, 0 },
- { X86::ORPDrr, X86::ORPDrm, 16 },
- { X86::ORPSrr, X86::ORPSrm, 16 },
- { X86::PACKSSDWrr, X86::PACKSSDWrm, 16 },
- { X86::PACKSSWBrr, X86::PACKSSWBrm, 16 },
- { X86::PACKUSWBrr, X86::PACKUSWBrm, 16 },
- { X86::PADDBrr, X86::PADDBrm, 16 },
- { X86::PADDDrr, X86::PADDDrm, 16 },
- { X86::PADDQrr, X86::PADDQrm, 16 },
- { X86::PADDSBrr, X86::PADDSBrm, 16 },
- { X86::PADDSWrr, X86::PADDSWrm, 16 },
- { X86::PADDWrr, X86::PADDWrm, 16 },
- { X86::PANDNrr, X86::PANDNrm, 16 },
- { X86::PANDrr, X86::PANDrm, 16 },
- { X86::PAVGBrr, X86::PAVGBrm, 16 },
- { X86::PAVGWrr, X86::PAVGWrm, 16 },
- { X86::PCMPEQBrr, X86::PCMPEQBrm, 16 },
- { X86::PCMPEQDrr, X86::PCMPEQDrm, 16 },
- { X86::PCMPEQWrr, X86::PCMPEQWrm, 16 },
- { X86::PCMPGTBrr, X86::PCMPGTBrm, 16 },
- { X86::PCMPGTDrr, X86::PCMPGTDrm, 16 },
- { X86::PCMPGTWrr, X86::PCMPGTWrm, 16 },
- { X86::PINSRWrri, X86::PINSRWrmi, 16 },
- { X86::PMADDWDrr, X86::PMADDWDrm, 16 },
- { X86::PMAXSWrr, X86::PMAXSWrm, 16 },
- { X86::PMAXUBrr, X86::PMAXUBrm, 16 },
- { X86::PMINSWrr, X86::PMINSWrm, 16 },
- { X86::PMINUBrr, X86::PMINUBrm, 16 },
- { X86::PMULDQrr, X86::PMULDQrm, 16 },
- { X86::PMULHUWrr, X86::PMULHUWrm, 16 },
- { X86::PMULHWrr, X86::PMULHWrm, 16 },
- { X86::PMULLDrr, X86::PMULLDrm, 16 },
- { X86::PMULLWrr, X86::PMULLWrm, 16 },
- { X86::PMULUDQrr, X86::PMULUDQrm, 16 },
- { X86::PORrr, X86::PORrm, 16 },
- { X86::PSADBWrr, X86::PSADBWrm, 16 },
- { X86::PSLLDrr, X86::PSLLDrm, 16 },
- { X86::PSLLQrr, X86::PSLLQrm, 16 },
- { X86::PSLLWrr, X86::PSLLWrm, 16 },
- { X86::PSRADrr, X86::PSRADrm, 16 },
- { X86::PSRAWrr, X86::PSRAWrm, 16 },
- { X86::PSRLDrr, X86::PSRLDrm, 16 },
- { X86::PSRLQrr, X86::PSRLQrm, 16 },
- { X86::PSRLWrr, X86::PSRLWrm, 16 },
- { X86::PSUBBrr, X86::PSUBBrm, 16 },
- { X86::PSUBDrr, X86::PSUBDrm, 16 },
- { X86::PSUBSBrr, X86::PSUBSBrm, 16 },
- { X86::PSUBSWrr, X86::PSUBSWrm, 16 },
- { X86::PSUBWrr, X86::PSUBWrm, 16 },
- { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, 16 },
- { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, 16 },
- { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, 16 },
- { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, 16 },
- { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, 16 },
- { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, 16 },
- { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, 16 },
- { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, 16 },
- { X86::PXORrr, X86::PXORrm, 16 },
- { X86::SBB32rr, X86::SBB32rm, 0 },
- { X86::SBB64rr, X86::SBB64rm, 0 },
- { X86::SHUFPDrri, X86::SHUFPDrmi, 16 },
- { X86::SHUFPSrri, X86::SHUFPSrmi, 16 },
- { X86::SUB16rr, X86::SUB16rm, 0 },
- { X86::SUB32rr, X86::SUB32rm, 0 },
- { X86::SUB64rr, X86::SUB64rm, 0 },
- { X86::SUB8rr, X86::SUB8rm, 0 },
- { X86::SUBPDrr, X86::SUBPDrm, 16 },
- { X86::SUBPSrr, X86::SUBPSrm, 16 },
- { X86::SUBSDrr, X86::SUBSDrm, 0 },
- { X86::SUBSSrr, X86::SUBSSrm, 0 },
+ { X86::ADC32rr, X86::ADC32rm, 0 },
+ { X86::ADC64rr, X86::ADC64rm, 0 },
+ { X86::ADD16rr, X86::ADD16rm, 0 },
+ { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
+ { X86::ADD32rr, X86::ADD32rm, 0 },
+ { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
+ { X86::ADD64rr, X86::ADD64rm, 0 },
+ { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
+ { X86::ADD8rr, X86::ADD8rm, 0 },
+ { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
+ { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
+ { X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
+ { X86::AND16rr, X86::AND16rm, 0 },
+ { X86::AND32rr, X86::AND32rm, 0 },
+ { X86::AND64rr, X86::AND64rm, 0 },
+ { X86::AND8rr, X86::AND8rm, 0 },
+ { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
+ { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
+ { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
+ { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
+ { X86::CMOVA16rr, X86::CMOVA16rm, 0 },
+ { X86::CMOVA32rr, X86::CMOVA32rm, 0 },
+ { X86::CMOVA64rr, X86::CMOVA64rm, 0 },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
+ { X86::CMOVB16rr, X86::CMOVB16rm, 0 },
+ { X86::CMOVB32rr, X86::CMOVB32rm, 0 },
+ { X86::CMOVB64rr, X86::CMOVB64rm, 0 },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
+ { X86::CMOVE16rr, X86::CMOVE16rm, 0 },
+ { X86::CMOVE32rr, X86::CMOVE32rm, 0 },
+ { X86::CMOVE64rr, X86::CMOVE64rm, 0 },
+ { X86::CMOVG16rr, X86::CMOVG16rm, 0 },
+ { X86::CMOVG32rr, X86::CMOVG32rm, 0 },
+ { X86::CMOVG64rr, X86::CMOVG64rm, 0 },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
+ { X86::CMOVL16rr, X86::CMOVL16rm, 0 },
+ { X86::CMOVL32rr, X86::CMOVL32rm, 0 },
+ { X86::CMOVL64rr, X86::CMOVL64rm, 0 },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
+ { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
+ { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
+ { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
+ { X86::CMOVO16rr, X86::CMOVO16rm, 0 },
+ { X86::CMOVO32rr, X86::CMOVO32rm, 0 },
+ { X86::CMOVO64rr, X86::CMOVO64rm, 0 },
+ { X86::CMOVP16rr, X86::CMOVP16rm, 0 },
+ { X86::CMOVP32rr, X86::CMOVP32rm, 0 },
+ { X86::CMOVP64rr, X86::CMOVP64rm, 0 },
+ { X86::CMOVS16rr, X86::CMOVS16rm, 0 },
+ { X86::CMOVS32rr, X86::CMOVS32rm, 0 },
+ { X86::CMOVS64rr, X86::CMOVS64rm, 0 },
+ { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
+ { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
+ { X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
+ { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
+ { X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
+ { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
+ { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
+ { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 },
+ { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 },
+ { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 },
+ { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 },
+ { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 },
+ { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
+ { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
+ { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
+ { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
+ { X86::IMUL16rr, X86::IMUL16rm, 0 },
+ { X86::IMUL32rr, X86::IMUL32rm, 0 },
+ { X86::IMUL64rr, X86::IMUL64rm, 0 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 },
+ { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
+ { X86::MAXPDrr_Int, X86::MAXPDrm_Int, TB_ALIGN_16 },
+ { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
+ { X86::MAXPSrr_Int, X86::MAXPSrm_Int, TB_ALIGN_16 },
+ { X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
+ { X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
+ { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
+ { X86::MINPDrr_Int, X86::MINPDrm_Int, TB_ALIGN_16 },
+ { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
+ { X86::MINPSrr_Int, X86::MINPSrm_Int, TB_ALIGN_16 },
+ { X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
+ { X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
+ { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
+ { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
+ { X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::OR16rr, X86::OR16rm, 0 },
+ { X86::OR32rr, X86::OR32rm, 0 },
+ { X86::OR64rr, X86::OR64rm, 0 },
+ { X86::OR8rr, X86::OR8rm, 0 },
+ { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
+ { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
+ { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
+ { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
+ { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
+ { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
+ { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
+ { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
+ { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
+ { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
+ { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
+ { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
+ { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 },
+ { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
+ { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
+ { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
+ { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
+ { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
+ { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
+ { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
+ { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
+ { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
+ { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
+ { X86::PORrr, X86::PORrm, TB_ALIGN_16 },
+ { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
+ { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
+ { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
+ { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
+ { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
+ { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
+ { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
+ { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
+ { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
+ { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
+ { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
+ { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
+ { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
+ { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
+ { X86::SBB32rr, X86::SBB32rm, 0 },
+ { X86::SBB64rr, X86::SBB64rm, 0 },
+ { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
+ { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
+ { X86::SUB16rr, X86::SUB16rm, 0 },
+ { X86::SUB32rr, X86::SUB32rm, 0 },
+ { X86::SUB64rr, X86::SUB64rm, 0 },
+ { X86::SUB8rr, X86::SUB8rm, 0 },
+ { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
+ { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
+ { X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSSrr, X86::SUBSSrm, 0 },
// FIXME: TEST*rr -> swapped operand of TEST*mr.
- { X86::UNPCKHPDrr, X86::UNPCKHPDrm, 16 },
- { X86::UNPCKHPSrr, X86::UNPCKHPSrm, 16 },
- { X86::UNPCKLPDrr, X86::UNPCKLPDrm, 16 },
- { X86::UNPCKLPSrr, X86::UNPCKLPSrm, 16 },
- { X86::XOR16rr, X86::XOR16rm, 0 },
- { X86::XOR32rr, X86::XOR32rm, 0 },
- { X86::XOR64rr, X86::XOR64rm, 0 },
- { X86::XOR8rr, X86::XOR8rm, 0 },
- { X86::XORPDrr, X86::XORPDrm, 16 },
- { X86::XORPSrr, X86::XORPSrm, 16 }
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
+ { X86::XOR16rr, X86::XOR16rm, 0 },
+ { X86::XOR32rr, X86::XOR32rm, 0 },
+ { X86::XOR64rr, X86::XOR64rm, 0 },
+ { X86::XOR8rr, X86::XOR8rm, 0 },
+ { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
+ { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
+ // AVX 128-bit versions of foldable instructions
+ { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
+ { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 },
+ { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
+ { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
+ { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
+ { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
+ { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
+ { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
+ { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
+ { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
+ { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
+ { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
+ { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
+ { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm, 0 },
+ { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
+ { X86::Int_VCVTTSD2SIrr, X86::Int_VCVTTSD2SIrm, 0 },
+ { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
+ { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm, 0 },
+ { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
+ { X86::Int_VCVTTSS2SIrr, X86::Int_VCVTTSS2SIrm, 0 },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
+ { X86::VSQRTSDr, X86::VSQRTSDm, 0 },
+ { X86::VSQRTSSr, X86::VSQRTSSm, 0 },
+ { X86::VADDPDrr, X86::VADDPDrm, TB_ALIGN_16 },
+ { X86::VADDPSrr, X86::VADDPSrm, TB_ALIGN_16 },
+ { X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, TB_ALIGN_16 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, TB_ALIGN_16 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, TB_ALIGN_16 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, TB_ALIGN_16 },
+ { X86::VANDPDrr, X86::VANDPDrm, TB_ALIGN_16 },
+ { X86::VANDPSrr, X86::VANDPSrm, TB_ALIGN_16 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, TB_ALIGN_16 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, TB_ALIGN_16 },
+ { X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, TB_ALIGN_16 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, TB_ALIGN_16 },
+ { X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 },
+ { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 },
+ { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 },
+ { X86::VFsANDPSrr, X86::VFsANDPSrm, TB_ALIGN_16 },
+ { X86::VFsORPDrr, X86::VFsORPDrm, TB_ALIGN_16 },
+ { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 },
+ { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 },
+ { X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, TB_ALIGN_16 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, TB_ALIGN_16 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, TB_ALIGN_16 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, TB_ALIGN_16 },
+ { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 },
+ { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, TB_ALIGN_16 },
+ { X86::VMAXPDrr_Int, X86::VMAXPDrm_Int, TB_ALIGN_16 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, TB_ALIGN_16 },
+ { X86::VMAXPSrr_Int, X86::VMAXPSrm_Int, TB_ALIGN_16 },
+ { X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 },
+ { X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
+ { X86::VMINPDrr, X86::VMINPDrm, TB_ALIGN_16 },
+ { X86::VMINPDrr_Int, X86::VMINPDrm_Int, TB_ALIGN_16 },
+ { X86::VMINPSrr, X86::VMINPSrm, TB_ALIGN_16 },
+ { X86::VMINPSrr_Int, X86::VMINPSrm_Int, TB_ALIGN_16 },
+ { X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
+ { X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, TB_ALIGN_16 },
+ { X86::VMULPSrr, X86::VMULPSrm, TB_ALIGN_16 },
+ { X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VORPDrr, X86::VORPDrm, TB_ALIGN_16 },
+ { X86::VORPSrr, X86::VORPSrm, TB_ALIGN_16 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, TB_ALIGN_16 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, TB_ALIGN_16 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, TB_ALIGN_16 },
+ { X86::VPADDBrr, X86::VPADDBrm, TB_ALIGN_16 },
+ { X86::VPADDDrr, X86::VPADDDrm, TB_ALIGN_16 },
+ { X86::VPADDQrr, X86::VPADDQrm, TB_ALIGN_16 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, TB_ALIGN_16 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, TB_ALIGN_16 },
+ { X86::VPADDWrr, X86::VPADDWrm, TB_ALIGN_16 },
+ { X86::VPANDNrr, X86::VPANDNrm, TB_ALIGN_16 },
+ { X86::VPANDrr, X86::VPANDrm, TB_ALIGN_16 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, TB_ALIGN_16 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, TB_ALIGN_16 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, TB_ALIGN_16 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, TB_ALIGN_16 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, TB_ALIGN_16 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, TB_ALIGN_16 },
+ { X86::VPINSRWrri, X86::VPINSRWrmi, TB_ALIGN_16 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, TB_ALIGN_16 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, TB_ALIGN_16 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, TB_ALIGN_16 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, TB_ALIGN_16 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, TB_ALIGN_16 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, TB_ALIGN_16 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, TB_ALIGN_16 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, TB_ALIGN_16 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, TB_ALIGN_16 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, TB_ALIGN_16 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, TB_ALIGN_16 },
+ { X86::VPORrr, X86::VPORrm, TB_ALIGN_16 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, TB_ALIGN_16 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, TB_ALIGN_16 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, TB_ALIGN_16 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, TB_ALIGN_16 },
+ { X86::VPSRADrr, X86::VPSRADrm, TB_ALIGN_16 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, TB_ALIGN_16 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, TB_ALIGN_16 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, TB_ALIGN_16 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, TB_ALIGN_16 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, TB_ALIGN_16 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, TB_ALIGN_16 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, TB_ALIGN_16 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, TB_ALIGN_16 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, TB_ALIGN_16 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, TB_ALIGN_16 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, TB_ALIGN_16 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, TB_ALIGN_16 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, TB_ALIGN_16 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, TB_ALIGN_16 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, TB_ALIGN_16 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, TB_ALIGN_16 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, TB_ALIGN_16 },
+ { X86::VPXORrr, X86::VPXORrm, TB_ALIGN_16 },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, TB_ALIGN_16 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, TB_ALIGN_16 },
+ { X86::VSUBPDrr, X86::VSUBPDrm, TB_ALIGN_16 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, TB_ALIGN_16 },
+ { X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, TB_ALIGN_16 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, TB_ALIGN_16 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, TB_ALIGN_16 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, TB_ALIGN_16 },
+ { X86::VXORPDrr, X86::VXORPDrm, TB_ALIGN_16 },
+ { X86::VXORPSrr, X86::VXORPSrm, TB_ALIGN_16 }
+ // FIXME: add AVX 256-bit foldable instructions
};
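// Each OpTbl2 entry is { register-form opcode, memory-form opcode, flags };
// e.g. TB_ALIGN_16 marks memory forms that need a 16-byte-aligned operand,
// which the folding code below checks before rewriting an instruction.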
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
unsigned RegOp = OpTbl2[i][0];
- unsigned MemOp = OpTbl2[i][1] & ~TB_FLAGS;
- unsigned Align = OpTbl2[i][2];
-
- assert(!RegOp2MemOpTable2.count(RegOp) && "Duplicate entry!");
- RegOp2MemOpTable2[RegOp] = std::make_pair(MemOp, Align);
-
- // If this is not a reversible operation (because there is a many->one
- // mapping), don't insert the reverse of the operation into MemOp2RegOpTable.
- if (OpTbl2[i][1] & TB_NOT_REVERSABLE)
- continue;
+ unsigned MemOp = OpTbl2[i][1];
+ unsigned Flags = OpTbl2[i][2];
+ AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
+ RegOp, MemOp,
+ // Index 2, folded load
+ Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
+ }
+}
- // Index 2, folded load
- unsigned AuxInfo = 2 | (1 << 4);
- assert(!MemOp2RegOpTable.count(MemOp) &&
+void
+X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags) {
+ if ((Flags & TB_NO_FORWARD) == 0) {
+ assert(!R2MTable.count(RegOp) && "Duplicate entry!");
+ R2MTable[RegOp] = std::make_pair(MemOp, Flags);
+ }
+ if ((Flags & TB_NO_REVERSE) == 0) {
+ assert(!M2RTable.count(MemOp) &&
"Duplicated entries in unfolding maps?");
- MemOp2RegOpTable[MemOp] = std::make_pair(RegOp, AuxInfo);
- }
+ M2RTable[MemOp] = std::make_pair(RegOp, Flags);
+ }
}
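// A sketch of how the packed Flags word decodes, mirroring the TB_* masks
// used by the fold/unfold paths below:
//   unsigned Index = Flags & TB_INDEX_MASK; // folded operand index
//   bool FoldedLoad = (Flags & TB_FOLDED_LOAD) != 0;
//   bool FoldedStore = (Flags & TB_FOLDED_STORE) != 0;
//   unsigned MinAlign = (Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;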
bool
@@ -796,6 +1000,11 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::MOVAPSrm:
case X86::MOVAPDrm:
case X86::MOVDQArm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
case X86::VMOVAPSYrm:
case X86::VMOVAPDYrm:
case X86::VMOVDQAYrm:
@@ -820,6 +1029,11 @@ static bool isFrameStoreOpcode(int Opcode) {
case X86::MOVAPSmr:
case X86::MOVAPDmr:
case X86::MOVDQAmr:
+ case X86::VMOVSSmr:
+ case X86::VMOVSDmr:
+ case X86::VMOVAPSmr:
+ case X86::VMOVAPDmr:
+ case X86::VMOVDQAmr:
case X86::VMOVAPSYmr:
case X86::VMOVAPDYmr:
case X86::VMOVDQAYmr:
@@ -852,24 +1066,6 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
return 0;
}
-bool X86InstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const {
- for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
- oe = MI->memoperands_end();
- o != oe;
- ++o) {
- if ((*o)->isLoad() && (*o)->getValue())
- if (const FixedStackPseudoSourceValue *Value =
- dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
- FrameIndex = Value->getFrameIndex();
- MMO = *o;
- return true;
- }
- }
- return false;
-}
-
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
int &FrameIndex) const {
if (isFrameStoreOpcode(MI->getOpcode()))
@@ -892,24 +1088,6 @@ unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
return 0;
}
-bool X86InstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const {
- for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
- oe = MI->memoperands_end();
- o != oe;
- ++o) {
- if ((*o)->isStore() && (*o)->getValue())
- if (const FixedStackPseudoSourceValue *Value =
- dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
- FrameIndex = Value->getFrameIndex();
- MMO = *o;
- return true;
- }
- }
- return false;
-}
-
/// regIsPICBase - Return true if register is PIC base (i.e. defined by
/// X86::MOVPC32r).
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
@@ -941,12 +1119,20 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVDQArm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
case X86::FsMOVAPSrm:
case X86::FsMOVAPDrm: {
// Loads from constant pools are trivially rematerializable.
@@ -1009,15 +1195,11 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) {
MachineBasicBlock::iterator E = MBB.end();
- // It's always safe to clobber EFLAGS at the end of a block.
- if (I == E)
- return true;
-
// For compile time consideration, if we are not able to determine the
// safety after visiting 4 instructions in each direction, we will assume
// it's not safe.
MachineBasicBlock::iterator Iter = I;
- for (unsigned i = 0; i < 4; ++i) {
+ for (unsigned i = 0; Iter != E && i < 4; ++i) {
bool SeenDef = false;
for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
MachineOperand &MO = Iter->getOperand(j);
@@ -1037,10 +1219,16 @@ static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
// Skip over DBG_VALUE.
while (Iter != E && Iter->isDebugValue())
++Iter;
+ }
- // If we make it to the end of the block, it's safe to clobber EFLAGS.
- if (Iter == E)
- return true;
+ // It is safe to clobber EFLAGS at the end of a block if no successor has it
+ // live in.
+ if (Iter == E) {
+ for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end(); SI != SE; ++SI)
+ if ((*SI)->isLiveIn(X86::EFLAGS))
+ return false;
+ return true;
}
MachineBasicBlock::iterator B = MBB.begin();
@@ -1946,7 +2134,8 @@ static bool isHReg(unsigned Reg) {
}
// Try and copy between VR128/VR64 and GR64 registers.
-static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg) {
+static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
+ bool HasAVX) {
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
// SrcReg(GR64) -> DestReg(VR128)
@@ -1955,7 +2144,7 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg) {
if (X86::GR64RegClass.contains(DestReg)) {
if (X86::VR128RegClass.contains(SrcReg)) {
// Copy from a VR128 register to a GR64 register.
- return X86::MOVPQIto64rr;
+ return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
} else if (X86::VR64RegClass.contains(SrcReg)) {
// Copy from a VR64 register to a GR64 register.
return X86::MOVSDto64rr;
@@ -1963,12 +2152,23 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg) {
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128RegClass.contains(DestReg))
- return X86::MOV64toPQIrr;
+ return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
// Copy from a GR64 register to a VR64 register.
else if (X86::VR64RegClass.contains(DestReg))
return X86::MOV64toSDrr;
}
+ // SrcReg(FR32) -> DestReg(GR32)
+ // SrcReg(GR32) -> DestReg(FR32)
+
+ if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+ // Copy from a FR32 register to a GR32 register.
+ return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+
+ if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+ // Copy from a GR32 register to a FR32 register.
+ return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+
return 0;
}
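// Illustration: with HasAVX set, a VR128 -> GR64 copy selected above becomes
// VMOVPQIto64rr rather than MOVPQIto64rr; copyPhysReg below passes the
// subtarget's AVX flag for exactly this choice.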
@@ -1977,6 +2177,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
// First deal with the normal symmetric copies.
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
@@ -1988,18 +2189,21 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Copying to or from a physical H register on x86-64 requires a NOREX
// move. Otherwise use a normal move.
if ((isHReg(DestReg) || isHReg(SrcReg)) &&
- TM.getSubtarget<X86Subtarget>().is64Bit())
+ TM.getSubtarget<X86Subtarget>().is64Bit()) {
Opc = X86::MOV8rr_NOREX;
- else
+ // Both operands must be encodable without a REX prefix.
+ assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
+ "8-bit H register can not be copied outside GR8_NOREX");
+ } else
Opc = X86::MOV8rr;
} else if (X86::VR128RegClass.contains(DestReg, SrcReg))
- Opc = X86::MOVAPSrr;
+ Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
Opc = X86::VMOVAPSYrr;
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MMX_MOVQ64rr;
else
- Opc = CopyToFromAsymmetricReg(DestReg, SrcReg);
+ Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
if (Opc) {
BuildMI(MBB, MI, DL, get(Opc), DestReg)
@@ -2043,6 +2247,7 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
bool isStackAligned,
const TargetMachine &TM,
bool load) {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
switch (RC->getSize()) {
default:
llvm_unreachable("Unknown spill size");
@@ -2061,7 +2266,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
if (X86::GR32RegClass.hasSubClassEq(RC))
return load ? X86::MOV32rm : X86::MOV32mr;
if (X86::FR32RegClass.hasSubClassEq(RC))
- return load ? X86::MOVSSrm : X86::MOVSSmr;
+ return load ?
+ (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
+ (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
llvm_unreachable("Unknown 4-byte regclass");
@@ -2069,7 +2276,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
if (X86::GR64RegClass.hasSubClassEq(RC))
return load ? X86::MOV64rm : X86::MOV64mr;
if (X86::FR64RegClass.hasSubClassEq(RC))
- return load ? X86::MOVSDrm : X86::MOVSDmr;
+ return load ?
+ (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
+ (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
if (X86::VR64RegClass.hasSubClassEq(RC))
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
@@ -2078,13 +2287,18 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
case 10:
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
return load ? X86::LD_Fp80m : X86::ST_FpP80m;
- case 16:
+ case 16: {
assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
// If stack is realigned we can use aligned stores.
if (isStackAligned)
- return load ? X86::MOVAPSrm : X86::MOVAPSmr;
+ return load ?
+ (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
+ (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
else
- return load ? X86::MOVUPSrm : X86::MOVUPSmr;
+ return load ?
+ (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
+ (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
+ }
case 32:
assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
@@ -2118,7 +2332,8 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
- bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= 16) ||
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2133,7 +2348,9 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
- bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= 16;
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = MMOBegin != MMOEnd &&
+ (*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
@@ -2151,7 +2368,8 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
- bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= 16) ||
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2164,7 +2382,9 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
- bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= 16;
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = MMOBegin != MMOEnd &&
+ (*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
@@ -2174,6 +2394,40 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
NewMIs.push_back(MIB);
}
+/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
+/// instruction with two undef reads of the register being defined. This is
+/// used for mapping:
+/// %xmm4 = V_SET0
+/// to:
+/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+///
+static bool Expand2AddrUndef(MachineInstr *MI, const MCInstrDesc &Desc) {
+ assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
+ unsigned Reg = MI->getOperand(0).getReg();
+ MI->setDesc(Desc);
+
+ // MachineInstr::addOperand() will insert explicit operands before any
+ // implicit operands.
+ MachineInstrBuilder(MI).addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ // But we don't trust that.
+ assert(MI->getOperand(1).getReg() == Reg &&
+ MI->getOperand(2).getReg() == Reg && "Misplaced operand");
+ return true;
+}
+
+bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ switch (MI->getOpcode()) {
+ case X86::V_SET0:
+ return Expand2AddrUndef(MI, get(HasAVX ? X86::VPXORrr : X86::PXORrr));
+ case X86::TEST8ri_NOREX:
+ MI->setDesc(get(X86::TEST8ri));
+ return true;
+ }
+ return false;
+}
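+// For illustration, with AVX available the V_SET0 case above rewrites
+//   %xmm4 = V_SET0
+// into
+//   %xmm4 = VPXORrr %xmm4<undef>, %xmm4<undef>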
+
MachineInstr*
X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
int FrameIx, uint64_t Offset,
@@ -2305,7 +2559,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
OpcodeTablePtr->find(MI->getOpcode());
if (I != OpcodeTablePtr->end()) {
unsigned Opcode = I->second.first;
- unsigned MinAlign = I->second.second;
+ unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
if (Align < MinAlign)
return NULL;
bool NarrowToMOV32rm = false;
@@ -2352,6 +2606,51 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
return NULL;
}
+/// hasPartialRegUpdate - Return true for all instructions that only update
+/// the first 32 or 64 bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed, e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and can improve
+/// performance, e.g.:
+///
+/// movss (%rdi), %xmm0
+/// cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+/// cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::CVTSD2SSrr:
+ case X86::Int_CVTSD2SSrr:
+ case X86::CVTSS2SDrr:
+ case X86::Int_CVTSS2SDrr:
+ case X86::RCPSSr:
+ case X86::RCPSSr_Int:
+ case X86::ROUNDSDr:
+ case X86::ROUNDSSr:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSr_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSr_Int:
+ // AVX encoded versions
+ case X86::VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::VRCPSSr:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSSr:
+ case X86::VRSQRTSSr:
+ case X86::VSQRTSSr:
+ return true;
+ }
+
+ return false;
+}
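+// Sketch of how the folding code below consults this predicate:
+//   if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+//       hasPartialRegUpdate(MI->getOpcode()))
+//     return 0; // leave the load unfolded to break the false dependency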
MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
@@ -2360,22 +2659,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Check switch flag
if (NoFusing) return NULL;
- if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
- switch (MI->getOpcode()) {
- case X86::CVTSD2SSrr:
- case X86::Int_CVTSD2SSrr:
- case X86::CVTSS2SDrr:
- case X86::Int_CVTSS2SDrr:
- case X86::RCPSSr:
- case X86::RCPSSr_Int:
- case X86::ROUNDSDr:
- case X86::ROUNDSSr:
- case X86::RSQRTSSr:
- case X86::RSQRTSSr_Int:
- case X86::SQRTSSr:
- case X86::SQRTSSr_Int:
- return 0;
- }
+ // Unless optimizing for size, don't fold to avoid partial
+ // register update stalls
+ if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+ hasPartialRegUpdate(MI->getOpcode()))
+ return 0;
const MachineFrameInfo *MFI = MF.getFrameInfo();
unsigned Size = MFI->getObjectSize(FrameIndex);
@@ -2412,22 +2700,11 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Check switch flag
if (NoFusing) return NULL;
- if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
- switch (MI->getOpcode()) {
- case X86::CVTSD2SSrr:
- case X86::Int_CVTSD2SSrr:
- case X86::CVTSS2SDrr:
- case X86::Int_CVTSS2SDrr:
- case X86::RCPSSr:
- case X86::RCPSSr_Int:
- case X86::ROUNDSDr:
- case X86::ROUNDSSr:
- case X86::RSQRTSSr:
- case X86::RSQRTSSr_Int:
- case X86::SQRTSSr:
- case X86::SQRTSSr_Int:
- return 0;
- }
+ // Unless optimizing for size, don't fold to avoid partial
+ // register update stalls
+ if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+ hasPartialRegUpdate(MI->getOpcode()))
+ return 0;
// Determine the alignment of the load.
unsigned Alignment = 0;
@@ -2439,13 +2716,9 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
case X86::AVX_SET0PDY:
Alignment = 32;
break;
- case X86::V_SET0PS:
- case X86::V_SET0PD:
- case X86::V_SET0PI:
+ case X86::V_SET0:
case X86::V_SETALLONES:
- case X86::AVX_SET0PS:
- case X86::AVX_SET0PD:
- case X86::AVX_SET0PI:
+ case X86::AVX_SETALLONES:
Alignment = 16;
break;
case X86::FsFLD0SD:
@@ -2481,18 +2754,16 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
switch (LoadMI->getOpcode()) {
- case X86::V_SET0PS:
- case X86::V_SET0PD:
- case X86::V_SET0PI:
+ case X86::V_SET0:
case X86::V_SETALLONES:
- case X86::AVX_SET0PS:
- case X86::AVX_SET0PD:
- case X86::AVX_SET0PI:
case X86::AVX_SET0PSY:
case X86::AVX_SET0PDY:
+ case X86::AVX_SETALLONES:
case X86::FsFLD0SD:
- case X86::FsFLD0SS: {
- // Folding a V_SET0P? or V_SETALLONES as a load, to ease register pressure.
+ case X86::FsFLD0SS:
+ case X86::VFsFLD0SD:
+ case X86::VFsFLD0SS: {
+ // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
// Create a constant-pool entry and operands to load from it.
// Medium and large mode can't fold loads this way.
@@ -2515,7 +2786,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Create a constant-pool entry.
MachineConstantPool &MCP = *MF.getConstantPool();
- const Type *Ty;
+ Type *Ty;
unsigned Opc = LoadMI->getOpcode();
if (Opc == X86::FsFLD0SS || Opc == X86::VFsFLD0SS)
Ty = Type::getFloatTy(MF.getFunction()->getContext());
@@ -2525,9 +2796,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
- const Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
- Constant::getAllOnesValue(Ty) :
- Constant::getNullValue(Ty);
+
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES);
+ const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
+ Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
// Create operands to load from the constant pool entry.
@@ -2615,9 +2887,9 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
if (I == MemOp2RegOpTable.end())
return false;
unsigned Opc = I->second.first;
- unsigned Index = I->second.second & 0xf;
- bool FoldedLoad = I->second.second & (1 << 4);
- bool FoldedStore = I->second.second & (1 << 5);
+ unsigned Index = I->second.second & TB_INDEX_MASK;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
@@ -2743,9 +3015,9 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
if (I == MemOp2RegOpTable.end())
return false;
unsigned Opc = I->second.first;
- unsigned Index = I->second.second & 0xf;
- bool FoldedLoad = I->second.second & (1 << 4);
- bool FoldedStore = I->second.second & (1 << 5);
+ unsigned Index = I->second.second & TB_INDEX_MASK;
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
const MCInstrDesc &MCID = get(Opc);
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI);
unsigned NumDefs = MCID.NumDefs;
@@ -2780,7 +3052,9 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
!TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
// Do not introduce a slow unaligned load.
return false;
- bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (*MMOs.first) &&
+ (*MMOs.first)->getAlignment() >= Alignment;
Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
VT, MVT::Other, &AddrOps[0], AddrOps.size());
NewNodes.push_back(Load);
@@ -2822,7 +3096,9 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
!TM.getSubtarget<X86Subtarget>().isUnalignedMemAccessFast())
// Do not introduce a slow unaligned store.
return false;
- bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= 16;
+ unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+ bool isAligned = (*MMOs.first) &&
+ (*MMOs.first)->getAlignment() >= Alignment;
SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
isAligned, TM),
dl, MVT::Other,
@@ -2843,14 +3119,14 @@ unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
MemOp2RegOpTable.find(Opc);
if (I == MemOp2RegOpTable.end())
return 0;
- bool FoldedLoad = I->second.second & (1 << 4);
- bool FoldedStore = I->second.second & (1 << 5);
+ bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
+ bool FoldedStore = I->second.second & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return 0;
if (UnfoldStore && !FoldedStore)
return 0;
if (LoadRegIndex)
- *LoadRegIndex = I->second.second & 0xf;
+ *LoadRegIndex = I->second.second & TB_INDEX_MASK;
return I->second.first;
}
@@ -2881,6 +3157,16 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVAPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
@@ -2908,6 +3194,16 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVAPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
+ // AVX load instructions
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
@@ -3007,31 +3303,6 @@ isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
}
-
-/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or higher)
-/// register? e.g. r8, xmm8, xmm13, etc.
-bool X86InstrInfo::isX86_64ExtendedReg(unsigned RegNo) {
- switch (RegNo) {
- default: break;
- case X86::R8: case X86::R9: case X86::R10: case X86::R11:
- case X86::R12: case X86::R13: case X86::R14: case X86::R15:
- case X86::R8D: case X86::R9D: case X86::R10D: case X86::R11D:
- case X86::R12D: case X86::R13D: case X86::R14D: case X86::R15D:
- case X86::R8W: case X86::R9W: case X86::R10W: case X86::R11W:
- case X86::R12W: case X86::R13W: case X86::R14W: case X86::R15W:
- case X86::R8B: case X86::R9B: case X86::R10B: case X86::R11B:
- case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B:
- case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
- case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
- case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
- case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
- case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11:
- case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15:
- return true;
- }
- return false;
-}
-
/// getGlobalBaseReg - Return a virtual register initialized with the
/// global base register value. Output instructions required to
/// initialize the register in the function entry block, if necessary.
@@ -3072,7 +3343,6 @@ static const unsigned ReplaceableInstrs[][3] = {
{ X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
{ X86::ORPSrm, X86::ORPDrm, X86::PORrm },
{ X86::ORPSrr, X86::ORPDrr, X86::PORrr },
- { X86::V_SET0PS, X86::V_SET0PD, X86::V_SET0PI },
{ X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
{ X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
// AVX 128-bit support
@@ -3088,7 +3358,6 @@ static const unsigned ReplaceableInstrs[][3] = {
{ X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
{ X86::VORPSrm, X86::VORPDrm, X86::VPORrm },
{ X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
- { X86::AVX_SET0PS, X86::AVX_SET0PD, X86::AVX_SET0PI },
{ X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
{ X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
// AVX 256-bit support
@@ -3111,13 +3380,13 @@ static const unsigned *lookup(unsigned opcode, unsigned domain) {
}
std::pair<uint16_t, uint16_t>
-X86InstrInfo::GetSSEDomain(const MachineInstr *MI) const {
+X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
return std::make_pair(domain,
domain && lookup(MI->getOpcode(), domain) ? 0xe : 0);
}
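// The second element is a mask of alternative domains: 0xe (binary 1110)
// means the opcode can be reassigned to any of domains 1-3 through the
// ReplaceableInstrs tables above.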
-void X86InstrInfo::SetSSEDomain(MachineInstr *MI, unsigned Domain) const {
+void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
assert(Domain>0 && Domain<4 && "Invalid execution domain");
uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
assert(dom && "Not an SSE instruction");
@@ -3158,6 +3427,29 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::SQRTSSm_Int:
case X86::SQRTSSr:
case X86::SQRTSSr_Int:
+ // AVX instructions with high latency
+ case X86::VDIVSDrm:
+ case X86::VDIVSDrm_Int:
+ case X86::VDIVSDrr:
+ case X86::VDIVSDrr_Int:
+ case X86::VDIVSSrm:
+ case X86::VDIVSSrm_Int:
+ case X86::VDIVSSrr:
+ case X86::VDIVSSrr_Int:
+ case X86::VSQRTPDm:
+ case X86::VSQRTPDm_Int:
+ case X86::VSQRTPDr:
+ case X86::VSQRTPDr_Int:
+ case X86::VSQRTPSm:
+ case X86::VSQRTPSm_Int:
+ case X86::VSQRTPSr:
+ case X86::VSQRTPSr_Int:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSSr:
return true;
}
}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 5f2eba3..97009db 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -27,24 +27,6 @@ namespace llvm {
class X86TargetMachine;
namespace X86 {
- // Enums for memory operand decoding. Each memory operand is represented with
- // a 5 operand sequence in the form:
- // [BaseReg, ScaleAmt, IndexReg, Disp, Segment]
- // These enums help decode this.
- enum {
- AddrBaseReg = 0,
- AddrScaleAmt = 1,
- AddrIndexReg = 2,
- AddrDisp = 3,
-
- /// AddrSegmentReg - The operand # of the segment in the memory operand.
- AddrSegmentReg = 4,
-
- /// AddrNumOperands - Total number of operands in a memory reference.
- AddrNumOperands = 5
- };
-
-
// X86 specific condition code. These correspond to X86_*_COND in
// X86InstrInfo.td. They must be kept in synch.
enum CondCode {
@@ -82,133 +64,8 @@ namespace X86 {
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(X86::CondCode CC);
+} // end namespace X86;
-}
-
-/// X86II - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace X86II {
- /// Target Operand Flag enum.
- enum TOF {
- //===------------------------------------------------------------------===//
- // X86 Specific MachineOperand flags.
-
- MO_NO_FLAG,
-
- /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
- /// relocation of:
- /// SYMBOL_LABEL + [. - PICBASELABEL]
- MO_GOT_ABSOLUTE_ADDRESS,
-
- /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
- /// immediate should get the value of the symbol minus the PIC base label:
- /// SYMBOL_LABEL - PICBASELABEL
- MO_PIC_BASE_OFFSET,
-
- /// MO_GOT - On a symbol operand this indicates that the immediate is the
- /// offset to the GOT entry for the symbol name from the base of the GOT.
- ///
- /// See the X86-64 ELF ABI supplement for more details.
- /// SYMBOL_LABEL @GOT
- MO_GOT,
-
- /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
- /// the offset to the location of the symbol name from the base of the GOT.
- ///
- /// See the X86-64 ELF ABI supplement for more details.
- /// SYMBOL_LABEL @GOTOFF
- MO_GOTOFF,
-
- /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
- /// offset to the GOT entry for the symbol name from the current code
- /// location.
- ///
- /// See the X86-64 ELF ABI supplement for more details.
- /// SYMBOL_LABEL @GOTPCREL
- MO_GOTPCREL,
-
- /// MO_PLT - On a symbol operand this indicates that the immediate is
- /// offset to the PLT entry of symbol name from the current code location.
- ///
- /// See the X86-64 ELF ABI supplement for more details.
- /// SYMBOL_LABEL @PLT
- MO_PLT,
-
- /// MO_TLSGD - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
- ///
- /// See 'ELF Handling for Thread-Local Storage' for more details.
- /// SYMBOL_LABEL @TLSGD
- MO_TLSGD,
-
- /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
- ///
- /// See 'ELF Handling for Thread-Local Storage' for more details.
- /// SYMBOL_LABEL @GOTTPOFF
- MO_GOTTPOFF,
-
- /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
- ///
- /// See 'ELF Handling for Thread-Local Storage' for more details.
- /// SYMBOL_LABEL @INDNTPOFF
- MO_INDNTPOFF,
-
- /// MO_TPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
- ///
- /// See 'ELF Handling for Thread-Local Storage' for more details.
- /// SYMBOL_LABEL @TPOFF
- MO_TPOFF,
-
- /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
- ///
- /// See 'ELF Handling for Thread-Local Storage' for more details.
- /// SYMBOL_LABEL @NTPOFF
- MO_NTPOFF,
-
- /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
- /// reference is actually to the "__imp_FOO" symbol. This is used for
- /// dllimport linkage on windows.
- MO_DLLIMPORT,
-
- /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
- /// reference is actually to the "FOO$stub" symbol. This is used for calls
- /// and jumps to external functions on Tiger and earlier.
- MO_DARWIN_STUB,
-
- /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
- /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
- /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
- MO_DARWIN_NONLAZY,
-
- /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
- /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
- /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
- MO_DARWIN_NONLAZY_PIC_BASE,
-
- /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
- /// indicates that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
- /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
- /// stub.
- MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE,
-
- /// MO_TLVP - On a symbol operand this indicates that the immediate is
- /// some TLS offset.
- ///
- /// This is the TLS offset for the Darwin TLS mechanism.
- MO_TLVP,
-
- /// MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate
- /// is some TLS offset from the picbase.
- ///
- /// This is the 32-bit TLS offset for Darwin TLS in PIC mode.
- MO_TLVP_PIC_BASE
- };
-}
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
/// a reference to a stub for a global, not the global itself.
@@ -243,353 +100,6 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
}
}
-/// X86II - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace X86II {
- enum {
- //===------------------------------------------------------------------===//
- // Instruction encodings. These are the standard/most common forms for X86
- // instructions.
- //
-
- // PseudoFrm - This represents an instruction that is a pseudo instruction
- // or one that has not been implemented yet. It is illegal to code generate
- // it, but tolerated for intermediate implementation stages.
- Pseudo = 0,
-
- /// Raw - This form is for instructions that don't have any operands, so
- /// they are just a fixed opcode value, like 'leave'.
- RawFrm = 1,
-
- /// AddRegFrm - This form is used for instructions like 'push r32' that have
- /// their one register operand added to their opcode.
- AddRegFrm = 2,
-
- /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
- /// to specify a destination, which in this case is a register.
- ///
- MRMDestReg = 3,
-
- /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
- /// to specify a destination, which in this case is memory.
- ///
- MRMDestMem = 4,
-
- /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
- /// to specify a source, which in this case is a register.
- ///
- MRMSrcReg = 5,
-
- /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
- /// to specify a source, which in this case is memory.
- ///
- MRMSrcMem = 6,
-
- /// MRM[0-7][rm] - These forms are used to represent instructions that use
- /// a Mod/RM byte, and use the middle field to hold extended opcode
- /// information. In the intel manual these are represented as /0, /1, ...
- /// information. In the Intel manual these are represented as /0, /1, ...
-
- // First, instructions that operate on a register r/m operand...
- MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
- MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
-
- // Next, instructions that operate on a memory r/m operand...
- MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
- MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
-
- // MRMInitReg - This form is used for instructions whose source and
- // destinations are the same register.
- MRMInitReg = 32,
-
- // MRM_C1 - A mod/rm byte of exactly 0xC1.
- MRM_C1 = 33,
- MRM_C2 = 34,
- MRM_C3 = 35,
- MRM_C4 = 36,
- MRM_C8 = 37,
- MRM_C9 = 38,
- MRM_E8 = 39,
- MRM_F0 = 40,
- MRM_F8 = 41,
- MRM_F9 = 42,
- MRM_D0 = 45,
- MRM_D1 = 46,
-
- /// RawFrmImm8 - This is used for the ENTER instruction, which has two
- /// immediates, the first of which is a 16-bit immediate (specified by
- /// the imm encoding) and the second is a 8-bit fixed value.
- /// the imm encoding) and the second is an 8-bit fixed value.
-
- /// RawFrmImm16 - This is used for CALL FAR instructions, which have two
- /// immediates, the first of which is a 16 or 32-bit immediate (specified by
- /// the imm encoding) and the second is a 16-bit fixed value. In the AMD
- /// manual, this operand is described as pntr16:32 and pntr16:16
- RawFrmImm16 = 44,
-
- FormMask = 63,
-
- //===------------------------------------------------------------------===//
- // Actual flags...
-
- // OpSize - Set if this instruction requires an operand size prefix (0x66),
- // which most often indicates that the instruction operates on 16 bit data
- // instead of 32 bit data.
- OpSize = 1 << 6,
-
- // AdSize - Set if this instruction requires an address-size prefix (0x67),
- // which most often indicates that the instruction uses 16-bit addresses
- // instead of 32-bit addresses (or 32-bit addresses in 64-bit mode).
- AdSize = 1 << 7,
-
- //===------------------------------------------------------------------===//
- // Op0Mask - There are several prefix bytes that are used to form two byte
- // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
- // used to obtain the setting of this field. If no bits in this field are
- // set, there is no prefix byte for obtaining a multibyte opcode.
- //
- Op0Shift = 8,
- Op0Mask = 0x1F << Op0Shift,
-
- // TB - TwoByte - Set if this instruction has a two byte opcode, which
- // starts with a 0x0F byte before the real opcode.
- TB = 1 << Op0Shift,
-
- // REP - The 0xF3 prefix byte indicating repetition of the following
- // instruction.
- REP = 2 << Op0Shift,
-
- // D8-DF - These escape opcodes are used by the floating point unit. These
- // values must remain sequential.
- D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
- DA = 5 << Op0Shift, DB = 6 << Op0Shift,
- DC = 7 << Op0Shift, DD = 8 << Op0Shift,
- DE = 9 << Op0Shift, DF = 10 << Op0Shift,
-
- // XS, XD - These prefix codes are for single and double precision scalar
- // floating point operations performed in the SSE registers.
- XD = 11 << Op0Shift, XS = 12 << Op0Shift,
-
- // T8, TA, A6, A7 - Prefix after the 0x0F prefix.
- T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
- A6 = 15 << Op0Shift, A7 = 16 << Op0Shift,
-
- // TF - Prefix before and after 0x0F
- TF = 17 << Op0Shift,
-
- //===------------------------------------------------------------------===//
- // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
- // They are used to specify GPRs and SSE registers, 64-bit operand size,
- // etc. We only care about REX.W and REX.R bits and only the former is
- // statically determined.
- //
- REXShift = Op0Shift + 5,
- REX_W = 1 << REXShift,
-
- //===------------------------------------------------------------------===//
- // This three-bit field describes the size of an immediate operand. Zero is
- // unused so that we can tell if we forgot to set a value.
- ImmShift = REXShift + 1,
- ImmMask = 7 << ImmShift,
- Imm8 = 1 << ImmShift,
- Imm8PCRel = 2 << ImmShift,
- Imm16 = 3 << ImmShift,
- Imm16PCRel = 4 << ImmShift,
- Imm32 = 5 << ImmShift,
- Imm32PCRel = 6 << ImmShift,
- Imm64 = 7 << ImmShift,
-
- //===------------------------------------------------------------------===//
- // FP Instruction Classification... Zero means a non-FP instruction.
-
- // FPTypeMask - Mask for all of the FP types...
- FPTypeShift = ImmShift + 3,
- FPTypeMask = 7 << FPTypeShift,
-
- // NotFP - The default, set for instructions that do not use FP registers.
- NotFP = 0 << FPTypeShift,
-
- // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), e.g. fld0
- ZeroArgFP = 1 << FPTypeShift,
-
- // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
- OneArgFP = 2 << FPTypeShift,
-
- // OneArgFPRW - 1 arg FP instruction which implicitly reads ST(0) and writes a
- // result back to ST(0). For example, fcos, fsqrt, etc.
- //
- OneArgFPRW = 3 << FPTypeShift,
-
- // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
- // explicit argument, storing the result to either ST(0) or the implicit
- // argument. For example: fadd, fsub, fmul, etc...
- TwoArgFP = 4 << FPTypeShift,
-
- // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
- // explicit argument, but have no destination. Example: fucom, fucomi, ...
- CompareFP = 5 << FPTypeShift,
-
- // CondMovFP - "2 operand" floating point conditional move instructions.
- CondMovFP = 6 << FPTypeShift,
-
- // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
- SpecialFP = 7 << FPTypeShift,
-
- // Lock prefix
- LOCKShift = FPTypeShift + 3,
- LOCK = 1 << LOCKShift,
-
- // Segment override prefixes. Currently we just need the ability to address
- // stuff in gs and fs segments.
- SegOvrShift = LOCKShift + 1,
- SegOvrMask = 3 << SegOvrShift,
- FS = 1 << SegOvrShift,
- GS = 2 << SegOvrShift,
-
- // Execution domain for SSE instructions in bits 23, 24.
- // 0 in bits 23-24 means normal, non-SSE instruction.
- SSEDomainShift = SegOvrShift + 2,
-
- OpcodeShift = SSEDomainShift + 2,
-
- //===------------------------------------------------------------------===//
- /// VEX - The opcode prefix used by AVX instructions
- VEXShift = OpcodeShift + 8,
- VEX = 1U << 0,
-
- /// VEX_W - Has opcode-specific functionality, but is used in the same
- /// way as REX_W is for regular SSE instructions.
- VEX_W = 1U << 1,
-
- /// VEX_4V - Used to specify an additional AVX/SSE register. Several
- /// 2-address instructions in SSE are represented as 3-address ones in AVX,
- /// and the additional register is encoded in the VEX_VVVV prefix.
- VEX_4V = 1U << 2,
-
- /// VEX_I8IMM - Specifies that the last register used in an AVX instruction
- /// must be encoded in the i8 immediate field. This usually happens in
- /// instructions with 4 operands.
- VEX_I8IMM = 1U << 3,
-
- /// VEX_L - Stands for a bit in the VEX opcode prefix meaning the current
- /// instruction uses 256-bit wide registers. This is usually auto-detected
- /// when a VR256 register is used, but some AVX instructions also have this
- /// field marked when using an f256 memory reference.
- VEX_L = 1U << 4,
-
- /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
- /// wacky 0x0F 0x0F prefix for 3DNow! instructions. The manual documents
- /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
- /// storing a classifier in the imm8 field. To simplify our implementation,
- /// we handle this by storing the classifier in the opcode field and using
- /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
- Has3DNow0F0FOpcode = 1U << 5
- };
-
- // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
- // specified machine instruction.
- //
- static inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
- return TSFlags >> X86II::OpcodeShift;
- }
-
- static inline bool hasImm(uint64_t TSFlags) {
- return (TSFlags & X86II::ImmMask) != 0;
- }
-
- /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
- /// of the specified instruction.
- static inline unsigned getSizeOfImm(uint64_t TSFlags) {
- switch (TSFlags & X86II::ImmMask) {
- default: assert(0 && "Unknown immediate size");
- case X86II::Imm8:
- case X86II::Imm8PCRel: return 1;
- case X86II::Imm16:
- case X86II::Imm16PCRel: return 2;
- case X86II::Imm32:
- case X86II::Imm32PCRel: return 4;
- case X86II::Imm64: return 8;
- }
- }
-
- /// isImmPCRel - Return true if the immediate of the specified instruction's
- /// TSFlags indicates that it is pc relative.
- static inline unsigned isImmPCRel(uint64_t TSFlags) {
- switch (TSFlags & X86II::ImmMask) {
- default: assert(0 && "Unknown immediate size");
- case X86II::Imm8PCRel:
- case X86II::Imm16PCRel:
- case X86II::Imm32PCRel:
- return true;
- case X86II::Imm8:
- case X86II::Imm16:
- case X86II::Imm32:
- case X86II::Imm64:
- return false;
- }
- }
-
- /// getMemoryOperandNo - The function returns the MCInst operand # for the
- /// first field of the memory operand. If the instruction doesn't have a
- /// memory operand, this returns -1.
- ///
- /// Note that this ignores tied operands. If there is a tied register which
- /// is duplicated in the MCInst (e.g. "EAX = addl EAX, [mem]") it is only
- /// counted as one operand.
- ///
- static inline int getMemoryOperandNo(uint64_t TSFlags) {
- switch (TSFlags & X86II::FormMask) {
- case X86II::MRMInitReg: assert(0 && "FIXME: Remove this form");
- default: assert(0 && "Unknown FormMask value in getMemoryOperandNo!");
- case X86II::Pseudo:
- case X86II::RawFrm:
- case X86II::AddRegFrm:
- case X86II::MRMDestReg:
- case X86II::MRMSrcReg:
- case X86II::RawFrmImm8:
- case X86II::RawFrmImm16:
- return -1;
- case X86II::MRMDestMem:
- return 0;
- case X86II::MRMSrcMem: {
- bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
- unsigned FirstMemOp = 1;
- if (HasVEX_4V)
- ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
-
- // FIXME: Maybe lea should have its own form? This is a horrible hack.
- //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
- // Opcode == X86::LEA16r || Opcode == X86::LEA32r)
- return FirstMemOp;
- }
- case X86II::MRM0r: case X86II::MRM1r:
- case X86II::MRM2r: case X86II::MRM3r:
- case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r:
- return -1;
- case X86II::MRM0m: case X86II::MRM1m:
- case X86II::MRM2m: case X86II::MRM3m:
- case X86II::MRM4m: case X86II::MRM5m:
- case X86II::MRM6m: case X86II::MRM7m:
- return 0;
- case X86II::MRM_C1:
- case X86II::MRM_C2:
- case X86II::MRM_C3:
- case X86II::MRM_C4:
- case X86II::MRM_C8:
- case X86II::MRM_C9:
- case X86II::MRM_E8:
- case X86II::MRM_F0:
- case X86II::MRM_F8:
- case X86II::MRM_F9:
- case X86II::MRM_D0:
- case X86II::MRM_D1:
- return -1;
- }
- }
-}
-
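The enum being removed above packs independent fields (prefix byte, REX.W, immediate size, FP type, lock, segment override, SSE domain, base opcode) into one 64-bit TSFlags word, and the helpers decode it by shift-and-mask. A standalone C++ sketch of that scheme, with the shift constants mirroring the enum and a made-up packed value for illustration (this is not the library header, just the idea):

// Sketch of the TSFlags packing described above. Shift/mask constants mirror
// the enum being removed; the packed value in main() is invented.
#include <cassert>
#include <cstdint>

namespace X86IISketch {
enum : uint64_t {
  Op0Shift    = 8,                  // 5-bit prefix-byte field
  REXShift    = Op0Shift + 5,       // REX.W flag
  ImmShift    = REXShift + 1,       // 3-bit immediate-size field
  ImmMask     = 7ULL << ImmShift,
  Imm8        = 1ULL << ImmShift,
  Imm32       = 5ULL << ImmShift,
  FPTypeShift = ImmShift + 3,       // 3-bit FP classification
  LOCKShift   = FPTypeShift + 3,
  SegOvrShift = LOCKShift + 1,      // 2-bit segment override
  SSEDomainShift = SegOvrShift + 2, // 2-bit SSE execution domain
  OpcodeShift = SSEDomainShift + 2  // base opcode byte sits on top
};

inline unsigned char getBaseOpcodeFor(uint64_t TSFlags) {
  return (unsigned char)(TSFlags >> OpcodeShift);
}

inline unsigned getSizeOfImm(uint64_t TSFlags) {
  switch (TSFlags & ImmMask) {
  case Imm8:  return 1;
  case Imm32: return 4;
  default:    return 0; // remaining sizes elided in this sketch
  }
}
} // namespace X86IISketch

int main() {
  // Pack a fictitious instruction: base opcode 0x81 with a 32-bit immediate.
  uint64_t TSFlags =
      (0x81ULL << X86IISketch::OpcodeShift) | X86IISketch::Imm32;
  assert(X86IISketch::getBaseOpcodeFor(TSFlags) == 0x81);
  assert(X86IISketch::getSizeOfImm(TSFlags) == 4);
  return 0;
}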
inline static bool isScale(const MachineOperand &MO) {
return MO.isImm() &&
(MO.getImm() == 1 || MO.getImm() == 2 ||
@@ -621,14 +131,22 @@ class X86InstrInfo : public X86GenInstrInfo {
/// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
/// RegOp2MemOpTable2 - Load / store folding opcode maps.
///
- DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable2Addr;
- DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable0;
- DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable1;
- DenseMap<unsigned, std::pair<unsigned,unsigned> > RegOp2MemOpTable2;
+ typedef DenseMap<unsigned,
+ std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
+ RegOp2MemOpTableType RegOp2MemOpTable2Addr;
+ RegOp2MemOpTableType RegOp2MemOpTable0;
+ RegOp2MemOpTableType RegOp2MemOpTable1;
+ RegOp2MemOpTableType RegOp2MemOpTable2;
/// MemOp2RegOpTable - Load / store unfolding opcode map.
///
- DenseMap<unsigned, std::pair<unsigned, unsigned> > MemOp2RegOpTable;
+ typedef DenseMap<unsigned,
+ std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
+ MemOp2RegOpTableType MemOp2RegOpTable;
+
+ void AddTableEntry(RegOp2MemOpTableType &R2MTable,
+ MemOp2RegOpTableType &M2RTable,
+ unsigned RegOp, unsigned MemOp, unsigned Flags);
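The two map types above form a bidirectional opcode table: folding looks up a register-form opcode to find its memory form, unfolding goes the other way, and the new AddTableEntry keeps both sides in sync. A minimal sketch of that pairing, using std::unordered_map in place of llvm::DenseMap so it stands alone; the opcode numbers and flags are invented:

// Sketch of the paired fold/unfold tables declared above; std::unordered_map
// stands in for llvm::DenseMap, and all numeric values are made up.
#include <cassert>
#include <unordered_map>
#include <utility>

typedef std::unordered_map<unsigned, std::pair<unsigned, unsigned> > OpTable;

// One call registers both directions, mirroring the intent of AddTableEntry.
static void AddTableEntry(OpTable &RegOp2MemOp, OpTable &MemOp2RegOp,
                          unsigned RegOp, unsigned MemOp, unsigned Flags) {
  RegOp2MemOp[RegOp] = std::make_pair(MemOp, Flags);
  MemOp2RegOp[MemOp] = std::make_pair(RegOp, Flags);
}

int main() {
  OpTable RegOp2MemOp, MemOp2RegOp;
  const unsigned ADD32rr = 100, ADD32rm = 101, Flags = 0; // fictitious
  AddTableEntry(RegOp2MemOp, MemOp2RegOp, ADD32rr, ADD32rm, Flags);
  assert(RegOp2MemOp.at(ADD32rr).first == ADD32rm); // fold: reg -> mem form
  assert(MemOp2RegOp.at(ADD32rm).first == ADD32rr); // unfold: mem -> reg form
  return 0;
}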
public:
explicit X86InstrInfo(X86TargetMachine &tm);
@@ -656,17 +174,6 @@ public:
unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
int &FrameIndex) const;
- /// hasLoadFromStackSlot - If the specified machine instruction has
- /// a load from a stack slot, return true along with the FrameIndex
- /// of the loaded stack slot and the machine mem operand containing
- /// the reference. If not, return false. Unlike
- /// isLoadFromStackSlot, this returns true for any instruction that
- /// loads from the stack. This is a hint only and may not catch all
- /// cases.
- bool hasLoadFromStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const;
-
unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
@@ -674,16 +181,6 @@ public:
unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
int &FrameIndex) const;
- /// hasStoreToStackSlot - If the specified machine instruction has a
- /// store to a stack slot, return true along with the FrameIndex of
- /// the stored stack slot and the machine mem operand containing the
- /// reference. If not, return false. Unlike isStoreToStackSlot,
- /// this returns true for any instruction that stores to the
- /// stack. This is a hint only and may not catch all cases.
- bool hasStoreToStackSlot(const MachineInstr *MI,
- const MachineMemOperand *&MMO,
- int &FrameIndex) const;
-
bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
AliasAnalysis *AA) const;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
@@ -750,6 +247,9 @@ public:
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+ virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
virtual
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
int FrameIx, uint64_t Offset,
@@ -829,32 +329,21 @@ public:
/// instruction that defines the specified register class.
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
- static bool isX86_64NonExtLowByteReg(unsigned reg) {
- return (reg == X86::SPL || reg == X86::BPL ||
- reg == X86::SIL || reg == X86::DIL);
- }
-
static bool isX86_64ExtendedReg(const MachineOperand &MO) {
if (!MO.isReg()) return false;
- return isX86_64ExtendedReg(MO.getReg());
+ return X86II::isX86_64ExtendedReg(MO.getReg());
}
- /// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended (r8 or
- /// higher) register? e.g. r8, xmm8, xmm13, etc.
- static bool isX86_64ExtendedReg(unsigned RegNo);
-
/// getGlobalBaseReg - Return a virtual register initialized with the
/// global base register value. Output instructions required to
/// initialize the register in the function entry block, if necessary.
///
unsigned getGlobalBaseReg(MachineFunction *MF) const;
- /// GetSSEDomain - Return the SSE execution domain of MI as the first element,
- /// and a bitmask of possible arguments to SetSSEDomain as the second.
- std::pair<uint16_t, uint16_t> GetSSEDomain(const MachineInstr *MI) const;
+ std::pair<uint16_t, uint16_t>
+ getExecutionDomain(const MachineInstr *MI) const;
- /// SetSSEDomain - Set the SSEDomain of MI.
- void SetSSEDomain(MachineInstr *MI, unsigned Domain) const;
+ void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 7eb07b0..d54bf27 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -65,7 +65,7 @@ def SDTX86SetCC_C : SDTypeProfile<1, 2,
def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>,
SDTCisVT<2, i8>]>;
-def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def SDTX86caspair : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
@@ -97,6 +97,8 @@ def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
@@ -133,9 +135,13 @@ def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
-def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8,
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86caspair,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
+ SDNPMayLoad, SDNPMemOperand]>;
+def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
+
def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
[SDNPHasChain, SDNPMayStore,
SDNPMayLoad, SDNPMemOperand]>;
@@ -218,12 +224,16 @@ def X86xor_flag : SDNode<"X86ISD::XOR", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
+def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDTX86Void,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
+ [SDNPHasChain]>;
+
def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
@@ -331,6 +341,11 @@ class ImmSExtAsmOperandClass : AsmOperandClass {
let RenderMethod = "addImmOperands";
}
+class ImmZExtAsmOperandClass : AsmOperandClass {
+ let SuperClasses = [ImmAsmOperand];
+ let RenderMethod = "addImmOperands";
+}
+
// Sign-extended immediate classes. We don't need to define the full lattice
// here because there is no instruction with an ambiguity between ImmSExti64i32
// and ImmSExti32i8.
@@ -358,6 +373,12 @@ def ImmSExti32i8AsmOperand : ImmSExtAsmOperandClass {
let Name = "ImmSExti32i8";
}
+// [0, 0x000000FF]
+def ImmZExtu32u8AsmOperand : ImmZExtAsmOperandClass {
+ let Name = "ImmZExtu32u8";
+}
+
+
// [0, 0x0000007F] |
// [0xFFFFFFFFFFFFFF80, 0xFFFFFFFFFFFFFFFF]
def ImmSExti64i8AsmOperand : ImmSExtAsmOperandClass {
@@ -377,6 +398,11 @@ def i32i8imm : Operand<i32> {
let ParserMatchClass = ImmSExti32i8AsmOperand;
let OperandType = "OPERAND_IMMEDIATE";
}
+// 32-bits but only 8 bits are significant, and those 8 bits are unsigned.
+def u32u8imm : Operand<i32> {
+ let ParserMatchClass = ImmZExtu32u8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
+}
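The difference between the sign-extended i32i8imm and the new zero-extended u32u8imm is just a range check on which immediates the assembler accepts. A hedged sketch of the two predicates (the real matching lives in the X86 assembly parser, not in helpers like these):

// Illustrative range checks behind the operand classes above, assuming the
// immediate is held as a signed 64-bit value.
#include <cstdint>

static bool isImmSExti32i8(int64_t V) {
  // [0, 0x7F] or [0xFFFFFF80, 0xFFFFFFFF] as 32-bit values, i.e. the value
  // survives a sign-extend-from-8-bits round trip.
  return V >= -128 && V <= 127;
}

static bool isImmZExtu32u8(int64_t V) {
  return V >= 0 && V <= 0xFF; // [0, 0x000000FF]
}

int main() {
  return (isImmSExti32i8(-1) && !isImmZExtu32u8(-1) && isImmZExtu32u8(0xFF))
             ? 0 : 1;
}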
// 64-bits but only 32 bits are significant.
def i64i32imm : Operand<i64> {
@@ -389,11 +415,13 @@ def i64i32imm : Operand<i64> {
def i64i32imm_pcrel : Operand<i64> {
let PrintMethod = "print_pcrel_imm";
let ParserMatchClass = X86AbsMemAsmOperand;
+ let OperandType = "OPERAND_PCREL";
}
// 64-bits but only 8 bits are significant.
def i64i8imm : Operand<i64> {
let ParserMatchClass = ImmSExti64i8AsmOperand;
+ let OperandType = "OPERAND_IMMEDIATE";
}
def lea64_32mem : Operand<i32> {
@@ -442,18 +470,33 @@ def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasXMMInt : Predicate<"Subtarget->hasXMMInt()">;
+def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
def HasCLMUL : Predicate<"Subtarget->hasCLMUL()">;
def HasFMA3 : Predicate<"Subtarget->hasFMA3()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
+def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
+def HasF16C : Predicate<"Subtarget->hasF16C()">;
+def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
+def HasBMI : Predicate<"Subtarget->hasBMI()">;
def FPStackf32 : Predicate<"!Subtarget->hasXMM()">;
def FPStackf64 : Predicate<"!Subtarget->hasXMMInt()">;
+def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def In32BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
AssemblerPredicate<"Mode64Bit">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
+def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">,
+ AssemblerPredicate<"ModeNaCl">;
+def IsNaCl32 : Predicate<"Subtarget->isTargetNaCl32()">,
+ AssemblerPredicate<"ModeNaCl,!Mode64Bit">;
+def IsNaCl64 : Predicate<"Subtarget->isTargetNaCl64()">,
+ AssemblerPredicate<"ModeNaCl,Mode64Bit">;
+def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">,
+ AssemblerPredicate<"!ModeNaCl">;
def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
@@ -766,30 +809,30 @@ def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
-def MOVSB : I<0xA4, RawFrm, (outs), (ins), "{movsb}", []>;
-def MOVSW : I<0xA5, RawFrm, (outs), (ins), "{movsw}", []>, OpSize;
-def MOVSD : I<0xA5, RawFrm, (outs), (ins), "{movsl|movsd}", []>;
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "movsb", []>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "movsw", []>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "movs{l|d}", []>;
def MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "movsq", []>;
}
// These use the DF flag in the EFLAGS register to inc or dec EDI and ESI
let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
-def STOSB : I<0xAA, RawFrm, (outs), (ins), "{stosb}", []>;
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "stosb", []>;
let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
-def STOSW : I<0xAB, RawFrm, (outs), (ins), "{stosw}", []>, OpSize;
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "stosw", []>, OpSize;
let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
-def STOSD : I<0xAB, RawFrm, (outs), (ins), "{stosl|stosd}", []>;
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "stos{l|d}", []>;
let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI,EFLAGS] in
def STOSQ : RI<0xAB, RawFrm, (outs), (ins), "stosq", []>;
-def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scas{b}", []>;
-def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scas{w}", []>, OpSize;
-def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l}", []>;
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scasb", []>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scasw", []>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l|d}", []>;
def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scasq", []>;
-def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmps{b}", []>;
-def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmps{w}", []>, OpSize;
-def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l}", []>;
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmpsb", []>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmpsw", []>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l|d}", []>;
def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmpsq", []>;
@@ -841,22 +884,22 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
/// moffs8, moffs16 and moffs32 versions of moves. The immediate is a
/// 32-bit offset from the PC. These are only valid in x86-32 mode.
def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{b}\t{$src, %al|%al, $src}", []>,
+ "mov{b}\t{$src, %al|AL, $src}", []>,
Requires<[In32BitMode]>;
def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
- "mov{w}\t{$src, %ax|%ax, $src}", []>, OpSize,
+ "mov{w}\t{$src, %ax|AL, $src}", []>, OpSize,
Requires<[In32BitMode]>;
def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
- "mov{l}\t{$src, %eax|%eax, $src}", []>,
+ "mov{l}\t{$src, %eax|EAX, $src}", []>,
Requires<[In32BitMode]>;
def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{b}\t{%al, $dst|$dst, %al}", []>,
+ "mov{b}\t{%al, $dst|$dst, AL}", []>,
Requires<[In32BitMode]>;
def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
- "mov{w}\t{%ax, $dst|$dst, %ax}", []>, OpSize,
+ "mov{w}\t{%ax, $dst|$dst, AL}", []>, OpSize,
Requires<[In32BitMode]>;
def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
- "mov{l}\t{%eax, $dst|$dst, %eax}", []>,
+ "mov{l}\t{%eax, $dst|$dst, EAX}", []>,
Requires<[In32BitMode]>;
// FIXME: These definitions are utterly broken
@@ -865,13 +908,13 @@ def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
// in question.
/*
def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
- "mov{q}\t{$src, %rax|%rax, $src}", []>;
+ "mov{q}\t{$src, %rax|RAX, $src}", []>;
def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
- "mov{q}\t{$src, %rax|%rax, $src}", []>;
+ "mov{q}\t{$src, %rax|RAX, $src}", []>;
def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
- "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+ "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
- "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+ "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
*/
@@ -926,7 +969,7 @@ let mayStore = 1 in
def MOV8mr_NOREX : I<0x88, MRMDestMem,
(outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
"mov{b}\t{$src, $dst|$dst, $src} # NOREX", []>;
-let mayLoad = 1,
+let mayLoad = 1, neverHasSideEffects = 1,
canFoldAsLoad = 1, isReMaterializable = 1 in
def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
(outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
@@ -1117,11 +1160,15 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
}
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
- "xchg{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+ "xchg{w}\t{$src, %ax|AX, $src}", []>, OpSize;
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
- "xchg{l}\t{$src, %eax|%eax, $src}", []>;
+ "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In32BitMode]>;
+// Uses GR32_NOAX in 64-bit mode so this is never encoded as the 0x90 NOP.
+// xchg %eax, %eax must clear the upper 32 bits of RAX, so it is not a NOP.
+def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
+ "xchg{l}\t{$src, %eax|EAX, $src}", []>, Requires<[In64BitMode]>;
def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
- "xchg{q}\t{$src, %rax|%rax, $src}", []>;
+ "xchg{q}\t{$src, %rax|RAX, $src}", []>;
@@ -1172,7 +1219,7 @@ def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
- "cmpxchg16b\t$dst", []>, TB;
+ "cmpxchg16b\t$dst", []>, TB, Requires<[HasCmpxchg16b]>;
@@ -1261,6 +1308,104 @@ def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst),
"arpl\t{$src, $dst|$dst, $src}", []>, Requires<[In32BitMode]>;
//===----------------------------------------------------------------------===//
+// MOVBE Instructions
+//
+let Predicates = [HasMOVBE] in {
+ def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, OpSize, T8;
+ def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, T8;
+ def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, T8;
+ def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+ "movbe{w}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR16:$src), addr:$dst)]>, OpSize, T8;
+ def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "movbe{l}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR32:$src), addr:$dst)]>, T8;
+ def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "movbe{q}\t{$src, $dst|$dst, $src}",
+ [(store (bswap GR64:$src), addr:$dst)]>, T8;
+}
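The MOVBE patterns above match a load or store fused with a bswap. A portable C++ equivalent of what they select on (__builtin_bswap32 is a GCC/Clang builtin; whether the compiler actually emits movbe for this depends on -mmovbe and is not guaranteed):

// The operation MOVBE implements: a byte-swapping load/store. Illustrative.
#include <cstdint>
#include <cstring>

static uint32_t load_be32(const void *P) {
  uint32_t V;
  std::memcpy(&V, P, sizeof V); // the (loadi32 addr:$src) half of the pattern
  return __builtin_bswap32(V);  // the bswap half
}

static void store_be32(void *P, uint32_t V) {
  V = __builtin_bswap32(V);     // bswap, then store
  std::memcpy(P, &V, sizeof V);
}

int main() {
  unsigned char Buf[4];
  store_be32(Buf, 0x12345678u);
  return (Buf[0] == 0x12 && load_be32(Buf) == 0x12345678u) ? 0 : 1;
}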
+
+//===----------------------------------------------------------------------===//
+// RDRAND Instruction
+//
+let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+ def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
+ "rdrand{w}\t$dst", []>, OpSize, TB;
+ def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
+ "rdrand{l}\t$dst", []>, TB;
+ def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
+ "rdrand{q}\t$dst", []>, TB;
+}
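Defs = [EFLAGS] on the RDRAND definitions is not incidental: the instruction reports success in the carry flag, and a clear CF means the caller must retry. The intrinsic spelling makes this visible (needs a compiler targeting -mrdrnd; sketch only):

// RDRAND's EFLAGS def is its success signal: _rdrand32_step returns the
// carry flag. A bounded retry loop is the usual consumption pattern.
#include <immintrin.h>

static int get_random_u32(unsigned *Out) {
  for (int Tries = 0; Tries < 10; ++Tries)
    if (_rdrand32_step(Out)) // 1 = valid value, 0 = underflow, retry
      return 1;
  return 0;                  // hardware persistently failed
}

int main() {
  unsigned V;
  return get_random_u32(&V) ? 0 : 1;
}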
+
+//===----------------------------------------------------------------------===//
+// LZCNT Instruction
+//
+let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
+ def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize;
+ def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "lzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctlz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize;
+
+ def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS;
+ def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "lzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctlz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+
+ def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "lzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctlz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
+
+//===----------------------------------------------------------------------===//
+// TZCNT Instruction
+//
+let Predicates = [HasBMI], Defs = [EFLAGS] in {
+ def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
+ OpSize;
+ def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "tzcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (cttz (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, XS, OpSize;
+
+ def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS;
+ def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "tzcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (cttz (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+
+ def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "tzcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (cttz (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
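Both blocks above select on the generic ctlz/cttz nodes. The GCC/Clang builtins compute the same thing, with one caveat the hardware versions don't share: __builtin_clz/__builtin_ctz are undefined at zero, whereas lzcnt/tzcnt return the operand width, so a faithful sketch has to special-case zero:

// What the ctlz/cttz patterns above compute, in builtin form.
#include <cstdint>

static unsigned lzcnt32(uint32_t X) {
  return X ? (unsigned)__builtin_clz(X) : 32; // lzcnt of 0 is the bit width
}
static unsigned tzcnt32(uint32_t X) {
  return X ? (unsigned)__builtin_ctz(X) : 32;
}

int main() {
  return (lzcnt32(1u << 20) == 11 && tzcnt32(1u << 20) == 20 &&
          lzcnt32(0) == 32) ? 0 : 1;
}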
+
+//===----------------------------------------------------------------------===//
// Subsystems.
//===----------------------------------------------------------------------===//
@@ -1646,3 +1791,9 @@ def : InstAlias<"xchgb $mem, $val", (XCHG8rm GR8 :$val, i8mem :$mem)>;
def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>;
def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>;
def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>;
+
+// xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
+def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
+def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index fe11d77..d3ced23 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -116,7 +116,217 @@ multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Move Instructions
+// Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+// A 128-bit subvector extract from the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
+ (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
+ (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
+
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
+ (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
+ (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
+
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
+ (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
+ (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
+
+// A 128-bit subvector insert to the first 256-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+ (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+ (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+ (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+
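These extract/insert patterns encode the fact that the low 128 bits of a YMM register simply are the corresponding XMM register, so crossing between the two is a subregister copy rather than an instruction. The AVX cast intrinsics are the source-level counterpart and likewise compile to no code (requires -mavx; purely illustrative):

// Source-level view of the subvector patterns above.
#include <immintrin.h>

static __m128 LowHalf(__m256 V) {
  return _mm256_castps256_ps128(V); // extract_subvector ..., (i32 0)
}

static __m256 Widen(__m128 V) {
  // insert_subvector undef, ..., (i32 0); the upper 128 bits stay undefined.
  return _mm256_castps128_ps256(V);
}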
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
+def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
+ (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
+def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
+ (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+let Predicates = [HasXMMInt] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
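Likewise for the bitconvert patterns just above: a bitcast between 128-bit vector types never moves data, it only changes the type the register is viewed at. The SSE2 cast intrinsics express exactly this and cost no instructions (illustrative):

// Zero-cost reinterpretation, matching the bitconvert patterns above.
#include <emmintrin.h>

static __m128i AsInt(__m128 V)    { return _mm_castps_si128(V); }
static __m128  AsFloat(__m128i V) { return _mm_castsi128_ps(V); }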
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
+ def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
+}
+
+// Alias instructions that map fld0 to pxor for sse.
+// FIXME: Set encoding to pseudo!
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+ canFoldAsLoad = 1 in {
+ def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
+ [(set FR32:$dst, fp32imm0)]>,
+ Requires<[HasSSE1]>, TB, OpSize;
+ def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
+ [(set FR64:$dst, fpimm0)]>,
+ Requires<[HasSSE2]>, TB, OpSize;
+ def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
+ [(set FR32:$dst, fp32imm0)]>,
+ Requires<[HasAVX]>, TB, OpSize, VEX_4V;
+ def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
+ [(set FR64:$dst, fpimm0)]>,
+ Requires<[HasAVX]>, TB, OpSize, VEX_4V;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX & SSE - Zero/One Vectors
+//===----------------------------------------------------------------------===//
+
+// Alias instruction that maps zero vector to pxor / xorp* for sse.
+// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
+// swizzled by ExecutionDepsFix to pxor.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isPseudo = 1, neverHasSideEffects = 1 in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
+}
+
+def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
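V_SET0 is the pseudo behind the zero-vector idiom: as the comment above says, it is expanded after register allocation to an xor of a register with itself, which x86 cores recognize as dependency-breaking. The intrinsic-level spelling of the same idiom (that it lowers to xorps/vxorps is typical codegen, not a contract):

// The zero idiom V_SET0 models; typically lowers to xorps %xmmN, %xmmN.
#include <xmmintrin.h>

static __m128 ZeroVec(void) {
  return _mm_setzero_ps();
}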
+
+
+// The same as above, but for AVX. The 256-bit ISA does not support packed
+// integer operations, and doesn't need them here because on Sandy Bridge the
+// register is set to zero at the rename stage without using any execution
+// unit, so SET0PSY and SET0PDY can be used for vector integer instructions
+// without penalty.
+// FIXME: Change encoding to pseudo! This is blocked right now by the x86
+// JIT implementation, which does not expand the instructions below like
+// X86MCInstLower does.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isCodeGenOnly = 1, Predicates = [HasAVX] in {
+def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
+def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
+}
+
+
+// AVX has no support for 256-bit integer instructions, but since the 128-bit
+// VPXOR instruction writes zero to its upper part, it's safe to build zeros
+// this way.
+def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
+
+def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
+ (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+// FIXME: Change encoding to pseudo! This is blocked right now by the x86
+// JIT implementation, which does not expand the instructions below like
+// X86MCInstLower does.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
+ def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+ isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in
+ def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
+
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move FP Scalar Instructions
+//
+// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
+// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
+// is used instead. Register-to-register movss/movsd is not modeled as an
+// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
+// in terms of a copy, and as just mentioned, we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//
class sse12_move_rr<RegisterClass RC, ValueType vt, string asm> :
@@ -130,28 +340,57 @@ class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))]>;
-// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
-// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
-// is used instead. Register-to-register movss/movsd is not modeled as an
-// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
-// in terms of a copy, and as just mentioned, we don't use movss/movsd for copies.
+// AVX
def VMOVSSrr : sse12_move_rr<FR32, v4f32,
- "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V;
+ "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
+ VEX_LIG;
def VMOVSDrr : sse12_move_rr<FR64, v2f64,
- "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V;
+ "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
+ VEX_LIG;
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
- def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX;
+// For the disassembler
+let isCodeGenOnly = 1 in {
+ def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, FR32:$src2),
+ "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XS, VEX_4V, VEX_LIG;
+ def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, FR64:$src2),
+ "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ XD, VEX_4V, VEX_LIG;
+}
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+ def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX,
+ VEX_LIG;
let AddedComplexity = 20 in
- def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX;
+ def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX,
+ VEX_LIG;
}
+def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+ "movss\t{$src, $dst|$dst, $src}",
+ [(store FR32:$src, addr:$dst)]>, XS, VEX, VEX_LIG;
+def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+ "movsd\t{$src, $dst|$dst, $src}",
+ [(store FR64:$src, addr:$dst)]>, XD, VEX, VEX_LIG;
+
+// SSE1 & 2
let Constraints = "$src1 = $dst" in {
def MOVSSrr : sse12_move_rr<FR32, v4f32,
"movss\t{$src2, $dst|$dst, $src2}">, XS;
def MOVSDrr : sse12_move_rr<FR64, v2f64,
"movsd\t{$src2, $dst|$dst, $src2}">, XD;
+
+ // For the disassembler
+ let isCodeGenOnly = 1 in {
+ def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, FR32:$src2),
+ "movss\t{$src2, $dst|$dst, $src2}", []>, XS;
+ def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, FR64:$src2),
+ "movsd\t{$src2, $dst|$dst, $src2}", []>, XD;
+ }
}
let canFoldAsLoad = 1, isReMaterializable = 1 in {
@@ -161,54 +400,6 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}
-let AddedComplexity = 15 in {
-// Extract the low 32-bit value from one vector and insert it into another.
-def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-// Extract the low 64-bit value from one vector and insert it into another.
-def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
-}
-
-// Implicitly promote a 32-bit scalar to a vector.
-def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
- (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
- (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
-// Implicitly promote a 32-bit scalar to a vector.
-def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
- (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), FR32:$src, sub_ss)>;
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
- (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), FR64:$src, sub_sd)>;
-
-let AddedComplexity = 20 in {
-// MOVSSrm zeros the high parts of the register; represent this
-// with SUBREG_TO_REG.
-def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
-def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
-def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
- (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
-// MOVSDrm zeros the high parts of the register; represent this
-// with SUBREG_TO_REG.
-def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-def : Pat<(v2f64 (X86vzload addr:$src)),
- (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
-}
-
-// Store scalar value to memory.
def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
"movss\t{$src, $dst|$dst, $src}",
[(store FR32:$src, addr:$dst)]>;
@@ -216,24 +407,257 @@ def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
"movsd\t{$src, $dst|$dst, $src}",
[(store FR64:$src, addr:$dst)]>;
-def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(store FR32:$src, addr:$dst)]>, XS, VEX;
-def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(store FR64:$src, addr:$dst)]>, XD, VEX;
+// Patterns
+let Predicates = [HasSSE1] in {
+ let AddedComplexity = 15 in {
+ // Extract the low 32-bit value from one vector and insert it into another.
+ def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4f32 VR128:$src1),
+ (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+ def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4i32 VR128:$src1),
+ (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSS to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (MOVSSrr (v4f32 (V_SET0)),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (MOVSSrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+ }
-// Extract and store.
-def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSSmr addr:$dst,
- (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
-def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
- addr:$dst),
- (MOVSDmr addr:$dst,
- (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+ let AddedComplexity = 20 in {
+ // MOVSSrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (MOVSSrm addr:$src), sub_ss)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSSmr addr:$dst,
+ (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+
+ // Shuffle with MOVSS
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+ (MOVSSrr VR128:$src1, FR32:$src2)>;
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4i32 VR128:$src1),
+ (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (MOVSSrr (v4f32 VR128:$src1),
+ (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+}
+
+let Predicates = [HasSSE2] in {
+ let AddedComplexity = 15 in {
+ // Extract the low 64-bit value from one vector and insert it into another.
+ def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2f64 VR128:$src1),
+ (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+ def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2i64 VR128:$src1),
+ (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+
+ // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
+ def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+ def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVSD to the lower bits.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSDrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG.
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (MOVSDrm addr:$src), sub_sd)>;
+ }
+
+ // Extract and store.
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (MOVSDmr addr:$dst,
+ (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+ // Shuffle with MOVSD
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+ (MOVSDrr VR128:$src1, FR64:$src2)>;
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2i64 VR128:$src1),
+ (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr (v2f64 VR128:$src1),
+ (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+
+  // FIXME: Instead of an X86Movlps there should be an X86Movsd here; the
+  // problem is during lowering, where it's not possible to recognize the fold
+  // because it has two uses through a bitcast. One use disappears at isel time
+  // and the fold opportunity reappears.
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),sub_sd))>;
+}
+
+let Predicates = [HasAVX] in {
+ let AddedComplexity = 15 in {
+ // Extract the low 32-bit value from one vector and insert it into another.
+ def : Pat<(v4f32 (movl VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4f32 VR128:$src1),
+ (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+ def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4i32 VR128:$src1),
+ (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+
+ // Extract the low 64-bit value from one vector and insert it into another.
+ def : Pat<(v2f64 (movl VR128:$src1, VR128:$src2)),
+ (VMOVSDrr (v2f64 VR128:$src1),
+ (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+ def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+ (VMOVSDrr (v2i64 VR128:$src1),
+ (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+
+ // vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
+ def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+ def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>;
+
+ // Move scalar to XMM zero-extended, zeroing a VR128 then do a
+ // MOVS{S,D} to the lower bits.
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+ (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+ def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+ (VMOVSSrr (v4f32 (V_SET0)),
+ (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
+ def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+ (VMOVSSrr (v4i32 (V_SET0)),
+ (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+ (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ }
+
+ let AddedComplexity = 20 in {
+ // MOVSSrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+
+ // MOVSDrm zeros the high parts of the register; represent this
+ // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+ def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+ (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+ (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+ def : Pat<(v2f64 (X86vzload addr:$src)),
+ (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_sd)>;
+
+  // Represent the same patterns as above, but in the form they appear for
+  // 256-bit types.
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_ss)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_sd)>;
+ }
+ def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+ (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
+ (SUBREG_TO_REG (i32 0),
+ (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
+ sub_xmm)>;
+ def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+ (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
+ (SUBREG_TO_REG (i64 0),
+ (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
+ sub_xmm)>;
+
+ // Extract and store.
+ def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSSmr addr:$dst,
+ (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+ addr:$dst),
+ (VMOVSDmr addr:$dst,
+ (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
+
+ // Shuffle with VMOVSS
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+ (VMOVSSrr VR128:$src1, FR32:$src2)>;
+ def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4i32 VR128:$src1),
+ (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
+ def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
+ (VMOVSSrr (v4f32 VR128:$src1),
+ (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
+
+ // Shuffle with VMOVSD
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+ (VMOVSDrr VR128:$src1, FR64:$src2)>;
+ def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr (v2i64 VR128:$src1),
+ (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr (v2f64 VR128:$src1),
+ (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
+ def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
+ sub_sd))>;
+ def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
+ sub_sd))>;
+
+ // FIXME: Instead of an X86Movlps there should be an X86Movsd here. The
+ // problem arises during lowering, where it's not possible to recognize the
+ // fold because it has two uses through a bitcast. One use disappears at isel
+ // time and the fold opportunity reappears.
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),
+ sub_sd))>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (VMOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2),
+ sub_sd))>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
+//===----------------------------------------------------------------------===//
-// Move Aligned/Unaligned floating point values
multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
X86MemOperand x86memop, PatFrag ld_frag,
string asm, Domain d,
@@ -248,22 +672,22 @@ let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
}
defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
- "movaps", SSEPackedSingle>, VEX;
+ "movaps", SSEPackedSingle>, TB, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
- "movapd", SSEPackedDouble>, OpSize, VEX;
+ "movapd", SSEPackedDouble>, TB, OpSize, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
- "movups", SSEPackedSingle>, VEX;
+ "movups", SSEPackedSingle>, TB, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
- "movupd", SSEPackedDouble, 0>, OpSize, VEX;
+ "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
- "movaps", SSEPackedSingle>, VEX;
+ "movaps", SSEPackedSingle>, TB, VEX;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
- "movapd", SSEPackedDouble>, OpSize, VEX;
+ "movapd", SSEPackedDouble>, TB, OpSize, VEX;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
- "movups", SSEPackedSingle>, VEX;
+ "movups", SSEPackedSingle>, TB, VEX;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
- "movupd", SSEPackedDouble, 0>, OpSize, VEX;
+ "movupd", SSEPackedDouble, 0>, TB, OpSize, VEX;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle>, TB;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
@@ -287,10 +711,10 @@ def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
[(store (v2f64 VR128:$src), addr:$dst)]>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore (v8f32 VR256:$src), addr:$dst)]>, VEX;
+ [(alignedstore256 (v8f32 VR256:$src), addr:$dst)]>, VEX;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore (v4f64 VR256:$src), addr:$dst)]>, VEX;
+ [(alignedstore256 (v4f64 VR256:$src), addr:$dst)]>, VEX;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v8f32 VR256:$src), addr:$dst)]>, VEX;
@@ -298,6 +722,34 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v4f64 VR256:$src), addr:$dst)]>, VEX;
+// For disassembler
+let isCodeGenOnly = 1 in {
+ def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+ def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+ def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>, VEX;
+ def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
+ def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+ def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+ def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>, VEX;
+ def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
+ (ins VR256:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>, VEX;
+}
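+
+// Note: these _REV forms reuse the store-direction opcodes (0x29/0x11) with
+// MRMDestReg so the disassembler can represent both valid encodings of a
+// register-to-register move; isCodeGenOnly = 1 keeps the duplicates out of
+// instruction selection.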
+
def : Pat<(int_x86_avx_loadu_ps_256 addr:$src), (VMOVUPSYrm addr:$src)>;
def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
@@ -319,24 +771,155 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v2f64 VR128:$src), addr:$dst)]>;
-// Intrinsic forms of MOVUPS/D load and store
-def VMOVUPSmr_Int : VPSI<0x11, MRMDestMem, (outs),
- (ins f128mem:$dst, VR128:$src),
- "movups\t{$src, $dst|$dst, $src}",
- [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>, VEX;
-def VMOVUPDmr_Int : VPDI<0x11, MRMDestMem, (outs),
- (ins f128mem:$dst, VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>, VEX;
-
-def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
- "movups\t{$src, $dst|$dst, $src}",
- [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
-def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
- "movupd\t{$src, $dst|$dst, $src}",
- [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
-
-// Move Low/High packed floating point values
+// For disassembler
+let isCodeGenOnly = 1 in {
+ def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+ def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+ def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movups\t{$src, $dst|$dst, $src}", []>;
+ def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movupd\t{$src, $dst|$dst, $src}", []>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+ (VMOVUPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [HasSSE1] in
+ def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+let Predicates = [HasSSE2] in
+ def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
+ (MOVUPDmr addr:$dst, VR128:$src)>;
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
+let Predicates = [HasSSE1] in {
+ def : Pat<(alignedloadv4i32 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv4i32 addr:$src),
+ (MOVUPSrm addr:$src)>;
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (MOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
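+
+// For example, (store (v4i32 VR128:$src), addr:$dst) selects MOVUPSmr here;
+// if the surrounding code runs in the integer domain, the SSE domain pass
+// mentioned above may later rewrite it to MOVDQU.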
+
+// Use vmovaps/vmovups for AVX integer load/store.
+let Predicates = [HasAVX] in {
+ // 128-bit load/store
+ def : Pat<(alignedloadv4i32 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv4i32 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+ def : Pat<(alignedloadv2i64 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv2i64 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+
+ def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+
+ // 256-bit load/store
+ def : Pat<(alignedloadv4i64 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv4i64 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedloadv8i32 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv8i32 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+}
+
+// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let neverHasSideEffects = 1 in {
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>;
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>;
+def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ "movaps\t{$src, $dst|$dst, $src}", []>, VEX;
+def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+ "movapd\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+
+// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
+// bits are disregarded. FIXME: Set encoding to pseudo!
+let canFoldAsLoad = 1, isReMaterializable = 1 in {
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+let isCodeGenOnly = 1 in {
+ def FsVMOVAPSrm : VPSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+ "movaps\t{$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>, VEX;
+ def FsVMOVAPDrm : VPDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+ "movapd\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>, VEX;
+}
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
PatFrag mov_frag, string base_opc,
string asm_opr> {
@@ -359,14 +942,10 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
let AddedComplexity = 20 in {
defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
"\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
- defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
}
let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
defm MOVL : sse12_mov_hilo_packed<0x12, VR128, movlp, "movlp",
"\t{$src2, $dst|$dst, $src2}">;
- defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
- "\t{$src2, $dst|$dst, $src2}">;
}
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
@@ -386,6 +965,147 @@ def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
[(store (f64 (vector_extract (v2f64 VR128:$src),
(iPTR 0))), addr:$dst)]>;
+let Predicates = [HasAVX] in {
+ let AddedComplexity = 20 in {
+ // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+ def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+ // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
+ def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ }
+
+ // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+ def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
+ VR128:$src2)), addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+
+ // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
+ def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+
+ // Shuffle with VMOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlps VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Shuffle with VMOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
+ (VMOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (VMOVLPDmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasSSE1] in {
+ let AddedComplexity = 20 in {
+ // vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+ def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ }
+
+ // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+ def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
+ VR128:$src2)), addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+
+ // Shuffle with MOVLPS
+ def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlps VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (MOVLPSrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v4i32 (X86Movlps
+ (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+ addr:$src1),
+ (MOVLPSmr addr:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasSSE2] in {
+ let AddedComplexity = 20 in {
+ // vector_shuffle v1, (load v2) <2, 1> using MOVLPS
+ def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ }
+
+ // (store (vector_shuffle (load addr), v2, <2, 1>), addr) using MOVLPS
+ def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+
+ // Shuffle with MOVLPD
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Movlpd VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (MOVLPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+ def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
+ addr:$src1),
+ (MOVLPDmr addr:$src1, VR128:$src2)>;
+}
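+
+// Note: the MOVLP patterns above are duplicated under explicit HasAVX,
+// HasSSE1, and HasSSE2 predicate blocks so that AVX targets select the
+// VEX-encoded VMOVLP* forms while plain SSE targets keep the legacy
+// encodings.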
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Hi packed FP Instructions
+//===----------------------------------------------------------------------===//
+
+let AddedComplexity = 20 in {
+ defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, VEX_4V;
+}
+let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
+ defm MOVH : sse12_mov_hilo_packed<0x16, VR128, movlhps, "movhp",
+ "\t{$src2, $dst|$dst, $src2}">;
+}
+
// v2f64 extract element 1 is always custom lowered to unpack high to low
// and extract element 0 so the non-store version isn't too horrible.
def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
@@ -411,6 +1131,80 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
(v2f64 (unpckh VR128:$src, (undef))),
(iPTR 0))), addr:$dst)]>;
+let Predicates = [HasAVX] in {
+ // VMOVHPS patterns
+ def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+ // FIXME: Instead of X86Unpcklpd, there should be an X86Movlhpd here. The
+ // problem arises during lowering, where it's not possible to recognize the
+ // load fold because it has two uses through a bitcast. One use disappears at
+ // isel time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+ // FIXME: This should be matched by an X86Movhpd instead. Same as above.
+ def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (VMOVHPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (f64 (vector_extract
+ (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst),
+ (VMOVHPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (f64 (vector_extract
+ (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [HasSSE1] in {
+ // MOVHPS patterns
+ def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (f64 (vector_extract
+ (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))), addr:$dst),
+ (MOVHPSmr addr:$dst, VR128:$src)>;
+}
+
+let Predicates = [HasSSE2] in {
+ // FIXME: Instead of X86Unpcklpd, there should be an X86Movlhpd here. The
+ // problem arises during lowering, where it's not possible to recognize the
+ // load fold because it has two uses through a bitcast. One use disappears at
+ // isel time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+ // FIXME: This should be matched by an X86Movhpd instead. Same as above.
+ def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)))),
+ (MOVHPDrm VR128:$src1, addr:$src2)>;
+
+ // Store patterns
+ def : Pat<(store (f64 (vector_extract
+ (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+
let AddedComplexity = 20 in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@@ -438,13 +1232,80 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
(v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
}
-def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
- (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
-let AddedComplexity = 20 in {
- def : Pat<(v4f32 (movddup VR128:$src, (undef))),
- (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
- def : Pat<(v2i64 (movddup VR128:$src, (undef))),
- (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+let Predicates = [HasAVX] in {
+ // MOVLHPS patterns
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+ (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+ def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+ (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+ // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+ def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ }
+ def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ let AddedComplexity = 20 in {
+ // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+ def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+ (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+ // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+ def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+ (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+ def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+ (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+ }
+
+ def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasSSE1] in {
+ // MOVLHPS patterns
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+ def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+ // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+ def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ }
+ def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ let AddedComplexity = 20 in {
+ // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+ def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+ // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+ def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+ def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+ }
+
+ def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
}
//===----------------------------------------------------------------------===//
@@ -462,10 +1323,9 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
multiclass sse12_cvt_s_np<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- []>;
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- []>;
+ def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm, []>;
+ let mayLoad = 1 in
+ def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm, []>;
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
@@ -481,36 +1341,39 @@ multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm> {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
+ let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>;
}
defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
- "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX;
+ "cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
+ VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
- VEX_W;
+ VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
- "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
+ "cvttsd2si\t{$src, $dst|$dst, $src}">, XD, VEX,
+ VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}">, XD,
- VEX, VEX_W;
+ VEX, VEX_W, VEX_LIG;
// The assembler can recognize rr 64-bit instructions by seeing an rxx
// register, but the same isn't true when only using memory operands, so
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate.
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">, XS,
- VEX_4V;
+ VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, XS,
- VEX_4V, VEX_W;
+ VEX_4V, VEX_W, VEX_LIG;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">, XD,
- VEX_4V;
+ VEX_4V, VEX_LIG;
defm VCVTSI2SDL : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, XD,
- VEX_4V;
+ VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, XD,
- VEX_4V, VEX_W;
+ VEX_4V, VEX_W, VEX_LIG;
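+
+// Note: VEX_LIG marks scalar VEX instructions that ignore the VEX.L bit, so
+// encodings with either L value are accepted; this is why it appears on these
+// scalar conversions but not on the packed forms.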
let Predicates = [HasAVX] in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
@@ -579,11 +1442,6 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
[(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))]>;
}
-defm Int_VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
- f32mem, load, "cvtss2si">, XS, VEX;
-defm Int_VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
- int_x86_sse_cvtss2si64, f32mem, load, "cvtss2si">,
- XS, VEX, VEX_W;
defm Int_VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
f128mem, load, "cvtsd2si">, XD, VEX;
defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
@@ -594,14 +1452,12 @@ defm Int_VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
// Get rid of this hack or rename the intrinsics; there are several
// instructions that only match with the intrinsic form. Why create
// duplicates just to let them be recognized by the assembler?
-defm VCVTSD2SI_alt : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
- "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX;
+defm VCVTSD2SI : sse12_cvt_s_np<0x2D, FR64, GR32, f64mem,
+ "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_s_np<0x2D, FR64, GR64, f64mem,
- "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W;
-defm Int_CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
- f32mem, load, "cvtss2si">, XS;
-defm Int_CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
- f32mem, load, "cvtss2si{q}">, XS, REX_W;
+ "cvtsd2si\t{$src, $dst|$dst, $src}">, XD, VEX, VEX_W,
+ VEX_LIG;
+
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
f128mem, load, "cvtsd2si{l}">, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
@@ -660,10 +1516,11 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
let Pattern = []<dag> in {
defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load,
- "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS, VEX;
+ "cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS,
+ VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, undef, f32mem, load,
"cvtss2si\t{$src, $dst|$dst, $src}">, XS, VEX,
- VEX_W;
+ VEX_W, VEX_LIG;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle>, TB, VEX;
@@ -671,6 +1528,7 @@ defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, undef, i256mem, load,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle>, TB, VEX;
}
+
let Pattern = []<dag> in {
defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, undef, f32mem, load /*dummy*/,
"cvtss2si{l}\t{$src, $dst|$dst, $src}">, XS;
@@ -681,19 +1539,43 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, undef, i128mem, load /*dummy*/,
SSEPackedSingle>, TB; /* PD SSE3 form is available */
}
+let Predicates = [HasSSE1] in {
+ def : Pat<(int_x86_sse_cvtss2si VR128:$src),
+ (CVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
+ (CVTSS2SIrm addr:$src)>;
+ def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
+ (CVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
+ (CVTSS2SI64rm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse_cvtss2si VR128:$src),
+ (VCVTSS2SIrr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ def : Pat<(int_x86_sse_cvtss2si (load addr:$src)),
+ (VCVTSS2SIrm addr:$src)>;
+ def : Pat<(int_x86_sse_cvtss2si64 VR128:$src),
+ (VCVTSS2SI64rr (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+ def : Pat<(int_x86_sse_cvtss2si64 (load addr:$src)),
+ (VCVTSS2SI64rm addr:$src)>;
+}
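+
+// Note: the cvtss2si intrinsics take a v4f32 operand while the CVTSS2SI
+// instructions take an FR32, so the register patterns above extract the
+// scalar element with EXTRACT_SUBREG ... sub_ss first; the load forms fold
+// the memory operand directly.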
+
/// SSE 2 Only
// Convert scalar double to scalar single
def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR64:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V;
+ VEX_4V, VEX_LIG;
+let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR64:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V;
+ []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG;
+
def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
- Requires<[HasAVX]>;
+ Requires<[HasAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
@@ -715,13 +1597,25 @@ defm Int_CVTSD2SS: sse12_cvt_sint_3addr<0x5A, VR128, VR128,
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR32:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, Requires<[HasAVX]>, VEX_4V;
+ []>, XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG;
+let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR32:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>;
-def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>,
- Requires<[HasAVX]>;
+ []>, XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(f64 (fextend FR32:$src)),
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>;
+ def : Pat<(fextend (loadf32 addr:$src)),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+}
+
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (MOVSSrm addr:$src))>,
+ Requires<[HasAVX, OptForSpeed]>;
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
@@ -732,6 +1626,16 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
[(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
Requires<[HasSSE2, OptForSize]>;
+// extload f32 -> f64. This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after DAG
+// combine. Since these loads aren't folded into the fextend, we have to
+// match the combination explicitly here.
+def : Pat<(fextend (loadf32 addr:$src)),
+ (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
+def : Pat<(extloadf32 addr:$src),
+ (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
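+
+// Note: under OptForSpeed the extload is split into a MOVSSrm load plus a
+// register-to-register CVTSS2SDrr, while the folded CVTSS2SDrm form above is
+// kept for OptForSize; presumably this trades an extra instruction for
+// avoiding the memory-form conversion on hot paths.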
+
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -759,10 +1663,6 @@ def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
Requires<[HasSSE2]>;
}
-def : Pat<(extloadf32 addr:$src),
- (CVTSS2SDrr (MOVSSrm addr:$src))>,
- Requires<[HasSSE2, OptForSpeed]>;
-
// Convert doubleword to packed single/double fp
// SSE2 instructions without OpSize prefix
def Int_VCVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -862,10 +1762,12 @@ def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// SSE2 packed instructions with XS prefix
def VCVTTPS2DQrr : VSSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+let mayLoad = 1 in
def VCVTTPS2DQrm : VSSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTTPS2DQYrr : VSSI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
+let mayLoad = 1 in
def VCVTTPS2DQYrm : VSSI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -877,7 +1779,6 @@ def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
[(set VR128:$dst,
(int_x86_sse2_cvttps2dq (memop addr:$src)))]>;
-
def Int_VCVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -889,16 +1790,33 @@ def Int_VCVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
(memop addr:$src)))]>,
XS, VEX, Requires<[HasAVX]>;
-def Int_VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>,
- VEX;
-def Int_VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst),
- (ins f128mem:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
- (memop addr:$src)))]>, VEX;
+let Predicates = [HasSSE2] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (Int_CVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (CVTTPS2DQrr VR128:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+ (Int_VCVTDQ2PSrr VR128:$src)>;
+ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+ (VCVTTPS2DQrr VR128:$src)>;
+ def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
+ (VCVTDQ2PSYrr VR256:$src)>;
+ def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
+ (VCVTTPS2DQYrr VR256:$src)>;
+}
+
+def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (int_x86_sse2_cvttpd2dq VR128:$src))]>, VEX;
+let isCodeGenOnly = 1 in
+def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ "cvttpd2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (memop addr:$src)))]>, VEX;
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
@@ -910,8 +1828,6 @@ def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
-def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
def VCVTTPD2DQXrYr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}", []>, VEX;
@@ -931,13 +1847,13 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
let Predicates = [HasAVX] in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}", []>, TB, VEX;
}
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
@@ -947,12 +1863,12 @@ def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
def Int_VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
- VEX, Requires<[HasAVX]>;
+ TB, VEX, Requires<[HasAVX]>;
def Int_VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd
(load addr:$src)))]>,
- VEX, Requires<[HasAVX]>;
+ TB, VEX, Requires<[HasAVX]>;
def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
@@ -1038,75 +1954,61 @@ def : Pat<(int_x86_avx_cvtt_ps2dq_256 VR256:$src),
def : Pat<(int_x86_avx_cvtt_ps2dq_256 (memopv8f32 addr:$src)),
(VCVTTPS2DQYrm addr:$src)>;
+// Match fround and fextend for 128/256-bit conversions
+def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
+ (VCVTPD2PSYrr VR256:$src)>;
+def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
+ (VCVTPD2PSYrm addr:$src)>;
+
+def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
+ (VCVTPS2PDYrr VR128:$src)>;
+def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+ (VCVTPS2PDYrm addr:$src)>;
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
+ SDNode OpNode, ValueType VT, PatFrag ld_frag,
string asm, string asm_alt> {
- let isAsmParserOnly = 1 in {
- def rr : SIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc),
- asm, []>;
- let mayLoad = 1 in
- def rm : SIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src, SSECC:$cc),
- asm, []>;
- }
+ def rr : SIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))]>;
+ def rm : SIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, SSECC:$cc), asm,
+ [(set RC:$dst, (OpNode (VT RC:$src1),
+ (ld_frag addr:$src2), imm:$cc))]>;
// Accept explicit immediate argument form instead of comparison code.
- def rr_alt : SIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2),
- asm_alt, []>;
- let mayLoad = 1 in
- def rm_alt : SIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src, i8imm:$src2),
- asm_alt, []>;
+ let neverHasSideEffects = 1 in {
+ def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$cc), asm_alt, []>;
+ let mayLoad = 1 in
+ def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, i8imm:$cc), asm_alt, []>;
+ }
}
-let neverHasSideEffects = 1 in {
- defm VCMPSS : sse12_cmp_scalar<FR32, f32mem,
- "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- "cmpss\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">,
- XS, VEX_4V;
- defm VCMPSD : sse12_cmp_scalar<FR64, f64mem,
- "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- "cmpsd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}">,
- XD, VEX_4V;
-}
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
+ "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ XS, VEX_4V, VEX_LIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
+ "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+ XD, VEX_4V, VEX_LIG;
let Constraints = "$src1 = $dst" in {
-def CMPSSrr : SIi8<0xC2, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, SSECC:$cc),
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmpss, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), FR32:$src2, imm:$cc))]>, XS;
-def CMPSSrm : SIi8<0xC2, MRMSrcMem,
- (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, SSECC:$cc),
- "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- [(set FR32:$dst, (X86cmpss (f32 FR32:$src1), (loadf32 addr:$src2), imm:$cc))]>, XS;
-def CMPSDrr : SIi8<0xC2, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, SSECC:$cc),
- "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
- [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), FR64:$src2, imm:$cc))]>, XD;
-def CMPSDrm : SIi8<0xC2, MRMSrcMem,
- (outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, SSECC:$cc),
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
+ XS;
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmpsd, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
- [(set FR64:$dst, (X86cmpsd (f64 FR64:$src1), (loadf64 addr:$src2), imm:$cc))]>, XD;
-}
-let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
-def CMPSSrr_alt : SIi8<0xC2, MRMSrcReg,
- (outs FR32:$dst), (ins FR32:$src1, FR32:$src, i8imm:$src2),
- "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
-def CMPSSrm_alt : SIi8<0xC2, MRMSrcMem,
- (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, i8imm:$src2),
- "cmpss\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XS;
-def CMPSDrr_alt : SIi8<0xC2, MRMSrcReg,
- (outs FR64:$dst), (ins FR64:$src1, FR64:$src, i8imm:$src2),
- "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
-def CMPSDrm_alt : SIi8<0xC2, MRMSrcMem,
- (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, i8imm:$src2),
- "cmpsd\t{$src2, $src, $dst|$dst, $src, $src2}", []>, XD;
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}">,
+ XD;
}
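+
+// Note: the primary forms above bake the condition code into the mnemonic
+// (cmp${cc}ss / cmp${cc}sd), while the _alt forms in the multiclass take an
+// explicit i8 immediate so the assembler can also encode condition codes
+// that have no mnemonic alias.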
multiclass sse12_cmp_scalar_int<RegisterClass RC, X86MemOperand x86memop,
@@ -1151,25 +2053,28 @@ multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", SSEPackedSingle>, VEX;
+ "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", SSEPackedDouble>, OpSize, VEX;
+ "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
+ VEX_LIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
- "comiss", SSEPackedSingle>, VEX;
+ "comiss", SSEPackedSingle>, TB, VEX,
+ VEX_LIG;
defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
- "comisd", SSEPackedDouble>, OpSize, VEX;
+ "comisd", SSEPackedDouble>, TB, OpSize, VEX,
+ VEX_LIG;
}
defm Int_VUCOMISS : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
- load, "ucomiss", SSEPackedSingle>, VEX;
+ load, "ucomiss", SSEPackedSingle>, TB, VEX;
defm Int_VUCOMISD : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
- load, "ucomisd", SSEPackedDouble>, OpSize, VEX;
+ load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
defm Int_VCOMISS : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
- load, "comiss", SSEPackedSingle>, VEX;
+ load, "comiss", SSEPackedSingle>, TB, VEX;
defm Int_VCOMISD : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
- load, "comisd", SSEPackedDouble>, OpSize, VEX;
+ load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
"ucomiss", SSEPackedSingle>, TB;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
@@ -1199,57 +2104,82 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
Domain d> {
let isAsmParserOnly = 1 in {
def rri : PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src, SSECC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, RC:$src, imm:$cc))], d>;
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, SSECC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))], d>;
def rmi : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, f128mem:$src, SSECC:$cc), asm,
- [(set RC:$dst, (Int RC:$src1, (memop addr:$src), imm:$cc))], d>;
+ (outs RC:$dst), (ins RC:$src1, f128mem:$src2, SSECC:$cc), asm,
+ [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))], d>;
}
// Accept explicit immediate argument form instead of comparison code.
def rri_alt : PIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src, i8imm:$src2),
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
asm_alt, [], d>;
def rmi_alt : PIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, f128mem:$src, i8imm:$src2),
+ (outs RC:$dst), (ins RC:$src1, f128mem:$src2, i8imm:$cc),
asm_alt, [], d>;
}
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
- "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
- "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
- SSEPackedSingle>, VEX_4V;
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle>, TB, VEX_4V;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
- "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
- "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
- SSEPackedDouble>, OpSize, VEX_4V;
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_ps_256,
- "cmp${cc}ps\t{$src, $src1, $dst|$dst, $src1, $src}",
- "cmpps\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
- SSEPackedSingle>, VEX_4V;
+ "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedSingle>, TB, VEX_4V;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, int_x86_avx_cmp_pd_256,
- "cmp${cc}pd\t{$src, $src1, $dst|$dst, $src1, $src}",
- "cmppd\t{$src2, $src, $src1, $dst|$dst, $src1, $src, $src2}",
- SSEPackedDouble>, OpSize, VEX_4V;
+ "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, int_x86_sse_cmp_ps,
- "cmp${cc}ps\t{$src, $dst|$dst, $src}",
- "cmpps\t{$src2, $src, $dst|$dst, $src, $src2}",
+ "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
+ "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSEPackedSingle>, TB;
defm CMPPD : sse12_cmp_packed<VR128, f128mem, int_x86_sse2_cmp_pd,
- "cmp${cc}pd\t{$src, $dst|$dst, $src}",
- "cmppd\t{$src2, $src, $dst|$dst, $src, $src2}",
+ "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
+ "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
SSEPackedDouble>, TB, OpSize;
}
+let Predicates = [HasSSE1] in {
def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [HasSSE2] in {
def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
(CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+ (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
+ (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
+def : Pat<(v8i32 (X86cmpps (v8f32 VR256:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
+ (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
+def : Pat<(v4i64 (X86cmppd (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
+ (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+}
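+
+// Note: these compare patterns produce integer vector results (v4i32, v2i64,
+// v8i32, v4i64) because cmpps/cmppd return an all-ones or all-zeros mask per
+// element, which the DAG models as an integer vector.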
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
@@ -1293,6 +2223,132 @@ let Constraints = "$src1 = $dst" in {
memopv2f64, SSEPackedDouble>, TB, OpSize;
}
+let Predicates = [HasSSE1] in {
+ def : Pat<(v4f32 (X86Shufps VR128:$src1,
+ (memopv4f32 addr:$src2), (i8 imm:$imm))),
+ (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufps VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+ (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
+ // fall back to this for SSE1)
+ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
+ (SHUFPSrri VR128:$src2, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Special unary SHUFPSrri case.
+ def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPSrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+}
+
+let Predicates = [HasSSE2] in {
+ // Special binary v4i32 shuffle cases with SHUFPS.
+ def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+ (SHUFPSrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)))),
+ (SHUFPSrmi VR128:$src1, addr:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Special unary SHUFPDrri cases.
+ def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+ (SHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Special binary v2i64 shuffle cases using SHUFPDrri.
+ def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+ (SHUFPDrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Generic SHUFPD patterns
+ def : Pat<(v2f64 (X86Shufps VR128:$src1,
+ (memopv2f64 addr:$src2), (i8 imm:$imm))),
+ (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86Shufps VR128:$src1,
+ (memopv4f32 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufps VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+ (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ // vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
+ // fall back to this for SSE1)
+ def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
+ (VSHUFPSrri VR128:$src2, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Special unary SHUFPSrri case.
+ def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+ (VSHUFPSrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Special binary v4i32 shuffle cases with SHUFPS.
+ def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+ (VSHUFPSrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ def : Pat<(v4i32 (shufp:$src3 VR128:$src1,
+ (bc_v4i32 (memopv2i64 addr:$src2)))),
+ (VSHUFPSrmi VR128:$src1, addr:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Special unary SHUFPDrri cases.
+ def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+ (VSHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+ (VSHUFPDrri VR128:$src1, VR128:$src1,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+ // Special binary v2i64 shuffle cases using SHUFPDrri.
+ def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+ (VSHUFPDrri VR128:$src1, VR128:$src2,
+ (SHUFFLE_get_shuf_imm VR128:$src3))>;
+
+ def : Pat<(v2f64 (X86Shufps VR128:$src1,
+ (memopv2f64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
+ def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+
+ // 256-bit patterns
+ def : Pat<(v8i32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v8i32 (X86Shufps VR256:$src1,
+ (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+ (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+ def : Pat<(v8f32 (X86Shufps VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v8f32 (X86Shufps VR256:$src1,
+ (memopv8f32 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+ def : Pat<(v4i64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v4i64 (X86Shufpd VR256:$src1,
+ (memopv4i64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+
+ def : Pat<(v4f64 (X86Shufpd VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
+ def : Pat<(v4f64 (X86Shufpd VR256:$src1,
+ (memopv4f64 addr:$src2), (i8 imm:$imm))),
+ (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
+}
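+// Note on the 256-bit patterns above: the VSHUFPS/VSHUFPD ymm forms apply
+// the same immediate-controlled shuffle independently to each 128-bit lane;
+// the immediate cannot move elements across the two lanes.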
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack Instructions
//===----------------------------------------------------------------------===//
@@ -1316,29 +2372,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, PatFrag OpNode, ValueType vt,
let AddedComplexity = 10 in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, unpckh, v2f64, memopv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, unpckl, v4f32, memopv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, unpckl, v2f64, memopv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, unpckh, v8f32, memopv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, unpckh, v4f64, memopv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, unpckl, v8f32, memopv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, unpckl, v4f64, memopv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, unpckh, v4f32, memopv4f32,
@@ -1356,6 +2412,103 @@ let AddedComplexity = 10 in {
} // Constraints = "$src1 = $dst"
} // AddedComplexity
+let Predicates = [HasSSE1] in {
+ def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
+ (UNPCKLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
+ (UNPCKLPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
+ (UNPCKHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
+ (UNPCKHPSrr VR128:$src1, VR128:$src2)>;
+}
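+// Worked example of the unpack semantics matched above: for
+//   a = <a0, a1, a2, a3>, b = <b0, b1, b2, b3>
+// unpcklps a, b interleaves the low halves, giving <a0, b0, a1, b1>, and
+// unpckhps a, b interleaves the high halves, giving <a2, b2, a3, b3>.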
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
+ (UNPCKLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
+ (UNPCKLPDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
+ (UNPCKHPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
+ (UNPCKHPDrr VR128:$src1, VR128:$src2)>;
+
+ // FIXME: Instead of X86Movddup, there should be an X86Unpcklpd here; the
+ // problem is during lowering, where it's not possible to recognize the load
+ // fold because it has two uses through a bitcast. One use disappears at isel
+ // time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movddup VR128:$src)),
+ (UNPCKLPDrr VR128:$src, VR128:$src)>;
+
+ let AddedComplexity = 10 in
+ def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+ (UNPCKLPDrr VR128:$src, VR128:$src)>;
+}
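+// Worked example: with x = <x0, x1>, unpcklpd x, x yields <x0, x0>, which
+// is exactly the movddup / splat_lo result the two patterns above select,
+// so no dedicated splat instruction is needed on SSE2.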
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
+ (VUNPCKLPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
+ (VUNPCKLPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
+ (VUNPCKHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
+ (VUNPCKHPSrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))),
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, (memopv8i32 addr:$src2))),
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, (memopv8f32 addr:$src2))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, (memopv8i32 addr:$src2))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
+ (VUNPCKLPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
+ (VUNPCKLPDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
+ (VUNPCKHPDrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
+ (VUNPCKHPDrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))),
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, (memopv4f64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+
+ // FIXME: Instead of X86Movddup, there should be an X86Unpcklpd here; the
+ // problem is during lowering, where it's not possible to recognize the load
+ // fold because it has two uses through a bitcast. One use disappears at isel
+ // time and the fold opportunity reappears.
+ def : Pat<(v2f64 (X86Movddup VR128:$src)),
+ (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+ let AddedComplexity = 10 in
+ def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+ (VUNPCKLPDrr VR128:$src, VR128:$src)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Extract Floating-Point Sign mask
//===----------------------------------------------------------------------===//
@@ -1370,91 +2523,60 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
!strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>, REX_W;
}
-// Mask creation
-defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
- "movmskps", SSEPackedSingle>, VEX;
-defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
- "movmskpd", SSEPackedDouble>, OpSize,
- VEX;
-defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
- "movmskps", SSEPackedSingle>, VEX;
-defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
- "movmskpd", SSEPackedDouble>, OpSize,
- VEX;
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
SSEPackedSingle>, TB;
defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
SSEPackedDouble>, TB, OpSize;
-// X86fgetsign
-def MOVMSKPDrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
- "movmskpd\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
-def MOVMSKPDrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
- "movmskpd\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (X86fgetsign FR64:$src))], SSEPackedDouble>, TB, OpSize;
-def MOVMSKPSrr32_alt : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
- "movmskps\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
-def MOVMSKPSrr64_alt : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
- "movmskps\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (X86fgetsign FR32:$src))], SSEPackedSingle>, TB;
-
-// Assembler Only
-def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
-def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
- "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
- VEX;
-def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, VEX;
-def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, OpSize,
- VEX;
-
-//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Misc aliasing of packed SSE 1 & 2 instructions
-//===----------------------------------------------------------------------===//
-
-// Aliases of packed SSE1 & SSE2 instructions for scalar use. These all have
-// names that start with 'Fs'.
-
-// Alias instructions that map fld0 to pxor for sse.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
- canFoldAsLoad = 1 in {
- // FIXME: Set encoding to pseudo!
-def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
- [(set FR32:$dst, fp32imm0)]>,
- Requires<[HasSSE1]>, TB, OpSize;
-def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>,
- Requires<[HasSSE2]>, TB, OpSize;
-def VFsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
- [(set FR32:$dst, fp32imm0)]>,
- Requires<[HasAVX]>, TB, OpSize, VEX_4V;
-def VFsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
- [(set FR64:$dst, fpimm0)]>,
- Requires<[HasAVX]>, TB, OpSize, VEX_4V;
-}
-
-// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
-// bits are disregarded.
-let neverHasSideEffects = 1 in {
-def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
- "movaps\t{$src, $dst|$dst, $src}", []>;
-def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
- "movapd\t{$src, $dst|$dst, $src}", []>;
-}
+def : Pat<(i32 (X86fgetsign FR32:$src)),
+ (MOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+ sub_ss))>, Requires<[HasSSE1]>;
+def : Pat<(i64 (X86fgetsign FR32:$src)),
+ (MOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+ sub_ss))>, Requires<[HasSSE1]>;
+def : Pat<(i32 (X86fgetsign FR64:$src)),
+ (MOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+ sub_sd))>, Requires<[HasSSE2]>;
+def : Pat<(i64 (X86fgetsign FR64:$src)),
+ (MOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+ sub_sd))>, Requires<[HasSSE2]>;
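+// Worked example for the X86fgetsign patterns above: movmskps packs the
+// sign bit of every f32 element into the low bits of the GPR result. The
+// INSERT_SUBREG places the scalar input into element 0 of an otherwise
+// IMPLICIT_DEF vector, so bit 0 of the mask is the requested sign bit (the
+// higher bits, coming from undefined elements, are presumably masked off
+// by the consumer of the node).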
-// Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
-// bits are disregarded.
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
-def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
- "movaps\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
-def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
- "movapd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+let Predicates = [HasAVX] in {
+ defm VMOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps,
+ "movmskps", SSEPackedSingle>, TB, VEX;
+ defm VMOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd,
+ "movmskpd", SSEPackedDouble>, TB,
+ OpSize, VEX;
+ defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
+ "movmskps", SSEPackedSingle>, TB, VEX;
+ defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
+ "movmskpd", SSEPackedDouble>, TB,
+ OpSize, VEX;
+
+ def : Pat<(i32 (X86fgetsign FR32:$src)),
+ (VMOVMSKPSrr32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+ sub_ss))>;
+ def : Pat<(i64 (X86fgetsign FR32:$src)),
+ (VMOVMSKPSrr64 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src,
+ sub_ss))>;
+ def : Pat<(i32 (X86fgetsign FR64:$src)),
+ (VMOVMSKPDrr32 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+ sub_sd))>;
+ def : Pat<(i64 (X86fgetsign FR64:$src)),
+ (VMOVMSKPDrr64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src,
+ sub_sd))>;
+
+ // Assembler Only
+ def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
+ def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+ "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
+ OpSize, VEX;
+ def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+ "movmskps\t{$src, $dst|$dst, $src}", [], SSEPackedSingle>, TB, VEX;
+ def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+ "movmskpd\t{$src, $dst|$dst, $src}", [], SSEPackedDouble>, TB,
+ OpSize, VEX;
}
//===----------------------------------------------------------------------===//
@@ -1466,10 +2588,10 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, VEX_4V;
+ FR32, f32, f128mem, memopfsf32, SSEPackedSingle, 0>, TB, VEX_4V;
defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, OpSize, VEX_4V;
+ FR64, f64, f128mem, memopfsf64, SSEPackedDouble, 0>, TB, OpSize, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, FR32,
@@ -1494,21 +2616,22 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
///
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
- let Pattern = []<dag> in {
- defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f128mem,
- [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))],
- [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (memopv2i64 addr:$src2)))], 0>, VEX_4V;
-
- defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f128mem,
- [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (bc_v2i64 (v2f64 VR128:$src2))))],
- [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (memopv2i64 addr:$src2)))], 0>,
- OpSize, VEX_4V;
- }
+ // In AVX there is no need to add a pattern for 128-bit logical rr ps,
+ // because those are all promoted to v2i64 and the patterns are covered by
+ // the int version. The pattern is needed only in SSE, because v2i64 is
+ // supported on SSE2 but not on SSE1.
+ defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f128mem, [],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
+ (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+
+ defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f128mem,
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))],
+ [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
+ (memopv2i64 addr:$src2)))], 0>,
+ TB, OpSize, VEX_4V;
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
@@ -1533,7 +2656,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "ps"), f256mem,
[(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (memopv4i64 addr:$src2)))], 0>, VEX_4V;
+ (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V;
defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
@@ -1541,7 +2664,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
(bc_v4i64 (v4f64 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
(memopv4i64 addr:$src2)))], 0>,
- OpSize, VEX_4V;
+ TB, OpSize, VEX_4V;
}
// AVX 256-bit packed logical ops forms
@@ -1632,32 +2755,32 @@ multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr> {
// Binary Arithmetic instructions
defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, 0>,
- basic_sse12_fp_binop_s_int<0x58, "add", 0>,
- basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
+ basic_sse12_fp_binop_s_int<0x58, "add", 0>, VEX_4V, VEX_LIG;
+defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, 0>,
basic_sse12_fp_binop_p_y<0x58, "add", fadd>, VEX_4V;
defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, 0>,
- basic_sse12_fp_binop_s_int<0x59, "mul", 0>,
- basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
+ basic_sse12_fp_binop_s_int<0x59, "mul", 0>, VEX_4V, VEX_LIG;
+defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, 0>,
basic_sse12_fp_binop_p_y<0x59, "mul", fmul>, VEX_4V;
let isCommutable = 0 in {
defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, 0>,
- basic_sse12_fp_binop_s_int<0x5C, "sub", 0>,
- basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
+ basic_sse12_fp_binop_s_int<0x5C, "sub", 0>, VEX_4V, VEX_LIG;
+ defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, 0>,
basic_sse12_fp_binop_p_y<0x5C, "sub", fsub>, VEX_4V;
defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, 0>,
- basic_sse12_fp_binop_s_int<0x5E, "div", 0>,
- basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
+ basic_sse12_fp_binop_s_int<0x5E, "div", 0>, VEX_4V, VEX_LIG;
+ defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, 0>,
basic_sse12_fp_binop_p_y<0x5E, "div", fdiv>, VEX_4V;
defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, 0>,
- basic_sse12_fp_binop_s_int<0x5F, "max", 0>,
- basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
+ basic_sse12_fp_binop_s_int<0x5F, "max", 0>, VEX_4V, VEX_LIG;
+ defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, 0>,
basic_sse12_fp_binop_p_int<0x5F, "max", 0>,
basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax>,
basic_sse12_fp_binop_p_y_int<0x5F, "max">, VEX_4V;
defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, 0>,
- basic_sse12_fp_binop_s_int<0x5D, "min", 0>,
- basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
+ basic_sse12_fp_binop_s_int<0x5D, "min", 0>, VEX_4V, VEX_LIG;
+ defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, 0>,
basic_sse12_fp_binop_p_int<0x5D, "min", 0>,
basic_sse12_fp_binop_p_y_int<0x5D, "min">,
basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin>, VEX_4V;
@@ -1720,23 +2843,18 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
}
/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form.
-multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
- SDNode OpNode, Intrinsic F32Int> {
+multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f32mem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- []>, XS, Requires<[HasAVX, OptForSize]>;
- def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ let mayLoad = 1 in
+ def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
!strconcat(OpcodeStr,
- "ss\t{$src, $dst, $dst|$dst, $dst, $src}"),
- [(set VR128:$dst, (F32Int VR128:$src))]>;
- def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins ssmem:$src1, VR128:$src2),
!strconcat(OpcodeStr,
- "ss\t{$src, $dst, $dst|$dst, $dst, $src}"),
- [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
@@ -1801,21 +2919,17 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
}
/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
-multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr,
- SDNode OpNode, Intrinsic F64Int> {
+multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
- (ins FR64:$src1, f64mem:$src2),
+ def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
+ !strconcat(OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- !strconcat(OpcodeStr, "sd\t{$src, $dst, $dst|$dst, $dst, $src}"),
- [(set VR128:$dst, (F64Int VR128:$src))]>;
- def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
- !strconcat(OpcodeStr, "sd\t{$src, $dst, $dst|$dst, $dst, $src}"),
- [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
}
/// sse2_fp_unop_p - SSE2 unops in vector forms.
@@ -1863,9 +2977,8 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX] in {
// Square root.
- defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse_sqrt_ss>,
- sse2_fp_unop_s_avx<0x51, "vsqrt", fsqrt, int_x86_sse2_sqrt_sd>,
- VEX_4V;
+ defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt">,
+ sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG;
defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt>,
sse2_fp_unop_p<0x51, "vsqrt", fsqrt>,
@@ -1879,21 +2992,76 @@ let Predicates = [HasAVX] in {
  // Reciprocal approximations. Note that these typically require refinement
  // in order to obtain suitable precision (see the refinement sketch after
  // this block).
- defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt", X86frsqrt,
- int_x86_sse_rsqrt_ss>, VEX_4V;
+ defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG;
defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt>,
sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt>,
sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256>,
sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps>, VEX;
- defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp", X86frcp, int_x86_sse_rcp_ss>,
- VEX_4V;
+ defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG;
defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp>,
sse1_fp_unop_p_y<0x53, "vrcp", X86frcp>,
sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256>,
sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps>, VEX;
}
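+// Refinement sketch for the reciprocal approximations above: rcpss and
+// rsqrtss guarantee only about 12 bits of precision, and one Newton-Raphson
+// step roughly doubles the number of accurate bits:
+//   rcp:   x1 = x0 * (2.0 - a * x0)              (x0 = rcpss(a))
+//   rsqrt: x1 = x0 * (1.5 - 0.5 * a * x0 * x0)   (x0 = rsqrtss(a))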
+def : Pat<(f32 (fsqrt FR32:$src)),
+ (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (fsqrt (load addr:$src))),
+ (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+def : Pat<(f64 (fsqrt FR64:$src)),
+ (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+def : Pat<(f64 (fsqrt (load addr:$src))),
+ (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frsqrt FR32:$src)),
+ (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frsqrt (load addr:$src))),
+ (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+def : Pat<(f32 (X86frcp FR32:$src)),
+ (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(f32 (X86frcp (load addr:$src))),
+ (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+ (VSQRTSSr (f32 (IMPLICIT_DEF)),
+ (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
+ sub_ss)>;
+ def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+ (VSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+ def : Pat<(int_x86_sse2_sqrt_sd VR128:$src),
+ (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
+ (VSQRTSDr (f64 (IMPLICIT_DEF)),
+ (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd)),
+ sub_sd)>;
+ def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+ (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+
+ def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+ (VRSQRTSSr (f32 (IMPLICIT_DEF)),
+ (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
+ sub_ss)>;
+ def : Pat<(int_x86_sse_rsqrt_ss sse_load_f32:$src),
+ (VRSQRTSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+ def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+ (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+ (VRCPSSr (f32 (IMPLICIT_DEF)),
+ (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)),
+ sub_ss)>;
+ def : Pat<(int_x86_sse_rcp_ss sse_load_f32:$src),
+ (VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+
// Square root.
defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss>,
sse1_fp_unop_p<0x51, "sqrt", fsqrt>,
@@ -1992,7 +3160,7 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
[(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
- (MOVNTDQmr addr:$dst, VR128:$src)>;
+ (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
@@ -2006,7 +3174,7 @@ def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
}
//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Misc Instructions (No AVX form)
+// SSE 1 & 2 - Prefetch and memory fence
//===----------------------------------------------------------------------===//
// Prefetch intrinsic.
@@ -2019,66 +3187,26 @@ def PREFETCHT2 : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
"prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))]>;
-// Load, store, and memory fence
-def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
- TB, Requires<[HasSSE1]>;
-def : Pat<(X86SFence), (SFENCE)>;
-
-// Alias instructions that map zero vector to pxor / xorp* for sse.
-// We set canFoldAsLoad because this can be converted to a constant-pool
-// load of an all-zeros value if folding it would be beneficial.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementation, it does not expand the instructions below like
-// X86MCInstLower does.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1 in {
-def V_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4f32 immAllZerosV))]>;
-def V_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v2f64 immAllZerosV))]>;
-let ExeDomain = SSEPackedInt in
-def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllZerosV))]>;
-}
-
-// The same as done above but for AVX. The 128-bit versions are the
-// same, but re-encoded. The 256-bit does not support PI version, and
-// doesn't need it because on sandy bridge the register is set to zero
-// at the rename stage without using any execution unit, so SET0PSY
-// and SET0PDY can be used for vector int instructions without penalty
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementatioan, it does not expand the instructions below like
-// X86MCInstLower does.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1, Predicates = [HasAVX] in {
-def AVX_SET0PS : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PD : PDI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v2f64 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
-let ExeDomain = SSEPackedInt in
-def AVX_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllZerosV))]>;
-}
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+ "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+ TB, Requires<[HasSSE2]>;
-def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>;
-def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
+// Pause. This "instruction" is encoded as "rep; nop", so even though it
+// was introduced with SSE2, it's backward compatible.
+def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
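+// For example, "pause" assembles to the byte sequence F3 90 (a REP prefix
+// followed by NOP), which pre-SSE2 processors execute as a plain nop.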
-def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
- (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss))>;
+// Load, store, and memory fence
+def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
+ "sfence", [(int_x86_sse_sfence)]>, TB, Requires<[HasSSE1]>;
+def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
+ "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
+ "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
-// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while
-// in the non-AVX version bits 127:64 aren't touched. Find a better way to
-// represent this instead of always zeroing SRC1. One possible solution is
-// to represent the instruction w/ something similar as the "$src1 = $dst"
-// constraint but without the tied operands.
-def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)), addr:$src)>,
- Requires<[HasAVX, OptForSpeed]>;
+def : Pat<(X86SFence), (SFENCE)>;
+def : Pat<(X86LFence), (LFENCE)>;
+def : Pat<(X86MFence), (MFENCE)>;
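+// All three fences share opcode 0F AE and are distinguished only by the
+// ModRM byte, matching the MRM_* forms above: sfence (0F AE F8) orders
+// stores, lfence (0F AE E8) orders loads, and mfence (0F AE F0) orders both.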
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Load/Store XCSR register
@@ -2106,10 +3234,22 @@ def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
}
-def VMOVDQUrr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
-def VMOVDQUYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- "movdqu\t{$src, $dst|$dst, $src}", []>, XS, VEX;
+def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+
+// For Disassembler
+let isCodeGenOnly = 1 in {
+def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
+ "movdqu\t{$src, $dst|$dst, $src}", []>, VEX;
+}
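+// Note on the _REV defs above: a register-to-register movdqa/movdqu has two
+// valid encodings, the load form (opcode 6F) and the store form (opcode 7F).
+// Codegen only emits the 6F form, so these isCodeGenOnly defs exist purely
+// to let the disassembler round-trip the 7F register-to-register encoding.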
let canFoldAsLoad = 1, mayLoad = 1 in {
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -2147,6 +3287,16 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[]>, XS, Requires<[HasSSE2]>;
+// For Disassembler
+let isCodeGenOnly = 1 in {
+def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqa\t{$src, $dst|$dst, $src}", []>;
+
+def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
+ "movdqu\t{$src, $dst|$dst, $src}",
+ []>, XS, Requires<[HasSSE2]>;
+}
+
let canFoldAsLoad = 1, mayLoad = 1 in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqa\t{$src, $dst|$dst, $src}",
@@ -2180,9 +3330,11 @@ def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
} // ExeDomain = SSEPackedInt
-def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>;
-def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
- (VMOVDQUYmr addr:$dst, VR256:$src)>;
+let Predicates = [HasAVX] in {
+ def : Pat<(int_x86_avx_loadu_dq_256 addr:$src), (VMOVDQUYrm addr:$src)>;
+ def : Pat<(int_x86_avx_storeu_dq_256 addr:$dst, VR256:$src),
+ (VMOVDQUYmr addr:$dst, VR256:$src)>;
+}
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Arithmetic Instructions
@@ -2415,15 +3567,14 @@ let ExeDomain = SSEPackedInt in {
def VPANDNrr : PDI<0xDF, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
- VR128:$src2)))]>, VEX_4V;
+ [(set VR128:$dst,
+ (v2i64 (X86andnp VR128:$src1, VR128:$src2)))]>,VEX_4V;
def VPANDNrm : PDI<0xDF, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"vpandn\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
- (memopv2i64 addr:$src2))))]>,
- VEX_4V;
+ [(set VR128:$dst, (X86andnp VR128:$src1,
+ (memopv2i64 addr:$src2)))]>, VEX_4V;
}
}
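+// Note the operand order captured by X86andnp: pandn computes
+//   dst = ~src1 & src2
+// i.e. the first source is the one complemented, which is why the node can
+// replace the old (and (vnot $src1), $src2) patterns one-for-one.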
@@ -2527,6 +3678,32 @@ let Predicates = [HasAVX] in {
0>, VEX_4V;
defm VPCMPGTD : PDI_binop_rm_int<0x66, "vpcmpgtd", int_x86_sse2_pcmpgt_d, 0,
0>, VEX_4V;
+
+ def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+ (VPCMPEQBrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+ (VPCMPEQBrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+ (VPCMPEQWrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+ (VPCMPEQWrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+ (VPCMPEQDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+ (VPCMPEQDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+ (VPCMPGTBrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+ (VPCMPGTBrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+ (VPCMPGTWrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+ (VPCMPGTWrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+ (VPCMPGTDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+ (VPCMPGTDrm VR128:$src1, addr:$src2)>;
}
let Constraints = "$src1 = $dst" in {
@@ -2538,31 +3715,33 @@ let Constraints = "$src1 = $dst" in {
defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
} // Constraints = "$src1 = $dst"
-def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
- (PCMPEQBrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
- (PCMPEQBrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
- (PCMPEQWrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
- (PCMPEQWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
- (PCMPEQDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
- (PCMPEQDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
- (PCMPGTBrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
- (PCMPGTBrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
- (PCMPGTWrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
- (PCMPGTWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
- (PCMPGTDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
- (PCMPGTDrm VR128:$src1, addr:$src2)>;
+let Predicates = [HasSSE2] in {
+ def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+ (PCMPEQBrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+ (PCMPEQBrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+ (PCMPEQWrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+ (PCMPEQWrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+ (PCMPEQDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+ (PCMPEQDrm VR128:$src1, addr:$src2)>;
+
+ def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+ (PCMPGTBrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+ (PCMPGTBrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+ (PCMPGTWrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+ (PCMPGTWrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+ (PCMPGTDrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+ (PCMPGTDrm VR128:$src1, addr:$src2)>;
+}
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Pack Instructions
@@ -2608,7 +3787,7 @@ def mi : Ii8<0x70, MRMSrcMem,
let Predicates = [HasAVX] in {
let AddedComplexity = 5 in
- defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, OpSize,
+ defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize,
VEX;
// SSE2 with ImmT == Imm8 and XS prefix.
@@ -2618,6 +3797,34 @@ let Predicates = [HasAVX] in {
// SSE2 with ImmT == Imm8 and XD prefix.
defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, pshuflw, bc_v8i16>, XD,
VEX;
+
+ let AddedComplexity = 5 in
+ def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+ (VPSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+ // Unary v4f32 shuffle with VPSHUF* in order to fold a load.
+ def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+ (VPSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+
+ def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
+ (i8 imm:$imm))),
+ (VPSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
+ (i8 imm:$imm))),
+ (VPSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (VPSHUFDri VR128:$src1, imm:$imm)>;
+ def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (VPSHUFDri VR128:$src1, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
+ (VPSHUFHWri VR128:$src, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
+ (i8 imm:$imm))),
+ (VPSHUFHWmi addr:$src, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
+ (VPSHUFLWri VR128:$src, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
+ (i8 imm:$imm))),
+ (VPSHUFLWmi addr:$src, imm:$imm)>;
}
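+// Worked example of the PSHUFD immediate used above: each 2-bit field of
+// the immediate selects the source dword for one result dword,
+//   dst[i] = src[(imm >> 2*i) & 3]   for i = 0..3
+// so imm = 0x1B reverses the four elements and imm = 0x00 splats element 0.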
let Predicates = [HasSSE2] in {
@@ -2629,6 +3836,34 @@ let Predicates = [HasSSE2] in {
// SSE2 with ImmT == Imm8 and XD prefix.
defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, pshuflw, bc_v8i16>, XD;
+
+ let AddedComplexity = 5 in
+ def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+ (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+ // Unary v4f32 shuffle with PSHUF* in order to fold a load.
+ def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+ (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>;
+
+ def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
+ (i8 imm:$imm))),
+ (PSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
+ (i8 imm:$imm))),
+ (PSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (PSHUFDri VR128:$src1, imm:$imm)>;
+ def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (PSHUFDri VR128:$src1, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
+ (PSHUFHWri VR128:$src, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)),
+ (i8 imm:$imm))),
+ (PSHUFHWmi addr:$src, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
+ (PSHUFLWri VR128:$src, imm:$imm)>;
+ def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)),
+ (i8 imm:$imm))),
+ (PSHUFLWmi addr:$src, imm:$imm)>;
}
//===---------------------------------------------------------------------===//
@@ -2637,71 +3872,69 @@ let Predicates = [HasSSE2] in {
let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
- PatFrag unp_frag, PatFrag bc_frag, bit Is2Addr = 1> {
+ SDNode OpNode, PatFrag bc_frag, bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (vt (unp_frag VR128:$src1, VR128:$src2)))]>;
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))]>;
def rm : PDI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (unp_frag VR128:$src1,
+ [(set VR128:$dst, (OpNode VR128:$src1,
(bc_frag (memopv2i64
addr:$src2))))]>;
}
let Predicates = [HasAVX] in {
- defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, unpckl, bc_v16i8,
- 0>, VEX_4V;
- defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, unpckl, bc_v8i16,
- 0>, VEX_4V;
- defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, unpckl, bc_v4i32,
- 0>, VEX_4V;
+ defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw,
+ bc_v16i8, 0>, VEX_4V;
+ defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Punpcklwd,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq,
+ bc_v4i32, 0>, VEX_4V;
/// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
/// knew to collapse (bitconvert VT to VT) into its operand.
def VPUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
- "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>, VEX_4V;
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1,
+ VR128:$src2)))]>, VEX_4V;
def VPUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
- "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v2i64 (unpckl VR128:$src1,
- (memopv2i64 addr:$src2))))]>, VEX_4V;
-
- defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, unpckh, bc_v16i8,
- 0>, VEX_4V;
- defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, unpckh, bc_v8i16,
- 0>, VEX_4V;
- defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, unpckh, bc_v4i32,
- 0>, VEX_4V;
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1,
+ (memopv2i64 addr:$src2))))]>, VEX_4V;
+
+ defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw,
+ bc_v16i8, 0>, VEX_4V;
+ defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Punpckhwd,
+ bc_v8i16, 0>, VEX_4V;
+ defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq,
+ bc_v4i32, 0>, VEX_4V;
/// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
/// knew to collapse (bitconvert VT to VT) into its operand.
def VPUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
- "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>, VEX_4V;
+ (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1,
+ VR128:$src2)))]>, VEX_4V;
def VPUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
- "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst,
- (v2i64 (unpckh VR128:$src1,
- (memopv2i64 addr:$src2))))]>, VEX_4V;
+ (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1,
+ (memopv2i64 addr:$src2))))]>, VEX_4V;
}
let Constraints = "$src1 = $dst" in {
- defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, unpckl, bc_v16i8>;
- defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, unpckl, bc_v8i16>;
- defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, unpckl, bc_v4i32>;
+ defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, bc_v16i8>;
+ defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, bc_v8i16>;
+ defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, bc_v4i32>;
/// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
/// knew to collapse (bitconvert VT to VT) into its operand.
@@ -2709,17 +3942,17 @@ let Constraints = "$src1 = $dst" in {
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpcklqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
+ (v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)))]>;
def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpcklqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v2i64 (unpckl VR128:$src1,
+ (v2i64 (X86Punpcklqdq VR128:$src1,
(memopv2i64 addr:$src2))))]>;
- defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, unpckh, bc_v16i8>;
- defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, unpckh, bc_v8i16>;
- defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, unpckh, bc_v4i32>;
+ defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, bc_v16i8>;
+ defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, bc_v8i16>;
+ defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, bc_v4i32>;
/// FIXME: we could eliminate this and use sse2_unpack instead if tblgen
/// knew to collapse (bitconvert VT to VT) into its operand.
@@ -2727,17 +3960,24 @@ let Constraints = "$src1 = $dst" in {
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"punpckhqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
+ (v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)))]>;
def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
"punpckhqdq\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v2i64 (unpckh VR128:$src1,
+ (v2i64 (X86Punpckhqdq VR128:$src1,
(memopv2i64 addr:$src2))))]>;
}
-
} // ExeDomain = SSEPackedInt
+// Splat v2i64 (v2f64 splats are handled with UNPCKLPD patterns above)
+let AddedComplexity = 10 in {
+ def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+ (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+ def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+ (VPUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasAVX]>;
+}
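+// Worked example: with x = <x0, x1>, punpcklqdq x, x interleaves the low
+// quadwords of the two (identical) sources, producing the <x0, x0> splat
+// the patterns above select.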
+
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//
@@ -2769,7 +4009,7 @@ def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
(outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
"vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
- imm:$src2))]>, OpSize, VEX;
+ imm:$src2))]>, TB, OpSize, VEX;
def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
(outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
"pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2778,11 +4018,11 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
// Insert
let Predicates = [HasAVX] in {
- defm VPINSRW : sse2_pinsrw<0>, OpSize, VEX_4V;
+ defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
def VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
"vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, OpSize, VEX_4V;
+ []>, TB, OpSize, VEX_4V;
}
let Constraints = "$src1 = $dst" in
@@ -2839,7 +4079,9 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
// SSE2 - Move Doubleword
//===---------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// Move Int Doubleword to Packed Double Int
+//
def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2849,6 +4091,14 @@ def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
[(set VR128:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))]>,
VEX;
+def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>, VEX;
+def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>, VEX;
+
def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2865,8 +4115,9 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert GR64:$src))]>;
-
+//===---------------------------------------------------------------------===//
// Move Int Doubleword to Single Scalar
+//
def VMOVDI2SSrr : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert GR32:$src))]>, VEX;
@@ -2883,7 +4134,9 @@ def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+//===---------------------------------------------------------------------===//
// Move Packed Doubleword Int to Packed Double Int
+//
def VMOVPDI2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
@@ -2902,22 +4155,48 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
[(store (i32 (vector_extract (v4i32 VR128:$src),
(iPTR 0))), addr:$dst)]>;
-def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+//===---------------------------------------------------------------------===//
+// Move the first element of a Packed Quadword Int to a Quadword Int
+//
+def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))]>,
+ TB, OpSize, VEX, VEX_W, Requires<[HasAVX, In64BitMode]>;
+
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))]>;
+
+//===---------------------------------------------------------------------===//
+// Bitcast FR64 <-> GR64
+//
+let Predicates = [HasAVX] in
+def VMOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+ "vmovq\t{$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+ VEX;
+def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
"mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
- (iPTR 0)))]>;
+ [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+
def MOV64toSDrm : S3SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+ "mov{d|q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+ "movq\t{$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
-def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (bitconvert FR64:$src))]>;
-def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
- "movq\t{$src, $dst|$dst, $src}",
- [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
-
+//===---------------------------------------------------------------------===//
// Move Scalar Single to Double Int
+//
def VMOVSS2DIrr : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32:$src))]>, VEX;
@@ -2931,7 +4210,9 @@ def MOVSS2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
-// movd / movq to XMM register zero-extends
+//===---------------------------------------------------------------------===//
+// Patterns and instructions for zero-extending movd/movq into an XMM register
+//
let AddedComplexity = 15 in {
def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
@@ -2967,15 +4248,36 @@ def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
[(set VR128:$dst,
(v4i32 (X86vzmovl (v4i32 (scalar_to_vector
(loadi32 addr:$src))))))]>;
+}
-def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+let Predicates = [HasSSE2], AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
(MOVZDI2PDIrm addr:$src)>;
-def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(MOVZDI2PDIrm addr:$src)>;
-def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
(MOVZDI2PDIrm addr:$src)>;
}
+let Predicates = [HasAVX] in {
+ // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
+ let AddedComplexity = 20 in {
+ def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+ (VMOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+ (VMOVZDI2PDIrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+ (VMOVZDI2PDIrm addr:$src)>;
+ }
+ // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+ def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
+}
+
// These are the correct encodings of the instructions so that we know how to
// read correct assembly, even though we continue to emit the wrong ones for
// compatibility with Darwin's buggy assembler.
@@ -2996,7 +4298,9 @@ def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
// SSE2 - Move Quadword
//===---------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
// Move Quadword Int to Packed Quadword Int
+//
def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -3008,7 +4312,9 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
+//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
+//
def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (i64 (vector_extract (v2i64 VR128:$src),
@@ -3018,10 +4324,9 @@ def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
[(store (i64 (vector_extract (v2i64 VR128:$src),
(iPTR 0))), addr:$dst)]>;
-def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
- (f64 (EXTRACT_SUBREG (v2f64 VR128:$src), sub_sd))>;
-
+//===---------------------------------------------------------------------===//
// Store / copy lower 64-bits of a XMM register.
+//
def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX;
@@ -3037,7 +4342,7 @@ def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(loadi64 addr:$src))))))]>,
XS, VEX, Requires<[HasAVX]>;
-let AddedComplexity = 20 in {
+let AddedComplexity = 20 in
def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -3045,15 +4350,27 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(loadi64 addr:$src))))))]>,
XS, Requires<[HasSSE2]>;
-def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+let Predicates = [HasSSE2], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVZQI2PQIrm addr:$src)>;
-def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
(MOVZQI2PQIrm addr:$src)>;
-def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+let Predicates = [HasAVX], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+ (VMOVZQI2PQIrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzload addr:$src)),
+ (VMOVZQI2PQIrm addr:$src)>;
}
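// A note on the predicate split used throughout this section: the same DAG
// patterns are duplicated under HasSSE2 and HasAVX so that AVX-capable
// targets always select the VEX-encoded forms and never mix legacy SSE and
// VEX encodings. The shape of each pair is:
//
//   let Predicates = [HasSSE2], AddedComplexity = 20 in
//     def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
//   let Predicates = [HasAVX], AddedComplexity = 20 in
//     def : Pat<(v2i64 (X86vzload addr:$src)), (VMOVZQI2PQIrm addr:$src)>;
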
+//===---------------------------------------------------------------------===//
// Moving from XMM to XMM and clearing the upper 64 bits. Note: there is a bug
// in the IA-32 documentation; movq xmm1, xmm2 does clear the high bits.
+//
let AddedComplexity = 15 in
def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vmovq\t{$src, $dst|$dst, $src}",
@@ -3077,9 +4394,21 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
[(set VR128:$dst, (v2i64 (X86vzmovl
(loadv2i64 addr:$src))))]>,
XS, Requires<[HasSSE2]>;
+}
-def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
- (MOVZPQILo2PQIrm addr:$src)>;
+let AddedComplexity = 20 in {
+ let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+ (MOVZPQILo2PQIrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (MOVZPQILo2PQIrr VR128:$src)>;
+ }
+ let Predicates = [HasAVX] in {
+ def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+ (VMOVZPQILo2PQIrm addr:$src)>;
+ def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+ (VMOVZPQILo2PQIrr VR128:$src)>;
+ }
}
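// AddedComplexity biases matching order: higher values are tried first, so
// the complexity-20 load patterns in this block take precedence over the
// plain register-to-register forms (complexity 15) whenever the source is a
// load, e.g.:
//
//   (v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src))))
//     --> MOVZPQILo2PQIrm addr:$src
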
// Instructions to match in the assembler
@@ -3102,37 +4431,6 @@ def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}", []>, XS;
//===---------------------------------------------------------------------===//
-// SSE2 - Misc Instructions
-//===---------------------------------------------------------------------===//
-
-// Flush cache
-def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
- TB, Requires<[HasSSE2]>;
-
-// Load, store, and memory fence
-def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
- "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
-def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
- "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
-def : Pat<(X86LFence), (LFENCE)>;
-def : Pat<(X86MFence), (MFENCE)>;
-
-
-// Pause. This "instruction" is encoded as "rep; nop", so even though it
-// was introduced with SSE2, it's backward compatible.
-def PAUSE : I<0x90, RawFrm, (outs), (ins), "pause", []>, REP;
-
-// Alias instructions that map zero vector to pxor / xorp* for sse.
-// We set canFoldAsLoad because this can be converted to a constant-pool
-// load of an all-ones value if folding it would be beneficial.
-let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1, ExeDomain = SSEPackedInt in
- // FIXME: Change encoding to pseudo.
- def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllOnesV))]>;
-
-//===---------------------------------------------------------------------===//
// SSE3 - Conversion Instructions
//===---------------------------------------------------------------------===//
@@ -3164,6 +4462,11 @@ def CVTPD2DQrm : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
def CVTPD2DQrr : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ (VCVTPD2DQYrr VR256:$src)>;
+def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+ (VCVTPD2DQYrm addr:$src)>;
+
// Convert Packed DW Integers to Packed Double FP
let Predicates = [HasAVX] in {
def VCVTDQ2PDrm : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
@@ -3192,41 +4495,74 @@ def : Pat<(int_x86_avx_cvt_pd2dq_256 VR256:$src),
def : Pat<(int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)),
(VCVTPD2DQYrm addr:$src)>;
+def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
+ (VCVTDQ2PDYrr VR128:$src)>;
+def : Pat<(v4f64 (sint_to_fp (memopv4i32 addr:$src))),
+ (VCVTDQ2PDYrm addr:$src)>;
+
//===---------------------------------------------------------------------===//
-// SSE3 - Move Instructions
+// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
-
-// Replicate Single FP
-multiclass sse3_replicate_sfp<bits<8> op, PatFrag rep_frag, string OpcodeStr> {
-def rr : S3SI<op, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+ ValueType vt, RegisterClass RC, PatFrag mem_frag,
+ X86MemOperand x86memop> {
+def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (rep_frag
- VR128:$src, (undef))))]>;
-def rm : S3SI<op, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ [(set RC:$dst, (vt (OpNode RC:$src)))]>;
+def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (rep_frag
- (memopv4f32 addr:$src), (undef)))]>;
+ [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>;
}
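// With the OpNode/vt/RC/mem_frag/x86memop parameters, one multiclass now
// covers both the 128-bit and 256-bit forms. For instance, the VMOVSHDUPY
// defm below expands to roughly (a sketch, VEX adornment omitted):
//
//   def VMOVSHDUPYrr : S3SI<0x16, MRMSrcReg, (outs VR256:$dst),
//                           (ins VR256:$src),
//                           "vmovshdup\t{$src, $dst|$dst, $src}",
//                           [(set VR256:$dst,
//                                 (v8f32 (X86Movshdup VR256:$src)))]>;
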
-multiclass sse3_replicate_sfp_y<bits<8> op, PatFrag rep_frag,
- string OpcodeStr> {
-def rr : S3SI<op, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
-def rm : S3SI<op, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+let Predicates = [HasAVX] in {
+ defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v4f32, VR128, memopv4f32, f128mem>, VEX;
+ defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v4f32, VR128, memopv4f32, f128mem>, VEX;
+ defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+ v8f32, VR256, memopv8f32, f256mem>, VEX;
+ defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+ v8f32, VR256, memopv8f32, f256mem>, VEX;
+}
+defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
+ memopv4f32, f128mem>;
+defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
+ memopv4f32, f128mem>;
+
+let Predicates = [HasSSE3] in {
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (MOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (MOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (MOVSLDUPrm addr:$src)>;
}
let Predicates = [HasAVX] in {
- // FIXME: Merge above classes when we have patterns for the ymm version
- defm VMOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "vmovshdup">, VEX;
- defm VMOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "vmovsldup">, VEX;
- defm VMOVSHDUPY : sse3_replicate_sfp_y<0x16, movshdup, "vmovshdup">, VEX;
- defm VMOVSLDUPY : sse3_replicate_sfp_y<0x12, movsldup, "vmovsldup">, VEX;
+ def : Pat<(v4i32 (X86Movshdup VR128:$src)),
+ (VMOVSHDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VMOVSHDUPrm addr:$src)>;
+ def : Pat<(v4i32 (X86Movsldup VR128:$src)),
+ (VMOVSLDUPrr VR128:$src)>;
+ def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+ (VMOVSLDUPrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movshdup VR256:$src)),
+ (VMOVSHDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
+ (VMOVSHDUPYrm addr:$src)>;
+ def : Pat<(v8i32 (X86Movsldup VR256:$src)),
+ (VMOVSLDUPYrr VR256:$src)>;
+ def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
+ (VMOVSLDUPYrm addr:$src)>;
}
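// The movshdup/movsldup instructions are defined on f32 vector types, so the
// integer forms of the shuffle are matched with the explicit patterns above;
// the load cases fold the memory operand directly, e.g.:
//
//   (v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src))))
//     --> MOVSHDUPrm addr:$src
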
-defm MOVSHDUP : sse3_replicate_sfp<0x16, movshdup, "movshdup">;
-defm MOVSLDUP : sse3_replicate_sfp<0x12, movsldup, "movsldup">;
-// Replicate Double FP
+//===---------------------------------------------------------------------===//
+// SSE3 - Replicate Double FP - MOVDDUP
+//===---------------------------------------------------------------------===//
+
multiclass sse3_replicate_dfp<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -3238,23 +4574,90 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
(undef))))]>;
}
+// FIXME: Merge with the above class when there are patterns for the ymm version
multiclass sse3_replicate_dfp_y<string OpcodeStr> {
-def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>;
-def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- []>;
+let Predicates = [HasAVX] in {
+ def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>;
+ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ []>;
+ }
+}
+
+defm MOVDDUP : sse3_replicate_dfp<"movddup">;
+defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
+defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+
+let Predicates = [HasSSE3] in {
+ def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+ (undef)),
+ (MOVDDUPrm addr:$src)>;
+ let AddedComplexity = 5 in {
+ def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
+ def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (MOVDDUPrm addr:$src)>;
+ def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+ (MOVDDUPrm addr:$src)>;
+ }
+ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
+ (MOVDDUPrm addr:$src)>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (MOVDDUPrm addr:$src)>;
}
let Predicates = [HasAVX] in {
- // FIXME: Merge above classes when we have patterns for the ymm version
- defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
- defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+ def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+ (undef)),
+ (VMOVDDUPrm addr:$src)>;
+ let AddedComplexity = 5 in {
+ def : Pat<(movddup (memopv2f64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
+ def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+ (VMOVDDUPrm addr:$src)>;
+ def : Pat<(movddup (memopv2i64 addr:$src), (undef)), (VMOVDDUPrm addr:$src)>;
+ def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+ (VMOVDDUPrm addr:$src)>;
+ }
+ def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+ def : Pat<(X86Movddup (bc_v2f64
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
+
+ // 256-bit version
+ def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4f64 (scalar_to_vector (loadf64 addr:$src)))),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
+ (VMOVDDUPYrm addr:$src)>;
+ def : Pat<(X86Movddup (v4f64 VR256:$src)),
+ (VMOVDDUPYrr VR256:$src)>;
+ def : Pat<(X86Movddup (v4i64 VR256:$src)),
+ (VMOVDDUPYrr VR256:$src)>;
}
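// All of the memory patterns above fold the load into the duplicate, so a
// scalar f64/i64 load that is immediately splatted never goes through a
// separate load instruction, e.g.:
//
//   (X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src))))
//     --> VMOVDDUPrm addr:$src
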
-defm MOVDDUP : sse3_replicate_dfp<"movddup">;
-// Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+// SSE3 - Move Unaligned Integer
+//===---------------------------------------------------------------------===//
+
let Predicates = [HasAVX] in {
def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
@@ -3267,38 +4670,6 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
-def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
- (undef)),
- (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-
-// Several Move patterns
-let AddedComplexity = 5 in {
-def : Pat<(movddup (memopv2f64 addr:$src), (undef)),
- (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
- (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-def : Pat<(movddup (memopv2i64 addr:$src), (undef)),
- (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
- (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
-}
-
-// vector_shuffle v1, <undef> <1, 1, 3, 3>
-let AddedComplexity = 15 in
-def : Pat<(v4i32 (movshdup VR128:$src, (undef))),
- (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
-let AddedComplexity = 20 in
-def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
- (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
-
-// vector_shuffle v1, <undef> <0, 0, 2, 2>
-let AddedComplexity = 15 in
- def : Pat<(v4i32 (movsldup VR128:$src, (undef))),
- (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
-let AddedComplexity = 20 in
- def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
- (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
-
//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//
@@ -3344,62 +4715,58 @@ let Constraints = "$src1 = $dst", Predicates = [HasSSE3],
// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic IntId, bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode, bit Is2Addr = 1> {
def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (IntId RC:$src1, RC:$src2)))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))]>;
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (IntId RC:$src1, (memop addr:$src2))))]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, (memop addr:$src2))))]>;
}
let Predicates = [HasAVX] in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- int_x86_sse3_hadd_ps, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V;
defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
- int_x86_sse3_hadd_pd, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- int_x86_sse3_hsub_ps, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V;
defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
- int_x86_sse3_hsub_pd, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- int_x86_avx_hadd_ps_256, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V;
defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- int_x86_avx_hadd_pd_256, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- int_x86_avx_hsub_ps_256, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V;
defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- int_x86_avx_hsub_pd_256, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V;
}
let Constraints = "$src1 = $dst" in {
- defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem,
- int_x86_sse3_hadd_ps>;
- defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem,
- int_x86_sse3_hadd_pd>;
- defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem,
- int_x86_sse3_hsub_ps>;
- defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem,
- int_x86_sse3_hsub_pd>;
+ defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>;
+ defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>;
+ defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>;
+ defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>;
}
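// Switching these horizontal ops from per-width intrinsics to the X86fhadd
// and X86fhsub target nodes lets one pattern serve every register class.
// The nodes are presumably declared along these lines (a sketch, following
// the usual X86InstrFragmentsSIMD.td conventions):
//
//   def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
//   def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
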
//===---------------------------------------------------------------------===//
@@ -3466,7 +4833,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
(bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
}
-let Predicates = [HasAVX] in {
+let ImmT = NoImm, Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VPHADDW : SS3I_binop_rm_int<0x01, "vphaddw", memopv8i16,
int_x86_ssse3_phadd_w_128, 0>, VEX_4V;
@@ -3525,17 +4892,33 @@ defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw", memopv8i16,
int_x86_ssse3_pmul_hr_sw_128>;
}
-def : Pat<(X86pshufb VR128:$src, VR128:$mask),
- (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
-def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
- (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
+let Predicates = [HasSSSE3] in {
+ def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+ (PSHUFBrr128 VR128:$src, VR128:$mask)>;
+ def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+ (PSHUFBrm128 VR128:$src, addr:$mask)>;
+
+ def : Pat<(X86psignb VR128:$src1, VR128:$src2),
+ (PSIGNBrr128 VR128:$src1, VR128:$src2)>;
+ def : Pat<(X86psignw VR128:$src1, VR128:$src2),
+ (PSIGNWrr128 VR128:$src1, VR128:$src2)>;
+ def : Pat<(X86psignd VR128:$src1, VR128:$src2),
+ (PSIGNDrr128 VR128:$src1, VR128:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+ (VPSHUFBrr128 VR128:$src, VR128:$mask)>;
+ def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+ (VPSHUFBrm128 VR128:$src, addr:$mask)>;
-def : Pat<(X86psignb VR128:$src1, VR128:$src2),
- (PSIGNBrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
-def : Pat<(X86psignw VR128:$src1, VR128:$src2),
- (PSIGNWrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
-def : Pat<(X86psignd VR128:$src1, VR128:$src2),
- (PSIGNDrr128 VR128:$src1, VR128:$src2)>, Requires<[HasSSSE3]>;
+ def : Pat<(X86psignb VR128:$src1, VR128:$src2),
+ (VPSIGNBrr128 VR128:$src1, VR128:$src2)>;
+ def : Pat<(X86psignw VR128:$src1, VR128:$src2),
+ (VPSIGNWrr128 VR128:$src1, VR128:$src2)>;
+ def : Pat<(X86psignd VR128:$src1, VR128:$src2),
+ (VPSIGNDrr128 VR128:$src1, VR128:$src2)>;
+}
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Align Instruction Patterns
@@ -3560,33 +4943,35 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
let Predicates = [HasAVX] in
defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
-let Constraints = "$src1 = $dst" in
+let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
defm PALIGN : ssse3_palign<"palignr">;
-let AddedComplexity = 5 in {
-def : Pat<(v4i32 (palign:$src3 VR128:$src1, VR128:$src2)),
- (PALIGNR128rr VR128:$src2, VR128:$src1,
- (SHUFFLE_get_palign_imm VR128:$src3))>,
- Requires<[HasSSSE3]>;
-def : Pat<(v4f32 (palign:$src3 VR128:$src1, VR128:$src2)),
- (PALIGNR128rr VR128:$src2, VR128:$src1,
- (SHUFFLE_get_palign_imm VR128:$src3))>,
- Requires<[HasSSSE3]>;
-def : Pat<(v8i16 (palign:$src3 VR128:$src1, VR128:$src2)),
- (PALIGNR128rr VR128:$src2, VR128:$src1,
- (SHUFFLE_get_palign_imm VR128:$src3))>,
- Requires<[HasSSSE3]>;
-def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)),
- (PALIGNR128rr VR128:$src2, VR128:$src1,
- (SHUFFLE_get_palign_imm VR128:$src3))>,
- Requires<[HasSSSE3]>;
+let Predicates = [HasSSSE3] in {
+def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+}
+
+let Predicates = [HasAVX] in {
+def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+ (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
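// PALIGNR concatenates its destination (high half) with its source (low
// half) and shifts the pair right by imm bytes, so the DAG operands are
// deliberately swapped when building the instruction:
//
//   (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))
//     --> PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm
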
//===---------------------------------------------------------------------===//
-// SSSE3 Misc Instructions
+// SSSE3 - Thread synchronization
//===---------------------------------------------------------------------===//
-// Thread synchronization
let usesCustomInserter = 1 in {
def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
[(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>;
@@ -3609,338 +4994,6 @@ def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
Requires<[In64BitMode]>;
-//===---------------------------------------------------------------------===//
-// Non-Instruction Patterns
-//===---------------------------------------------------------------------===//
-
-// extload f32 -> f64. This matches load+fextend because we have a hack in
-// the isel (PreprocessForFPConvert) that can introduce loads after dag
-// combine.
-// Since these loads aren't folded into the fextend, we have to match it
-// explicitly here.
-let Predicates = [HasSSE2] in
- def : Pat<(fextend (loadf32 addr:$src)),
- (CVTSS2SDrm addr:$src)>;
-
-// FIXME: According to the intel manual, DEST[127:64] <- SRC1[127:64], while
-// in the non-AVX version bits 127:64 aren't touched. Find a better way to
-// represent this instead of always zeroing SRC1. One possible solution is
-// to represent the instruction w/ something similar as the "$src1 = $dst"
-// constraint but without the tied operands.
-let Predicates = [HasAVX] in
- def : Pat<(fextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f32 (EXTRACT_SUBREG (AVX_SET0PS), sub_ss)),
- addr:$src)>;
-
-// bit_convert
-let Predicates = [HasXMMInt] in {
- def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
- def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
-}
-
-let Predicates = [HasAVX] in {
- def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
- def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
-}
-
-// Move scalar to XMM zero-extended
-// movd to XMM register zero-extends
-let AddedComplexity = 15 in {
-// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
-def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (MOVSDrr (v2f64 (V_SET0PS)), FR64:$src)>;
-def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
- (MOVSSrr (v4f32 (V_SET0PS)), FR32:$src)>;
-def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0PS)),
- (f32 (EXTRACT_SUBREG (v4f32 VR128:$src), sub_ss)))>;
-def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0PI)),
- (EXTRACT_SUBREG (v4i32 VR128:$src), sub_ss))>;
-}
-
-// Splat v2f64 / v2i64
-let AddedComplexity = 10 in {
-def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
- (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
-def : Pat<(unpckh (v2f64 VR128:$src), (undef)),
- (UNPCKHPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
-def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
- (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
-def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
- (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
-}
-
-// Special unary SHUFPSrri case.
-def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPSrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
-let AddedComplexity = 5 in
-def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
- (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
- Requires<[HasSSE2]>;
-// Special unary SHUFPDrri case.
-def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPDrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>,
- Requires<[HasSSE2]>;
-// Special unary SHUFPDrri case.
-def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
- (SHUFPDrri VR128:$src1, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>,
- Requires<[HasSSE2]>;
-// Unary v4f32 shuffle with PSHUF* in order to fold a load.
-def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
- (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
- Requires<[HasSSE2]>;
-
-// Special binary v4i32 shuffle cases with SHUFPS.
-def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
- (SHUFPSrri VR128:$src1, VR128:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>,
- Requires<[HasSSE2]>;
-def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
- (SHUFPSrmi VR128:$src1, addr:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>,
- Requires<[HasSSE2]>;
-// Special binary v2i64 shuffle cases using SHUFPDrri.
-def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
- (SHUFPDrri VR128:$src1, VR128:$src2,
- (SHUFFLE_get_shuf_imm VR128:$src3))>,
- Requires<[HasSSE2]>;
-
-// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
-let AddedComplexity = 15 in {
-def : Pat<(v4i32 (unpckl_undef:$src2 VR128:$src, (undef))),
- (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
- Requires<[OptForSpeed, HasSSE2]>;
-def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
- (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
- Requires<[OptForSpeed, HasSSE2]>;
-}
-let AddedComplexity = 10 in {
-def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
- (UNPCKLPSrr VR128:$src, VR128:$src)>;
-def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
- (PUNPCKLBWrr VR128:$src, VR128:$src)>;
-def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
- (PUNPCKLWDrr VR128:$src, VR128:$src)>;
-def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
- (PUNPCKLDQrr VR128:$src, VR128:$src)>;
-}
-
-// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
-let AddedComplexity = 15 in {
-def : Pat<(v4i32 (unpckh_undef:$src2 VR128:$src, (undef))),
- (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
- Requires<[OptForSpeed, HasSSE2]>;
-def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
- (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
- Requires<[OptForSpeed, HasSSE2]>;
-}
-let AddedComplexity = 10 in {
-def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
- (UNPCKHPSrr VR128:$src, VR128:$src)>;
-def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
- (PUNPCKHBWrr VR128:$src, VR128:$src)>;
-def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
- (PUNPCKHWDrr VR128:$src, VR128:$src)>;
-def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
- (PUNPCKHDQrr VR128:$src, VR128:$src)>;
-}
-
-let AddedComplexity = 20 in {
-// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
-def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
-def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
- (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
-def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
- (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
- (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-}
-
-let AddedComplexity = 20 in {
-// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
-def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
-}
-
-// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
-def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
- addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
-
-let AddedComplexity = 15 in {
-// Setting the lowest element in the vector.
-def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-
-// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
-def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
- Requires<[HasSSE2]>;
-def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
- Requires<[HasSSE2]>;
-}
-
-// vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
-// fall back to this for SSE1)
-def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
- (SHUFPSrri VR128:$src2, VR128:$src1,
- (SHUFFLE_get_shuf_imm VR128:$src3))>;
-
-// Set lowest element and zero upper elements.
-def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
- (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
-
-// vector -> vector casts
-def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
- (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
- (CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
-
-// Use movaps / movups for SSE integer load / store (one byte shorter).
-// The instructions selected below are then converted to MOVDQA/MOVDQU
-// during the SSE domain pass.
-let Predicates = [HasSSE1] in {
- def : Pat<(alignedloadv4i32 addr:$src),
- (MOVAPSrm addr:$src)>;
- def : Pat<(loadv4i32 addr:$src),
- (MOVUPSrm addr:$src)>;
- def : Pat<(alignedloadv2i64 addr:$src),
- (MOVAPSrm addr:$src)>;
- def : Pat<(loadv2i64 addr:$src),
- (MOVUPSrm addr:$src)>;
-
- def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
- (MOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v2i64 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v4i32 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v8i16 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v16i8 VR128:$src), addr:$dst),
- (MOVUPSmr addr:$dst, VR128:$src)>;
-}
-
-// Use vmovaps/vmovups for AVX integer load/store.
-let Predicates = [HasAVX] in {
- // 128-bit load/store
- def : Pat<(alignedloadv4i32 addr:$src),
- (VMOVAPSrm addr:$src)>;
- def : Pat<(loadv4i32 addr:$src),
- (VMOVUPSrm addr:$src)>;
- def : Pat<(alignedloadv2i64 addr:$src),
- (VMOVAPSrm addr:$src)>;
- def : Pat<(loadv2i64 addr:$src),
- (VMOVUPSrm addr:$src)>;
-
- def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
- (VMOVAPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v2i64 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v4i32 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v8i16 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
- def : Pat<(store (v16i8 VR128:$src), addr:$dst),
- (VMOVUPSmr addr:$dst, VR128:$src)>;
-
- // 256-bit load/store
- def : Pat<(alignedloadv4i64 addr:$src),
- (VMOVAPSYrm addr:$src)>;
- def : Pat<(loadv4i64 addr:$src),
- (VMOVUPSYrm addr:$src)>;
- def : Pat<(alignedloadv8i32 addr:$src),
- (VMOVAPSYrm addr:$src)>;
- def : Pat<(loadv8i32 addr:$src),
- (VMOVUPSYrm addr:$src)>;
- def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
- (VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v4i64 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(store (v8i32 VR256:$src), addr:$dst),
- (VMOVUPSYmr addr:$dst, VR256:$src)>;
-}
-
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Move with Sign/Zero Extend
//===----------------------------------------------------------------------===//
@@ -3979,36 +5032,71 @@ defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
-// Common patterns involving scalar load.
-def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
- (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
- (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+let Predicates = [HasSSE41] in {
+ // Common patterns involving scalar load.
+ def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+ (PMOVSXBWrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+ (PMOVSXWDrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+ (PMOVSXDQrm addr:$src)>;
-def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
- (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
- (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+ def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+ (PMOVZXBWrm addr:$src)>;
-def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
- (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
- (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+ def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+ (PMOVZXWDrm addr:$src)>;
-def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
- (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
- (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+ def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+ (PMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+ // Common patterns involving scalar load.
+ def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+ (VPMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+ (VPMOVSXBWrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+ (VPMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+ (VPMOVSXWDrm addr:$src)>;
-def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
- (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
- (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+ def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+ (VPMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+ (VPMOVSXDQrm addr:$src)>;
-def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
- (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
- (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+ def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+ (VPMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+ (VPMOVZXBWrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+ (VPMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
+ (VPMOVZXWDrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+ (VPMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+ (VPMOVZXDQrm addr:$src)>;
+}
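// The vzmovl_v2i64/vzload_v2i64 fragments match a scalar load that has been
// zero-extended into a vector, so a pmovsx/pmovzx fed from memory selects a
// single extending load instead of a movq followed by an extend, e.g.:
//
//   (int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src))
//     --> VPMOVSXBWrm addr:$src      // under HasAVX
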
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
@@ -4039,17 +5127,31 @@ defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
-// Common patterns involving scalar load
-def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
- (PMOVSXBDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
- (PMOVSXWQrm addr:$src)>, Requires<[HasSSE41]>;
+let Predicates = [HasSSE41] in {
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVSXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVSXWQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+ (PMOVZXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+ (PMOVZXWQrm addr:$src)>;
+}
-def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
- (PMOVZXBDrm addr:$src)>, Requires<[HasSSE41]>;
-def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
- (PMOVZXWQrm addr:$src)>, Requires<[HasSSE41]>;
+let Predicates = [HasAVX] in {
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+ (VPMOVSXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+ (VPMOVSXWQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+ (VPMOVZXBDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+ (VPMOVZXWQrm addr:$src)>;
+}
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -4073,16 +5175,31 @@ defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
-// Common patterns involving scalar load
-def : Pat<(int_x86_sse41_pmovsxbq
- (bitconvert (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
- (PMOVSXBQrm addr:$src)>, Requires<[HasSSE41]>;
+let Predicates = [HasSSE41] in {
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXBQrm addr:$src)>;
+
+ def : Pat<(int_x86_sse41_pmovzxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXBQrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
+ // Common patterns involving scalar load
+ def : Pat<(int_x86_sse41_pmovsxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXBQrm addr:$src)>;
-def : Pat<(int_x86_sse41_pmovzxbq
- (bitconvert (v4i32 (X86vzmovl
- (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
- (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>;
+ def : Pat<(int_x86_sse41_pmovzxbq
+ (bitconvert (v4i32 (X86vzmovl
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVZXBQrm addr:$src)>;
+}
//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
@@ -4208,7 +5325,12 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
imm:$src2))),
addr:$dst),
(EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[HasSSE41]>;
+ Requires<[HasSSE41]>;
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+ imm:$src2))),
+ addr:$dst),
+ (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+ Requires<[HasAVX]>;
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
@@ -4297,7 +5419,7 @@ let Constraints = "$src1 = $dst" in
// in the target vector.
multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+ (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
!if(Is2Addr,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
@@ -4306,7 +5428,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
(X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
OpSize;
def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
+ (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
!if(Is2Addr,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
@@ -4348,7 +5470,7 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
// Vector intrinsic operation, mem
def PSm : Ii8<opcps, MRMSrcMem,
- (outs RC:$dst), (ins f256mem:$src1, i32i8imm:$src2),
+ (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
@@ -4366,7 +5488,7 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
// Vector intrinsic operation, mem
def PDm : SS4AIi8<opcpd, MRMSrcMem,
- (outs RC:$dst), (ins f256mem:$src1, i32i8imm:$src2),
+ (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
@@ -4501,14 +5623,14 @@ let Predicates = [HasAVX] in {
int_x86_avx_round_pd_256>, VEX;
defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
- int_x86_sse41_round_sd, 0>, VEX_4V;
+ int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
// Instructions for the assembler
defm VROUND : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">,
VEX;
defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">,
VEX;
- defm VROUND : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V;
+ defm VROUND : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
}
defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
@@ -4578,26 +5700,34 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>;
// SSE4.1 - Misc Instructions
//===----------------------------------------------------------------------===//
-def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
- "popcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctpop GR16:$src))]>, OpSize, XS;
-def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
- "popcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctpop (loadi16 addr:$src)))]>, OpSize, XS;
-
-def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "popcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctpop GR32:$src))]>, XS;
-def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "popcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctpop (loadi32 addr:$src)))]>, XS;
-
-def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "popcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctpop GR64:$src))]>, XS;
-def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "popcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctpop (loadi64 addr:$src)))]>, XS;
+let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
+ def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+ OpSize, XS;
+ def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+ "popcnt{w}\t{$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
+ (implicit EFLAGS)]>, OpSize, XS;
+
+ def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+ XS;
+ def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "popcnt{l}\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+
+ def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+ XS;
+ def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "popcnt{q}\t{$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
+ (implicit EFLAGS)]>, XS;
+}
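// Defs = [EFLAGS] together with the (implicit EFLAGS) result records that
// popcnt clobbers the flags, so later passes cannot reorder a live compare
// across it. The general idiom for a flag-writing instruction is the
// following sketch (FOOrr and its opcode are hypothetical):
//
//   let Defs = [EFLAGS] in
//   def FOOrr : I<0x00, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "foo",
//                 [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>;
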
@@ -4666,6 +5796,11 @@ let Predicates = [HasAVX] in {
0>, VEX_4V;
defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
0>, VEX_4V;
+
+ def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
+ (VPCMPEQQrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
+ (VPCMPEQQrm VR128:$src1, addr:$src2)>;
}
let Constraints = "$src1 = $dst" in {
@@ -4720,7 +5855,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
X86MemOperand x86memop, bit Is2Addr = 1> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, i32i8imm:$src3),
+ (ins RC:$src1, RC:$src2, u32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -4729,7 +5864,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
[(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
OpSize;
def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, x86memop:$src2, i32i8imm:$src3),
+ (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
!if(Is2Addr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
@@ -4815,6 +5950,36 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem,
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem,
memopv32i8, int_x86_avx_blendv_ps_256>;
+let Predicates = [HasAVX] in {
+ def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
+ def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+ (v8i32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
+ (v8f32 VR256:$src2))),
+ (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+ (v4i64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+ def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
+ (v4f64 VR256:$src2))),
+ (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
+}
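// vselect yields $src1 where the mask is set, while BLENDV takes its result
// from the second source operand where the mask is set, so the sources are
// swapped when building the instruction:
//
//   (vselect VR128:$mask, VR128:$src1, VR128:$src2)
//     --> VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask
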
+
/// SS41I_ternary_int - SSE 4.1 ternary operator
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
@@ -4835,12 +6000,27 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
}
}
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
-
-def : Pat<(X86pblendv VR128:$src1, VR128:$src2, XMM0),
- (PBLENDVBrr0 VR128:$src1, VR128:$src2)>;
+defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
+defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
+defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
+
+let Predicates = [HasSSE41] in {
+ def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
+ (v16i8 VR128:$src2))),
+ (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
+ (v4i32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
+ (v4f32 VR128:$src2))),
+ (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
+ (v2i64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+ def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
+ (v2f64 VR128:$src2))),
+ (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
+}
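// The non-VEX blendv forms read their mask implicitly from XMM0 (hence the
// Uses = [XMM0] on the defining multiclass and the rr0 opcode names); the
// same source swap as in the AVX patterns applies:
//
//   (vselect (v4i32 XMM0), VR128:$src1, VR128:$src2)
//     --> BLENDVPSrr0 VR128:$src2, VR128:$src1
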
let Predicates = [HasAVX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -4876,9 +6056,16 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
(bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
}
-let Predicates = [HasAVX] in
+let Predicates = [HasAVX] in {
defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq,
0>, VEX_4V;
+
+ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
+ (VPCMPGTQrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
+ (VPCMPGTQrm VR128:$src1, addr:$src2)>;
+}
+
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
@@ -5158,22 +6345,43 @@ let Constraints = "$src1 = $dst" in {
int_x86_aesni_aesdeclast>;
}
-def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
- (AESENCrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
- (AESENCrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
- (AESENCLASTrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
- (AESENCLASTrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
- (AESDECrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
- (AESDECrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
- (AESDECLASTrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
- (AESDECLASTrm VR128:$src1, addr:$src2)>;
+let Predicates = [HasAES] in {
+ def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
+ (AESENCrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
+ (AESENCrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
+ (AESENCLASTrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
+ (AESENCLASTrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
+ (AESDECrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
+ (AESDECrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
+ (AESDECLASTrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
+ (AESDECLASTrm VR128:$src1, addr:$src2)>;
+}
+
+let Predicates = [HasAVX, HasAES], AddedComplexity = 20 in {
+ def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
+ (VAESENCrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
+ (VAESENCrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
+ (VAESENCLASTrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
+ (VAESENCLASTrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
+ (VAESDECrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
+ (VAESDECrm VR128:$src1, addr:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
+ (VAESDECLASTrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
+ (VAESDECLASTrm VR128:$src1, addr:$src2)>;
+}
// Perform the AES InvMixColumn Transformation
let Predicates = [HasAVX, HasAES] in {
@@ -5288,8 +6496,10 @@ defm : pclmul_alias<"lqlq", 0x00>;
// AVX Instructions
//===----------------------------------------------------------------------===//
-
-// Load from memory and broadcast to all elements of the destination operand
+//===----------------------------------------------------------------------===//
+// VBROADCAST - Load from memory and broadcast to all elements of the
+// destination operand
+//
class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int> :
AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
@@ -5305,7 +6515,26 @@ def VBROADCASTSD : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256>;
-// Insert packed floating-point values
+def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
+ (VBROADCASTF128 addr:$src)>;
+
+def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VBROADCASTSD addr:$src)>;
+def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSSY addr:$src)>;
+def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VBROADCASTSD addr:$src)>;
+
+def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
+ (VBROADCASTSS addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSS addr:$src)>;
+
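For illustration (not part of this patch): these load-and-broadcast patterns back the AVX broadcast intrinsics. A minimal sketch, assuming an AVX-capable compiler (-mavx); the name is illustrative:

    #include <immintrin.h>

    // Load one float and replicate it into all eight lanes (vbroadcastss).
    __m256 splat8(const float *p) {
        return _mm256_broadcast_ss(p);
    }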
+//===----------------------------------------------------------------------===//
+// VINSERTF128 - Insert packed floating-point values
+//
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -5315,7 +6544,41 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[]>, VEX_4V;
-// Extract packed floating-point values
+def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3),
+ (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3),
+ (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
+def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3),
+ (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+ (i32 imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+ (i32 imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (i32 imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+ (i32 imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (i32 imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (i32 imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
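Again for illustration only, a sketch of the intrinsic these insert patterns serve, assuming -mavx; the name is illustrative:

    #include <immintrin.h>

    // Replace the upper 128-bit half of a 256-bit vector (vinsertf128).
    __m256 insert_high(__m256 a, __m128 b) {
        return _mm256_insertf128_ps(a, b, 1);
    }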
+//===----------------------------------------------------------------------===//
+// VEXTRACTF128 - Extract packed floating-point values
+//
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
(ins VR256:$src1, i8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -5325,7 +6588,41 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, VEX;
-// Conditional SIMD Packed Loads and Stores
+def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
+ (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
+def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
+ (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
+def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
+ (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
+
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v4f32 (VEXTRACTF128rr
+ (v8f32 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v2f64 (VEXTRACTF128rr
+ (v4f64 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v4i32 (VEXTRACTF128rr
+ (v8i32 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v2i64 (VEXTRACTF128rr
+ (v4i64 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v8i16 (VEXTRACTF128rr
+ (v16i16 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v16i8 (VEXTRACTF128rr
+ (v32i8 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+//===----------------------------------------------------------------------===//
+// VMASKMOV - Conditional SIMD Packed Loads and Stores
+//
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
Intrinsic IntLd, Intrinsic IntLd256,
Intrinsic IntSt, Intrinsic IntSt256,
@@ -5363,7 +6660,9 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
int_x86_avx_maskstore_pd_256,
memopv2f64, memopv4f64>;
-// Permute Floating-Point Values
+//===----------------------------------------------------------------------===//
+// VPERMIL - Permute Single and Double Floating-Point Values
+//
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop_f,
X86MemOperand x86memop_i, PatFrag f_frag, PatFrag i_frag,
@@ -5404,6 +6703,18 @@ defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
int_x86_avx_vpermilvar_pd_256,
int_x86_avx_vpermil_pd_256>;
+def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4f64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPDYri VR256:$src1, imm:$imm)>;
+def : Pat<(v8i32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPSYri VR256:$src1, imm:$imm)>;
+def : Pat<(v4i64 (X86VPermilpdy VR256:$src1, (i8 imm:$imm))),
+ (VPERMILPDYri VR256:$src1, imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
+//
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -5413,65 +6724,6 @@ def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[]>, VEX_4V;
-// Zero All YMM registers
-def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)]>, VEX, VEX_L, Requires<[HasAVX]>;
-
-// Zero Upper bits of YMM registers
-def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)]>, VEX, Requires<[HasAVX]>;
-
-def : Pat<(int_x86_avx_vinsertf128_pd_256 VR256:$src1, VR128:$src2, imm:$src3),
- (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vinsertf128_ps_256 VR256:$src1, VR128:$src2, imm:$src3),
- (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-def : Pat<(int_x86_avx_vinsertf128_si_256 VR256:$src1, VR128:$src2, imm:$src3),
- (VINSERTF128rr VR256:$src1, VR128:$src2, imm:$src3)>;
-
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-
-def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
- (v4f32 (VEXTRACTF128rr
- (v8f32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
- (v2f64 (VEXTRACTF128rr
- (v4f64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
- (v4i32 (VEXTRACTF128rr
- (v8i32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
- (v2i64 (VEXTRACTF128rr
- (v4i64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-
-def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
- (VBROADCASTF128 addr:$src)>;
-
def : Pat<(int_x86_avx_vperm2f128_ps_256 VR256:$src1, VR256:$src2, imm:$src3),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$src3)>;
def : Pat<(int_x86_avx_vperm2f128_pd_256 VR256:$src1, VR256:$src2, imm:$src3),
@@ -5489,377 +6741,59 @@ def : Pat<(int_x86_avx_vperm2f128_si_256
VR256:$src1, (memopv8i32 addr:$src2), imm:$src3),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$src3)>;
+def : Pat<(v8f32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v8i32 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4i64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v32i8 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v16i16 (X86VPerm2f128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+
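Not part of the patch; a minimal sketch of what these vperm2f128 patterns enable at the C level, assuming -mavx:

    #include <immintrin.h>

    // Swap the two 128-bit halves of a 256-bit vector: selector 0x01
    // picks src1's high half for the low lane and src1's low half for
    // the high lane.
    __m256 swap_halves(__m256 a) {
        return _mm256_permute2f128_ps(a, a, 0x01);
    }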
//===----------------------------------------------------------------------===//
-// SSE Shuffle pattern fragments
-//===----------------------------------------------------------------------===//
+// VZERO - Zero YMM registers
+//
+let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+ YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
+ // Zero All YMM registers
+ def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
+ [(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>;
-// This is part of a "work in progress" refactoring. The idea is that all
-// vector shuffles are going to be translated into target specific nodes and
-// directly matched by the patterns below (which can be changed along the way)
-// The AVX version of some but not all of them are described here, and more
-// should come in a near future.
-
-// Shuffle with PSHUFD instruction folding loads. The first two patterns match
-// SSE2 loads, which are always promoted to v2i64. The last one should match
-// the SSE1 case, where the only legal load is v4f32, but there is no PSHUFD
-// in SSE2, how does it ever worked? Anyway, the pattern will remain here until
-// we investigate further.
-def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
- (i8 imm:$imm))),
- (VPSHUFDmi addr:$src1, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv2i64 addr:$src1)),
- (i8 imm:$imm))),
- (PSHUFDmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i32 (X86PShufd (bc_v4i32 (memopv4f32 addr:$src1)),
- (i8 imm:$imm))),
- (PSHUFDmi addr:$src1, imm:$imm)>; // FIXME: has this ever worked?
-
-// Shuffle with PSHUFD instruction.
-def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (PSHUFDri VR128:$src1, imm:$imm)>;
-
-def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (VPSHUFDri VR128:$src1, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (PSHUFDri VR128:$src1, imm:$imm)>;
-
-// Shuffle with SHUFPD instruction.
-def : Pat<(v2f64 (X86Shufps VR128:$src1,
- (memopv2f64 addr:$src2), (i8 imm:$imm))),
- (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Shufps VR128:$src1,
- (memopv2f64 addr:$src2), (i8 imm:$imm))),
- (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
-
-def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v2i64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Shufpd VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-// Shuffle with SHUFPS instruction.
-def : Pat<(v4f32 (X86Shufps VR128:$src1,
- (memopv4f32 addr:$src2), (i8 imm:$imm))),
- (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Shufps VR128:$src1,
- (memopv4f32 addr:$src2), (i8 imm:$imm))),
- (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
-
-def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-def : Pat<(v4i32 (X86Shufps VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
- (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86Shufps VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
- (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
-
-def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>, Requires<[HasAVX]>;
-def : Pat<(v4i32 (X86Shufps VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
-// Shuffle with MOVHLPS instruction
-def : Pat<(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)),
- (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
- (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with MOVDDUP instruction
-def : Pat<(X86Movddup (memopv2f64 addr:$src)),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (memopv2f64 addr:$src)),
- (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
- (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
- (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (v2f64 (scalar_to_vector (loadf64 addr:$src)))),
- (MOVDDUPrm addr:$src)>;
-
-def : Pat<(X86Movddup (bc_v2f64
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-def : Pat<(X86Movddup (bc_v2f64
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
- (MOVDDUPrm addr:$src)>;
-
-
-// Shuffle with UNPCKLPS
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
- (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))),
- (VUNPCKLPSYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
- (UNPCKLPSrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
- (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
- (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
- (UNPCKLPSrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with UNPCKHPS
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
- (VUNPCKHPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, (memopv4f32 addr:$src2))),
- (UNPCKHPSrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
- (VUNPCKHPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
- (UNPCKHPSrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with UNPCKLPD
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
- (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))),
- (VUNPCKLPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
- (UNPCKLPDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
- (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
- (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
- (UNPCKLPDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with UNPCKHPD
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
- (VUNPCKHPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, (memopv2f64 addr:$src2))),
- (UNPCKHPDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
- (VUNPCKHPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
-def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)),
- (UNPCKHPDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLBW
-def : Pat<(v16i8 (X86Punpcklbw VR128:$src1,
- (bc_v16i8 (memopv2i64 addr:$src2)))),
- (PUNPCKLBWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (X86Punpcklbw VR128:$src1, VR128:$src2)),
- (PUNPCKLBWrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLWD
-def : Pat<(v8i16 (X86Punpcklwd VR128:$src1,
- (bc_v8i16 (memopv2i64 addr:$src2)))),
- (PUNPCKLWDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86Punpcklwd VR128:$src1, VR128:$src2)),
- (PUNPCKLWDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLDQ
-def : Pat<(v4i32 (X86Punpckldq VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (PUNPCKLDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86Punpckldq VR128:$src1, VR128:$src2)),
- (PUNPCKLDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKLQDQ
-def : Pat<(v2i64 (X86Punpcklqdq VR128:$src1, (memopv2i64 addr:$src2))),
- (PUNPCKLQDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)),
- (PUNPCKLQDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHBW
-def : Pat<(v16i8 (X86Punpckhbw VR128:$src1,
- (bc_v16i8 (memopv2i64 addr:$src2)))),
- (PUNPCKHBWrm VR128:$src1, addr:$src2)>;
-def : Pat<(v16i8 (X86Punpckhbw VR128:$src1, VR128:$src2)),
- (PUNPCKHBWrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHWD
-def : Pat<(v8i16 (X86Punpckhwd VR128:$src1,
- (bc_v8i16 (memopv2i64 addr:$src2)))),
- (PUNPCKHWDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v8i16 (X86Punpckhwd VR128:$src1, VR128:$src2)),
- (PUNPCKHWDrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHDQ
-def : Pat<(v4i32 (X86Punpckhdq VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))),
- (PUNPCKHDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86Punpckhdq VR128:$src1, VR128:$src2)),
- (PUNPCKHDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with PUNPCKHQDQ
-def : Pat<(v2i64 (X86Punpckhqdq VR128:$src1, (memopv2i64 addr:$src2))),
- (PUNPCKHQDQrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)),
- (PUNPCKHQDQrr VR128:$src1, VR128:$src2)>;
-
-// Shuffle with MOVLHPS
-def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
- (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
-
-// FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(v2f64 (X86Movddup VR128:$src)),
- (UNPCKLPDrr VR128:$src, VR128:$src)>;
-
-// Shuffle with MOVLHPD
-def : Pat<(v2f64 (X86Movlhpd VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (MOVHPDrm VR128:$src1, addr:$src2)>;
-
-// FIXME: Instead of X86Unpcklpd, there should be a X86Movlhpd here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(v2f64 (X86Unpcklpd VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (MOVHPDrm VR128:$src1, addr:$src2)>;
-
-// Shuffle with MOVSS
-def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
- (MOVSSrr VR128:$src1, FR32:$src2)>;
-def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4i32 VR128:$src1),
- (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
-def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr (v4f32 VR128:$src1),
- (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_ss))>;
-// FIXME: Instead of a X86Movss there should be a X86Movlps here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(X86Movss VR128:$src1,
- (bc_v4i32 (v2i64 (load addr:$src2)))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
-
-// Shuffle with MOVSD
-def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
- (MOVSDrr VR128:$src1, FR64:$src2)>;
-def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2i64 VR128:$src1),
- (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
-def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr (v2f64 VR128:$src1),
- (EXTRACT_SUBREG (v2f64 VR128:$src2), sub_sd))>;
-def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
-
-// Shuffle with MOVSHDUP
-def : Pat<(v4i32 (X86Movshdup VR128:$src)),
- (MOVSHDUPrr VR128:$src)>;
-def : Pat<(X86Movshdup (bc_v4i32 (memopv2i64 addr:$src))),
- (MOVSHDUPrm addr:$src)>;
-
-def : Pat<(v4f32 (X86Movshdup VR128:$src)),
- (MOVSHDUPrr VR128:$src)>;
-def : Pat<(X86Movshdup (memopv4f32 addr:$src)),
- (MOVSHDUPrm addr:$src)>;
-
-// Shuffle with MOVSLDUP
-def : Pat<(v4i32 (X86Movsldup VR128:$src)),
- (MOVSLDUPrr VR128:$src)>;
-def : Pat<(X86Movsldup (bc_v4i32 (memopv2i64 addr:$src))),
- (MOVSLDUPrm addr:$src)>;
-
-def : Pat<(v4f32 (X86Movsldup VR128:$src)),
- (MOVSLDUPrr VR128:$src)>;
-def : Pat<(X86Movsldup (memopv4f32 addr:$src)),
- (MOVSLDUPrm addr:$src)>;
-
-// Shuffle with PSHUFHW
-def : Pat<(v8i16 (X86PShufhw VR128:$src, (i8 imm:$imm))),
- (PSHUFHWri VR128:$src, imm:$imm)>;
-def : Pat<(v8i16 (X86PShufhw (bc_v8i16 (memopv2i64 addr:$src)), (i8 imm:$imm))),
- (PSHUFHWmi addr:$src, imm:$imm)>;
-
-// Shuffle with PSHUFLW
-def : Pat<(v8i16 (X86PShuflw VR128:$src, (i8 imm:$imm))),
- (PSHUFLWri VR128:$src, imm:$imm)>;
-def : Pat<(v8i16 (X86PShuflw (bc_v8i16 (memopv2i64 addr:$src)), (i8 imm:$imm))),
- (PSHUFLWmi addr:$src, imm:$imm)>;
-
-// Shuffle with PALIGN
-def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
+ // Zero Upper bits of YMM registers
+ def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
+ [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>;
+}
-// Shuffle with MOVLPS
-def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(X86Movlps VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
-// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
-// is during lowering, where it's not possible to recognize the load fold cause
-// it has two uses through a bitcast. One use disappears at isel time and the
-// fold opportunity reappears.
-def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-
-def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
-
-// Shuffle with MOVLPD
-def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2f64 (X86Movlpd VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
-
-// Extra patterns to match stores with MOVHPS/PD and MOVLPS/PD
-def : Pat<(store (f64 (vector_extract
- (v2f64 (X86Unpckhps VR128:$src, (undef))), (iPTR 0))),addr:$dst),
- (MOVHPSmr addr:$dst, VR128:$src)>;
-def : Pat<(store (f64 (vector_extract
- (v2f64 (X86Unpckhpd VR128:$src, (undef))), (iPTR 0))),addr:$dst),
- (MOVHPDmr addr:$dst, VR128:$src)>;
-
-def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v4i32 (X86Movlps
- (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
-
-def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
-def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//
+let Predicates = [HasAVX, HasF16C] in {
+ def VCVTPH2PSrm : I<0x13, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+ def VCVTPH2PSrr : I<0x13, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+ def VCVTPH2PSYrm : I<0x13, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+ def VCVTPH2PSYrr : I<0x13, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+ "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8, OpSize, VEX;
+ def VCVTPS2PHmr : Ii8<0x1D, MRMDestMem, (outs f64mem:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TA, OpSize, VEX;
+ def VCVTPS2PHrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TA, OpSize, VEX;
+ def VCVTPS2PHYmr : Ii8<0x1D, MRMDestMem, (outs f128mem:$dst),
+ (ins VR256:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TA, OpSize, VEX;
+ def VCVTPS2PHYrr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
+ (ins VR256:$src1, i32i8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ TA, OpSize, VEX;
+}
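For illustration (not part of this patch): the F16C instructions defined above back the half-precision conversion intrinsics. A minimal sketch, assuming an F16C-capable compiler (-mf16c); the name is illustrative:

    #include <immintrin.h>

    // Round four floats to half precision and widen them back
    // (vcvtps2ph followed by vcvtph2ps).
    __m128 roundtrip_half(__m128 x) {
        __m128i h = _mm_cvtps_ph(x, _MM_FROUND_TO_NEAREST_INT |
                                    _MM_FROUND_NO_EXC);
        return _mm_cvtph_ps(h);
    }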
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 31de878..05a5b36 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -67,43 +67,43 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", []>,
//
let Defs = [AL], Uses = [DX] in
def IN8rr : I<0xEC, RawFrm, (outs), (ins),
- "in{b}\t{%dx, %al|%AL, %DX}", []>;
+ "in{b}\t{%dx, %al|AL, DX}", []>;
let Defs = [AX], Uses = [DX] in
def IN16rr : I<0xED, RawFrm, (outs), (ins),
- "in{w}\t{%dx, %ax|%AX, %DX}", []>, OpSize;
+ "in{w}\t{%dx, %ax|AX, DX}", []>, OpSize;
let Defs = [EAX], Uses = [DX] in
def IN32rr : I<0xED, RawFrm, (outs), (ins),
- "in{l}\t{%dx, %eax|%EAX, %DX}", []>;
+ "in{l}\t{%dx, %eax|EAX, DX}", []>;
let Defs = [AL] in
def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
- "in{b}\t{$port, %al|%AL, $port}", []>;
+ "in{b}\t{$port, %al|AL, $port}", []>;
let Defs = [AX] in
def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{w}\t{$port, %ax|%AX, $port}", []>, OpSize;
+ "in{w}\t{$port, %ax|AX, $port}", []>, OpSize;
let Defs = [EAX] in
def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
- "in{l}\t{$port, %eax|%EAX, $port}", []>;
+ "in{l}\t{$port, %eax|EAX, $port}", []>;
let Uses = [DX, AL] in
def OUT8rr : I<0xEE, RawFrm, (outs), (ins),
- "out{b}\t{%al, %dx|%DX, %AL}", []>;
+ "out{b}\t{%al, %dx|DX, AL}", []>;
let Uses = [DX, AX] in
def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
- "out{w}\t{%ax, %dx|%DX, %AX}", []>, OpSize;
+ "out{w}\t{%ax, %dx|DX, AX}", []>, OpSize;
let Uses = [DX, EAX] in
def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
- "out{l}\t{%eax, %dx|%DX, %EAX}", []>;
+ "out{l}\t{%eax, %dx|DX, EAX}", []>;
let Uses = [AL] in
def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
- "out{b}\t{%al, $port|$port, %AL}", []>;
+ "out{b}\t{%al, $port|$port, AL}", []>;
let Uses = [AX] in
def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{w}\t{%ax, $port|$port, %AX}", []>, OpSize;
+ "out{w}\t{%ax, $port|$port, AX}", []>, OpSize;
let Uses = [EAX] in
def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
- "out{l}\t{%eax, $port|$port, %EAX}", []>;
+ "out{l}\t{%eax, $port|$port, EAX}", []>;
def IN8 : I<0x6C, RawFrm, (outs), (ins), "ins{b}", []>;
def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", []>, OpSize;
@@ -229,65 +229,65 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
"ltr{w}\t{$src}", []>, TB;
def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
- "push{w}\t%cs", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%cs|CS}", []>, Requires<[In32BitMode]>, OpSize;
def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
- "push{l}\t%cs", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%cs|CS}", []>, Requires<[In32BitMode]>;
def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
- "push{w}\t%ss", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%ss|SS}", []>, Requires<[In32BitMode]>, OpSize;
def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
- "push{l}\t%ss", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%ss|SS}", []>, Requires<[In32BitMode]>;
def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
- "push{w}\t%ds", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%ds|DS}", []>, Requires<[In32BitMode]>, OpSize;
def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
- "push{l}\t%ds", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%ds|DS}", []>, Requires<[In32BitMode]>;
def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
- "push{w}\t%es", []>, Requires<[In32BitMode]>, OpSize;
+ "push{w}\t{%es|ES}", []>, Requires<[In32BitMode]>, OpSize;
def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
- "push{l}\t%es", []>, Requires<[In32BitMode]>;
+ "push{l}\t{%es|ES}", []>, Requires<[In32BitMode]>;
def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
- "push{w}\t%fs", []>, OpSize, TB;
+ "push{w}\t{%fs|FS}", []>, OpSize, TB;
def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
- "push{l}\t%fs", []>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%fs|FS}", []>, TB, Requires<[In32BitMode]>;
def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
- "push{w}\t%gs", []>, OpSize, TB;
+ "push{w}\t{%gs|GS}", []>, OpSize, TB;
def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
- "push{l}\t%gs", []>, TB, Requires<[In32BitMode]>;
+ "push{l}\t{%gs|GS}", []>, TB, Requires<[In32BitMode]>;
def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
- "push{q}\t%fs", []>, TB;
+ "push{q}\t{%fs|FS}", []>, TB;
def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
- "push{q}\t%gs", []>, TB;
+ "push{q}\t{%gs|GS}", []>, TB;
// No "pop cs" instruction.
def POPSS16 : I<0x17, RawFrm, (outs), (ins),
- "pop{w}\t%ss", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%ss|SS}", []>, OpSize, Requires<[In32BitMode]>;
def POPSS32 : I<0x17, RawFrm, (outs), (ins),
- "pop{l}\t%ss", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%ss|SS}", []> , Requires<[In32BitMode]>;
def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
- "pop{w}\t%ds", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%ds|DS}", []>, OpSize, Requires<[In32BitMode]>;
def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
- "pop{l}\t%ds", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%ds|DS}", []> , Requires<[In32BitMode]>;
def POPES16 : I<0x07, RawFrm, (outs), (ins),
- "pop{w}\t%es", []>, OpSize, Requires<[In32BitMode]>;
+ "pop{w}\t{%es|ES}", []>, OpSize, Requires<[In32BitMode]>;
def POPES32 : I<0x07, RawFrm, (outs), (ins),
- "pop{l}\t%es", []> , Requires<[In32BitMode]>;
+ "pop{l}\t{%es|ES}", []> , Requires<[In32BitMode]>;
def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
- "pop{w}\t%fs", []>, OpSize, TB;
+ "pop{w}\t{%fs|FS}", []>, OpSize, TB;
def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
- "pop{l}\t%fs", []>, TB , Requires<[In32BitMode]>;
+ "pop{l}\t{%fs|FS}", []>, TB , Requires<[In32BitMode]>;
def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
- "pop{q}\t%fs", []>, TB;
+ "pop{q}\t{%fs|FS}", []>, TB;
def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
- "pop{w}\t%gs", []>, OpSize, TB;
+ "pop{w}\t{%gs|GS}", []>, OpSize, TB;
def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
- "pop{l}\t%gs", []>, TB , Requires<[In32BitMode]>;
+ "pop{l}\t{%gs|GS}", []>, TB , Requires<[In32BitMode]>;
def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
- "pop{q}\t%gs", []>, TB;
+ "pop{q}\t{%gs|GS}", []>, TB;
def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
@@ -400,12 +400,29 @@ def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB;
+//===----------------------------------------------------------------------===//
+// XSAVE instructions
let Defs = [RDX, RAX], Uses = [RCX] in
def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
let Uses = [RDX, RAX, RCX] in
def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB;
+let Uses = [RDX, RAX] in {
+ def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+ "xsave\t$dst", []>, TB;
+ def XSAVE64 : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins),
+ "xsaveq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor\t$dst", []>, TB;
+ def XRSTOR64 : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstorq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+ def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+ "xsaveopt\t$dst", []>, TB;
+ def XSAVEOPT64 : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins),
+ "xsaveoptq\t$dst", []>, TB, REX_W, Requires<[In64BitMode]>;
+}
+
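Not part of the patch: the classic use of XGETBV is the runtime guard below. A minimal sketch, assuming a compiler that exposes _xgetbv (recent GCC/Clang with -mxsave, or MSVC):

    #include <immintrin.h>

    // Read XCR0 via xgetbv and check that the OS saves both XMM and YMM
    // state (bits 1 and 2) -- the usual guard before running AVX code.
    bool os_saves_avx_state() {
        unsigned long long xcr0 = _xgetbv(0);
        return (xcr0 & 0x6) == 0x6;
    }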
//===----------------------------------------------------------------------===//
// VIA PadLock crypto instructions
let Defs = [RAX, RDI], Uses = [RDX, RDI] in
@@ -427,3 +444,24 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
}
let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
def MONTMUL : I<0xc0, RawFrm, (outs), (ins), "montmul", []>, A6;
+
+//===----------------------------------------------------------------------===//
+// FS/GS Base Instructions
+let Predicates = [In64BitMode] in {
+ def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
+ "rdfsbase{l}\t$dst", []>, TB, XS;
+ def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
+ "rdfsbase{q}\t$dst", []>, TB, XS;
+ def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
+ "rdgsbase{l}\t$dst", []>, TB, XS;
+ def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
+ "rdgsbase{q}\t$dst", []>, TB, XS;
+ def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$dst),
+ "wrfsbase{l}\t$dst", []>, TB, XS;
+ def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$dst),
+ "wrfsbase{q}\t$dst", []>, TB, XS;
+ def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$dst),
+ "wrgsbase{l}\t$dst", []>, TB, XS;
+ def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$dst),
+ "wrgsbase{q}\t$dst", []>, TB, XS;
+}
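For illustration only: these instructions surface as the fsgsbase intrinsics. A minimal sketch, assuming -mfsgsbase, a CPU with FSGSBASE, and an OS that has set CR4.FSGSBASE; the name is illustrative:

    #include <immintrin.h>

    // Read the FS segment base directly in user mode (rdfsbase).
    unsigned long long fs_base() {
        return _readfsbase_u64();
    }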
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index daf61e4..09a7a7d0c 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -16,9 +16,15 @@
// VMX instructions
// 66 0F 38 80
-def INVEPT : I<0x80, RawFrm, (outs), (ins), "invept", []>, OpSize, T8;
+def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
+def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invept {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
// 66 0F 38 81
-def INVVPID : I<0x81, RawFrm, (outs), (ins), "invvpid", []>, OpSize, T8;
+def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
+ "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
+def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
+ "invvpid {$src2, $src1|$src1, $src2}", []>, OpSize, T8;
// 0F 01 C1
def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index e385335..50bc14d 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -372,15 +372,10 @@ ReSimplify:
case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
case X86::VFsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
case X86::VFsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
- case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
- case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break;
- case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
- case X86::AVX_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::VXORPSrr); break;
case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
- case X86::AVX_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::VXORPDrr); break;
case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
- case X86::AVX_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::VPXORrr); break;
+ case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
case X86::MOV16r0:
LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
@@ -468,6 +463,18 @@ ReSimplify:
case X86::JLE_4: OutMI.setOpcode(X86::JLE_1); break;
case X86::JG_4: OutMI.setOpcode(X86::JG_1); break;
+ // Atomic loads and stores require separate pseudo-instructions because
+ // Acquire implies mayStore and Release implies mayLoad; rewrite them into
+ // regular MOV instructions here. See the sketch after this list.
+ case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
+ case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
+ case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
+ case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
+ case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
+ case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
+
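Not part of the patch, but the reasoning behind it: under x86's TSO memory model an acquire load and a release store need no fences, so both compile to the plain movs these pseudo-instructions are rewritten into. A minimal sketch in standard C++11 atomics:

    #include <atomic>

    // On x86-64 this is a single mov load.
    int load_acquire(const std::atomic<int> &x) {
        return x.load(std::memory_order_acquire);
    }

    // On x86-64 this is a single mov store.
    void store_release(std::atomic<int> &x, int v) {
        x.store(v, std::memory_order_release);
    }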
// We don't currently select the correct instruction form for instructions
// which have a short %eax, etc. form. Handle this by custom lowering, for
// now.
@@ -585,6 +592,8 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
}
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+ OutStreamer.EmitCodeRegion();
+
X86MCInstLower MCInstLowering(Mang, *MF, *this);
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
@@ -601,7 +610,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (OutStreamer.hasRawTextSupport())
OutStreamer.EmitRawText(StringRef("\t#MEMBARRIER"));
return;
-
+
case X86::EH_RETURN:
case X86::EH_RETURN64: {
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
index 06043ec..b0bb313 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -53,10 +53,6 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// relocation models.
unsigned GlobalBaseReg;
- /// ReserveFP - whether the function should reserve the frame pointer
- /// when allocating, even if there may not actually be a frame pointer used.
- bool ReserveFP;
-
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex;
/// RegSaveFrameIndex - X86-64 vararg func register save area.
@@ -65,6 +61,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
unsigned VarArgsGPOffset;
/// VarArgsFPOffset - X86-64 vararg func fp reg offset.
unsigned VarArgsFPOffset;
+ /// ArgumentStackSize - The number of bytes of stack space consumed by the
+ /// arguments being passed.
+ unsigned ArgumentStackSize;
public:
X86MachineFunctionInfo() : ForceFramePointer(false),
@@ -77,7 +76,8 @@ public:
VarArgsFrameIndex(0),
RegSaveFrameIndex(0),
VarArgsGPOffset(0),
- VarArgsFPOffset(0) {}
+ VarArgsFPOffset(0),
+ ArgumentStackSize(0) {}
explicit X86MachineFunctionInfo(MachineFunction &MF)
: ForceFramePointer(false),
@@ -87,11 +87,11 @@ public:
TailCallReturnAddrDelta(0),
SRetReturnReg(0),
GlobalBaseReg(0),
- ReserveFP(false),
VarArgsFrameIndex(0),
RegSaveFrameIndex(0),
VarArgsGPOffset(0),
- VarArgsFPOffset(0) {}
+ VarArgsFPOffset(0),
+ ArgumentStackSize(0) {}
bool getForceFramePointer() const { return ForceFramePointer;}
void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
@@ -114,9 +114,6 @@ public:
unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
- bool getReserveFP() const { return ReserveFP; }
- void setReserveFP(bool reserveFP) { ReserveFP = reserveFP; }
-
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
@@ -128,6 +125,9 @@ public:
unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
+
+ unsigned getArgumentStackSize() const { return ArgumentStackSize; }
+ void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index f2faf59..c1ac9f3 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -27,7 +27,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -53,7 +52,13 @@ ForceStackAlign("force-align-stack",
X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
const TargetInstrInfo &tii)
- : X86GenRegisterInfo(), TM(tm), TII(tii) {
+ : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::RIP : X86::EIP,
+ X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
+ X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true)),
+ TM(tm), TII(tii) {
+ X86_MC::InitLLVM2SEHRegisterMapping(this);
+
// Cache some information.
const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
Is64Bit = Subtarget->is64Bit();
@@ -70,40 +75,6 @@ X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
}
}
-static unsigned getFlavour(const X86Subtarget *Subtarget, bool isEH) {
- if (!Subtarget->is64Bit()) {
- if (Subtarget->isTargetDarwin()) {
- if (isEH)
- return DWARFFlavour::X86_32_DarwinEH;
- else
- return DWARFFlavour::X86_32_Generic;
- } else if (Subtarget->isTargetCygMing()) {
- // Unsupported by now, just quick fallback
- return DWARFFlavour::X86_32_Generic;
- } else {
- return DWARFFlavour::X86_32_Generic;
- }
- }
- return DWARFFlavour::X86_64;
-}
-
-/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
-/// specific numbering, used in debug info and exception tables.
-int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
- const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
- unsigned Flavour = getFlavour(Subtarget, isEH);
-
- return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
-}
-
-/// getLLVMRegNum - This function maps DWARF register numbers to LLVM register.
-int X86RegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
- const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
- unsigned Flavour = getFlavour(Subtarget, isEH);
-
- return X86GenRegisterInfo::getLLVMRegNumFull(DwarfRegNo, Flavour);
-}
-
/// getCompactUnwindRegNum - This function maps the register to the number for
/// compact unwind encoding. Return -1 if the register isn't valid.
int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
@@ -121,7 +92,7 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
int
X86RegisterInfo::getSEHRegNum(unsigned i) const {
- int reg = getX86RegNum(i);
+ int reg = X86_MC::getX86RegNum(i);
switch (i) {
case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
@@ -140,96 +111,16 @@ X86RegisterInfo::getSEHRegNum(unsigned i) const {
return reg;
}
-/// getX86RegNum - This function maps LLVM register identifiers to their X86
-/// specific numbering, which is used in various places encoding instructions.
-unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
- switch(RegNo) {
- case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
- case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
- case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
- case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
- case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
- return N86::ESP;
- case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
- return N86::EBP;
- case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
- return N86::ESI;
- case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
- return N86::EDI;
-
- case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
- return N86::EAX;
- case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
- return N86::ECX;
- case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
- return N86::EDX;
- case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
- return N86::EBX;
- case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
- return N86::ESP;
- case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
- return N86::EBP;
- case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
- return N86::ESI;
- case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
- return N86::EDI;
-
- case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
- case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
- return RegNo-X86::ST0;
-
- case X86::XMM0: case X86::XMM8:
- case X86::YMM0: case X86::YMM8: case X86::MM0:
- return 0;
- case X86::XMM1: case X86::XMM9:
- case X86::YMM1: case X86::YMM9: case X86::MM1:
- return 1;
- case X86::XMM2: case X86::XMM10:
- case X86::YMM2: case X86::YMM10: case X86::MM2:
- return 2;
- case X86::XMM3: case X86::XMM11:
- case X86::YMM3: case X86::YMM11: case X86::MM3:
- return 3;
- case X86::XMM4: case X86::XMM12:
- case X86::YMM4: case X86::YMM12: case X86::MM4:
- return 4;
- case X86::XMM5: case X86::XMM13:
- case X86::YMM5: case X86::YMM13: case X86::MM5:
- return 5;
- case X86::XMM6: case X86::XMM14:
- case X86::YMM6: case X86::YMM14: case X86::MM6:
- return 6;
- case X86::XMM7: case X86::XMM15:
- case X86::YMM7: case X86::YMM15: case X86::MM7:
- return 7;
-
- case X86::ES: return 0;
- case X86::CS: return 1;
- case X86::SS: return 2;
- case X86::DS: return 3;
- case X86::FS: return 4;
- case X86::GS: return 5;
-
- case X86::CR0: case X86::CR8 : case X86::DR0: return 0;
- case X86::CR1: case X86::CR9 : case X86::DR1: return 1;
- case X86::CR2: case X86::CR10: case X86::DR2: return 2;
- case X86::CR3: case X86::CR11: case X86::DR3: return 3;
- case X86::CR4: case X86::CR12: case X86::DR4: return 4;
- case X86::CR5: case X86::CR13: case X86::DR5: return 5;
- case X86::CR6: case X86::CR14: case X86::DR6: return 6;
- case X86::CR7: case X86::CR15: case X86::DR7: return 7;
-
- // Pseudo index registers are equivalent to a "none"
- // scaled index (See Intel Manual 2A, table 2-3)
- case X86::EIZ:
- case X86::RIZ:
- return 4;
-
- default:
- assert(isVirtualRegister(RegNo) && "Unknown physical register!");
- llvm_unreachable("Register allocator hasn't allocated reg correctly yet!");
- return 0;
- }
+const TargetRegisterClass *
+X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
+ unsigned Idx) const {
+ // The sub_8bit sub-register index is more constrained in 32-bit mode.
+ // It behaves just like the sub_8bit_hi index.
+ if (!Is64Bit && Idx == X86::sub_8bit)
+ Idx = X86::sub_8bit_hi;
+
+ // Forward to TableGen's default version.
+ return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx);
}
const TargetRegisterClass *
@@ -355,8 +246,19 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
const TargetRegisterClass*
X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
+ // Don't allow super-classes of GR8_NOREX. This class is only used after
+ // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied
+ // to the full GR8 register class in 64-bit mode, so we cannot allow
+ // register class inflation.
+ //
+ // The GR8_NOREX class is always used in a way that won't be constrained to a
+ // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the
+ // full GR8 class.
+ if (RC == X86::GR8_NOREXRegisterClass)
+ return RC;
+
const TargetRegisterClass *Super = RC;
- TargetRegisterClass::sc_iterator I = RC->superclasses_begin();
+ TargetRegisterClass::sc_iterator I = RC->getSuperClasses();
do {
switch (Super->getID()) {
case X86::GR8RegClassID:
@@ -741,11 +643,6 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
}
}
-unsigned X86RegisterInfo::getRARegister() const {
- return Is64Bit ? X86::RIP // Should have dwarf #16.
- : X86::EIP; // Should have dwarf #8.
-}
-
unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
return TFI->hasFP(MF) ? FramePtr : StackPtr;
@@ -948,7 +845,7 @@ namespace {
for (unsigned i = 0, e = RI.getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (RI.getRegClass(Reg)->getAlignment() > StackAlignment) {
- FuncInfo->setReserveFP(true);
+ FuncInfo->setForceFramePointer(true);
return true;
}
}
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index a12eb12..7d39c68 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -24,22 +24,6 @@ namespace llvm {
class TargetInstrInfo;
class X86TargetMachine;
-/// N86 namespace - Native X86 register numbers
-///
-namespace N86 {
- enum {
- EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
- };
-}
-
-/// DWARFFlavour - Flavour of dwarf regnumbers
-///
-namespace DWARFFlavour {
- enum {
- X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
- };
-}
-
class X86RegisterInfo : public X86GenRegisterInfo {
public:
X86TargetMachine &TM;
@@ -73,11 +57,6 @@ public:
/// register identifier.
static unsigned getX86RegNum(unsigned RegNo);
- /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
- /// (created by TableGen) for target dependencies.
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
-
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
@@ -95,6 +74,9 @@ public:
getMatchingSuperRegClass(const TargetRegisterClass *A,
const TargetRegisterClass *B, unsigned Idx) const;
+ virtual const TargetRegisterClass *
+ getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const;
+
const TargetRegisterClass*
getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
@@ -136,7 +118,6 @@ public:
int SPAdj, RegScavenger *RS = NULL) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
unsigned getStackRegister() const { return StackPtr; }
// FIXME: Move to FrameInfo
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 203722a..9a7db36 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -390,6 +390,13 @@ def GR64_NOREX : RegisterClass<"X86", [i64], 64,
(GR32_NOREX sub_32bit)];
}
+// GR32_NOAX - GR32 registers except EAX. Used by AddRegFrm of XCHG32 in 64-bit
+// mode to keep the encoder from picking the one-byte 0x90 NOP encoding:
+// xchg %eax, %eax must clear the upper 32 bits of RAX, so it is not a NOP.
+def GR32_NOAX : RegisterClass<"X86", [i32], 32, (sub GR32, EAX)> {
+ let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
+}
+
// GR32_NOSP - GR32 registers except ESP.
def GR32_NOSP : RegisterClass<"X86", [i32], 32, (sub GR32, ESP)> {
let SubRegClasses = [(GR8 sub_8bit, sub_8bit_hi), (GR16 sub_16bit)];
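For context on the 0x90 hazard behind GR32_NOAX, the two encodings involved (byte values per the Intel SDM; the array names are hypothetical):

    // In 64-bit mode the one-byte form is architecturally a NOP and does
    // NOT zero RAX[63:32]; any other 32-bit self-exchange writes its
    // destination register and therefore does zero the upper half.
    static const unsigned char XchgEaxEax[] = { 0x90 };       // NOP alias
    static const unsigned char XchgEcxEcx[] = { 0x87, 0xC9 }; // real xchg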
@@ -455,8 +462,8 @@ def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd)];
}
-def VR256 : RegisterClass<"X86", [v32i8, v8i32, v4i64, v8f32, v4f64], 256,
- (sequence "YMM%u", 0, 15)> {
+def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+ 256, (sequence "YMM%u", 0, 15)> {
let SubRegClasses = [(FR32 sub_ss), (FR64 sub_sd), (VR128 sub_xmm)];
}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 02754f9..6406bce 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -54,7 +54,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
if (const char *bzeroEntry = V &&
V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
EVT IntPtr = TLI.getPointerTy();
- const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 5e6c659..7064dd0 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -16,9 +16,11 @@
#include "X86InstrInfo.h"
#include "llvm/GlobalValue.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Host.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallVector.h"
#define GET_SUBTARGETINFO_TARGET_DESC
@@ -185,24 +187,53 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
- if ((EDX >> 15) & 1) HasCMov = true; ToggleFeature(X86::FeatureCMOV);
- if ((EDX >> 23) & 1) X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX);
- if ((EDX >> 25) & 1) X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1);
- if ((EDX >> 26) & 1) X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2);
- if (ECX & 0x1) X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3);
- if ((ECX >> 9) & 1) X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);
- if ((ECX >> 19) & 1) X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);
- if ((ECX >> 20) & 1) X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);
+ if ((EDX >> 15) & 1) { HasCMov = true; ToggleFeature(X86::FeatureCMOV); }
+ if ((EDX >> 23) & 1) { X86SSELevel = MMX; ToggleFeature(X86::FeatureMMX); }
+ if ((EDX >> 25) & 1) { X86SSELevel = SSE1; ToggleFeature(X86::FeatureSSE1); }
+ if ((EDX >> 26) & 1) { X86SSELevel = SSE2; ToggleFeature(X86::FeatureSSE2); }
+ if (ECX & 0x1) { X86SSELevel = SSE3; ToggleFeature(X86::FeatureSSE3); }
+ if ((ECX >> 9) & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
+ if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
+ if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
// FIXME: AVX codegen support is not ready.
- //if ((ECX >> 28) & 1) { HasAVX = true; } ToggleFeature(X86::FeatureAVX);
+ //if ((ECX >> 28) & 1) { HasAVX = true; ToggleFeature(X86::FeatureAVX); }
bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
bool IsAMD = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
- HasCLMUL = IsIntel && ((ECX >> 1) & 0x1); ToggleFeature(X86::FeatureCLMUL);
- HasFMA3 = IsIntel && ((ECX >> 12) & 0x1); ToggleFeature(X86::FeatureFMA3);
- HasPOPCNT = IsIntel && ((ECX >> 23) & 0x1); ToggleFeature(X86::FeaturePOPCNT);
- HasAES = IsIntel && ((ECX >> 25) & 0x1); ToggleFeature(X86::FeatureAES);
+ if (IsIntel && ((ECX >> 1) & 0x1)) {
+ HasCLMUL = true;
+ ToggleFeature(X86::FeatureCLMUL);
+ }
+ if (IsIntel && ((ECX >> 12) & 0x1)) {
+ HasFMA3 = true;
+ ToggleFeature(X86::FeatureFMA3);
+ }
+ if (IsIntel && ((ECX >> 22) & 0x1)) {
+ HasMOVBE = true;
+ ToggleFeature(X86::FeatureMOVBE);
+ }
+ if (IsIntel && ((ECX >> 23) & 0x1)) {
+ HasPOPCNT = true;
+ ToggleFeature(X86::FeaturePOPCNT);
+ }
+ if (IsIntel && ((ECX >> 25) & 0x1)) {
+ HasAES = true;
+ ToggleFeature(X86::FeatureAES);
+ }
+ if (IsIntel && ((ECX >> 29) & 0x1)) {
+ HasF16C = true;
+ ToggleFeature(X86::FeatureF16C);
+ }
+ if (IsIntel && ((ECX >> 30) & 0x1)) {
+ HasRDRAND = true;
+ ToggleFeature(X86::FeatureRDRAND);
+ }
+
+ if ((ECX >> 13) & 0x1) {
+ HasCmpxchg16b = true;
+ ToggleFeature(X86::FeatureCMPXCHG16B);
+ }
if (IsIntel || IsAMD) {
// Determine if bit test memory instructions are slow.
@@ -224,6 +255,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasX86_64 = true;
ToggleFeature(X86::Feature64Bit);
}
+ if ((ECX >> 5) & 0x1) {
+ HasLZCNT = true;
+ ToggleFeature(X86::FeatureLZCNT);
+ }
if (IsAMD && ((ECX >> 6) & 0x1)) {
HasSSE4A = true;
ToggleFeature(X86::FeatureSSE4A);
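The bit tests above follow the CPUID leaf-1 layout. As a self-contained illustration, one feature bit can be probed the same way (hypothetical helper; GCC/Clang inline asm and an x86 host assumed):

    #include <cstdint>

    // Probe CPUID leaf 1 and test ECX bit 23 (POPCNT), mirroring the
    // AutoDetectSubtargetFeatures logic above.
    static bool HostHasPOPCNT() {
      uint32_t EAX, EBX, ECX, EDX;
      __asm__ volatile("cpuid"
                       : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
                       : "a"(1), "c"(0));
      return (ECX >> 23) & 1;
    }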
@@ -251,14 +286,21 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, HasCLMUL(false)
, HasFMA3(false)
, HasFMA4(false)
+ , HasMOVBE(false)
+ , HasRDRAND(false)
+ , HasF16C(false)
+ , HasLZCNT(false)
+ , HasBMI(false)
, IsBTMemSlow(false)
, IsUAMemFast(false)
, HasVectorUAMem(false)
+ , HasCmpxchg16b(false)
, stackAlignment(8)
// FIXME: this is a known good value for Yonah. How about others?
, MaxInlineSizeThreshold(128)
, TargetTriple(TT)
- , In64BitMode(is64Bit) {
+ , In64BitMode(is64Bit)
+ , InNaClMode(false) {
// Determine default and user specified characteristics
if (!FS.empty() || !CPU.empty()) {
std::string CPUName = CPU;
@@ -304,6 +346,11 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
if (In64BitMode)
ToggleFeature(X86::Mode64Bit);
+ if (isTargetNaCl()) {
+ InNaClMode = true;
+ ToggleFeature(X86::ModeNaCl);
+ }
+
if (HasAVX)
X86SSELevel = NoMMXSSE;
@@ -313,6 +360,9 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
assert((!In64BitMode || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
+ if(EnableSegmentedStacks && !isTargetELF())
+ report_fatal_error("Segmented stacks are only implemented on ELF.");
+
// Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both
// 32 and 64 bit) and for all 64-bit targets.
if (StackAlignOverride)
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6d22027..3258d3d 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -90,6 +90,21 @@ protected:
/// HasFMA4 - Target has 4-operand fused multiply-add
bool HasFMA4;
+ /// HasMOVBE - True if the processor has the MOVBE instruction.
+ bool HasMOVBE;
+
+ /// HasRDRAND - True if the processor has the RDRAND instruction.
+ bool HasRDRAND;
+
+ /// HasF16C - Processor has F16C half-precision float conversion instructions.
+ bool HasF16C;
+
+ /// HasLZCNT - Processor has LZCNT instruction.
+ bool HasLZCNT;
+
+ /// HasBMI - Processor has BMI1 instructions.
+ bool HasBMI;
+
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
@@ -100,6 +115,10 @@ protected:
/// operands. This may require setting a feature bit in the processor.
bool HasVectorUAMem;
+ /// HasCmpxchg16b - True if this processor has the CMPXCHG16B instruction;
+ /// this is true for most x86-64 chips, but not the first AMD chips.
+ bool HasCmpxchg16b;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -115,6 +134,9 @@ private:
/// In64BitMode - True if compiling for 64-bit, false for 32-bit.
bool In64BitMode;
+ /// InNaClMode - True if compiling for Native Client target.
+ bool InNaClMode;
+
public:
/// This constructor initializes the data members to match that
@@ -165,9 +187,15 @@ public:
bool hasCLMUL() const { return HasCLMUL; }
bool hasFMA3() const { return HasFMA3; }
bool hasFMA4() const { return HasFMA4; }
+ bool hasMOVBE() const { return HasMOVBE; }
+ bool hasRDRAND() const { return HasRDRAND; }
+ bool hasF16C() const { return HasF16C; }
+ bool hasLZCNT() const { return HasLZCNT; }
+ bool hasBMI() const { return HasBMI; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
+ bool hasCmpxchg16b() const { return HasCmpxchg16b; }
const Triple &getTargetTriple() const { return TargetTriple; }
@@ -185,6 +213,11 @@ public:
return !isTargetDarwin() && !isTargetWindows() && !isTargetCygMing();
}
bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+ bool isTargetNaCl() const {
+ return TargetTriple.getOS() == Triple::NativeClient;
+ }
+ bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
+ bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
@@ -199,7 +232,8 @@ public:
}
bool isTargetWin64() const {
- return In64BitMode && (isTargetMingw() || isTargetWindows());
+ // FIXME: x86_64-cygwin has not been released yet.
+ return In64BitMode && (isTargetCygMing() || isTargetWindows());
}
bool isTargetEnvMacho() const {
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 9cab0e0..15c6c4e 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -16,65 +16,32 @@
#include "llvm/PassManager.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
-static MCStreamer *createMCStreamer(const Target &T, const std::string &TT,
- MCContext &Ctx, TargetAsmBackend &TAB,
- raw_ostream &_OS,
- MCCodeEmitter *_Emitter,
- bool RelaxAll,
- bool NoExecStack) {
- Triple TheTriple(TT);
-
- if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
- return createMachOStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll);
-
- if (TheTriple.isOSWindows())
- return createWinCOFFStreamer(Ctx, TAB, *_Emitter, _OS, RelaxAll);
-
- return createELFStreamer(Ctx, TAB, _OS, _Emitter, RelaxAll, NoExecStack);
-}
-
extern "C" void LLVMInitializeX86Target() {
// Register the target.
RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
-
- // Register the code emitter.
- TargetRegistry::RegisterCodeEmitter(TheX86_32Target,
- createX86MCCodeEmitter);
- TargetRegistry::RegisterCodeEmitter(TheX86_64Target,
- createX86MCCodeEmitter);
-
- // Register the asm backend.
- TargetRegistry::RegisterAsmBackend(TheX86_32Target,
- createX86_32AsmBackend);
- TargetRegistry::RegisterAsmBackend(TheX86_64Target,
- createX86_64AsmBackend);
-
- // Register the object streamer.
- TargetRegistry::RegisterObjectStreamer(TheX86_32Target,
- createMCStreamer);
- TargetRegistry::RegisterObjectStreamer(TheX86_64Target,
- createMCStreamer);
}
-X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : X86TargetMachine(T, TT, CPU, FS, false),
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : X86TargetMachine(T, TT, CPU, FS, RM, CM, false),
DataLayout(getSubtargetImpl()->isTargetDarwin() ?
- "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-n8:16:32" :
+ "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
+ "n8:16:32-S128" :
(getSubtargetImpl()->isTargetCygMing() ||
getSubtargetImpl()->isTargetWindows()) ?
- "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-n8:16:32" :
- "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-n8:16:32"),
+ "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-"
+ "n8:16:32-S32" :
+ "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-"
+ "n8:16:32-S128"),
InstrInfo(*this),
TSInfo(*this),
TLInfo(*this),
@@ -82,11 +49,12 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
}
-X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : X86TargetMachine(T, TT, CPU, FS, true),
- DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-n8:16:32:64"),
+X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : X86TargetMachine(T, TT, CPU, FS, RM, CM, true),
+ DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+ "n8:16:32:64-S128"),
InstrInfo(*this),
TSInfo(*this),
TLInfo(*this),
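The substantive change to the data layout strings above is the new trailing S field, which records the natural stack alignment in bits. Reading the 64-bit string field by field (per the LLVM data layout grammar):

    // e            little-endian
    // p:64:64      64-bit pointers, 64-bit ABI alignment
    // s:64         stack objects aligned to 64 bits
    // f80:128:128  x86_fp80: 128-bit ABI and preferred alignment
    // n8:16:32:64  native integer widths
    // S128         natural stack alignment of 128 bits (16 bytes);
    //              the 32-bit Windows/CygMing string uses S32 instead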
@@ -95,52 +63,14 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
/// X86TargetMachine ctor - Create an X86 target.
///
-X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS, bool is64Bit)
- : LLVMTargetMachine(T, TT, CPU, FS),
+X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
+ bool is64Bit)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS, StackAlignmentOverride, is64Bit),
FrameLowering(*this, Subtarget),
ELFWriterInfo(is64Bit, true) {
- DefRelocModel = getRelocationModel();
-
- // If no relocation model was picked, default as appropriate for the target.
- if (getRelocationModel() == Reloc::Default) {
- // Darwin defaults to PIC in 64 bit mode and dynamic-no-pic in 32 bit mode.
- // Win64 requires rip-rel addressing, thus we force it to PIC. Otherwise we
- // use static relocation model by default.
- if (Subtarget.isTargetDarwin()) {
- if (Subtarget.is64Bit())
- setRelocationModel(Reloc::PIC_);
- else
- setRelocationModel(Reloc::DynamicNoPIC);
- } else if (Subtarget.isTargetWin64())
- setRelocationModel(Reloc::PIC_);
- else
- setRelocationModel(Reloc::Static);
- }
-
- assert(getRelocationModel() != Reloc::Default &&
- "Relocation mode not picked");
-
- // ELF and X86-64 don't have a distinct DynamicNoPIC model. DynamicNoPIC
- // is defined as a model for code which may be used in static or dynamic
- // executables but not necessarily a shared library. On X86-32 we just
- // compile in -static mode, in x86-64 we use PIC.
- if (getRelocationModel() == Reloc::DynamicNoPIC) {
- if (is64Bit)
- setRelocationModel(Reloc::PIC_);
- else if (!Subtarget.isTargetDarwin())
- setRelocationModel(Reloc::Static);
- }
-
- // If we are on Darwin, disallow static relocation model in X86-64 mode, since
- // the Mach-O file format doesn't support it.
- if (getRelocationModel() == Reloc::Static &&
- Subtarget.isTargetDarwin() &&
- is64Bit)
- setRelocationModel(Reloc::PIC_);
-
// Determine the PICStyle based on the target selected.
if (getRelocationModel() == Reloc::Static) {
// Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
@@ -161,16 +91,20 @@ X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT,
Subtarget.setPICStyle(PICStyles::GOT);
}
- // Finally, if we have "none" as our PIC style, force to static mode.
- if (Subtarget.getPICStyle() == PICStyles::None)
- setRelocationModel(Reloc::Static);
-
// default to hard float ABI
if (FloatABIType == FloatABI::Default)
FloatABIType = FloatABI::Hard;
}
//===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper",
+ cl::desc("Minimize AVX to SSE transition penalty"),
+ cl::init(false));
+
+//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
@@ -200,46 +134,25 @@ bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM,
bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
CodeGenOpt::Level OptLevel) {
- if (OptLevel != CodeGenOpt::None && Subtarget.hasSSE2()) {
- PM.add(createSSEDomainFixPass());
- return true;
+ bool ShouldPrint = false;
+ if (OptLevel != CodeGenOpt::None &&
+ (Subtarget.hasSSE2() || Subtarget.hasAVX())) {
+ PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass));
+ ShouldPrint = true;
}
- return false;
+
+ if (Subtarget.hasAVX() && UseVZeroUpper) {
+ PM.add(createX86IssueVZeroUpperPass());
+ ShouldPrint = true;
+ }
+
+ return ShouldPrint;
}
bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
CodeGenOpt::Level OptLevel,
JITCodeEmitter &JCE) {
- // FIXME: Move this to TargetJITInfo!
- // On Darwin, do not override 64-bit setting made in X86TargetMachine().
- if (DefRelocModel == Reloc::Default &&
- (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit())) {
- setRelocationModel(Reloc::Static);
- Subtarget.setPICStyle(PICStyles::None);
- }
-
-
PM.add(createX86JITCodeEmitterPass(*this, JCE));
return false;
}
-
-void X86TargetMachine::setCodeModelForStatic() {
-
- if (getCodeModel() != CodeModel::Default) return;
-
- // For static codegen, if we're not already set, use Small codegen.
- setCodeModel(CodeModel::Small);
-}
-
-
-void X86TargetMachine::setCodeModelForJIT() {
-
- if (getCodeModel() != CodeModel::Default) return;
-
- // 64-bit JIT places everything in the same buffer except external functions.
- if (Subtarget.is64Bit())
- setCodeModel(CodeModel::Large);
- else
- setCodeModel(CodeModel::Small);
-}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 885334a..d1569aa 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -29,21 +29,17 @@
namespace llvm {
class formatted_raw_ostream;
+class StringRef;
class X86TargetMachine : public LLVMTargetMachine {
X86Subtarget Subtarget;
X86FrameLowering FrameLowering;
X86ELFWriterInfo ELFWriterInfo;
- Reloc::Model DefRelocModel; // Reloc model before it's overridden.
-private:
- // We have specific defaults for X86.
- virtual void setCodeModelForJIT();
- virtual void setCodeModelForStatic();
-
public:
- X86TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS,
+ X86TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM,
bool is64Bit);
virtual const X86InstrInfo *getInstrInfo() const {
@@ -87,8 +83,9 @@ class X86_32TargetMachine : public X86TargetMachine {
X86TargetLowering TLInfo;
X86JITInfo JITInfo;
public:
- X86_32TargetMachine(const Target &T, const std::string &M,
- const std::string &CPU, const std::string &FS);
+ X86_32TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const TargetData *getTargetData() const { return &DataLayout; }
virtual const X86TargetLowering *getTargetLowering() const {
return &TLInfo;
@@ -113,8 +110,9 @@ class X86_64TargetMachine : public X86TargetMachine {
X86TargetLowering TLInfo;
X86JITInfo JITInfo;
public:
- X86_64TargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ X86_64TargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const TargetData *getTargetData() const { return &DataLayout; }
virtual const X86TargetLowering *getTargetLowering() const {
return &TLInfo;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 1231798..991f322 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -43,79 +43,3 @@ getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
MachineModuleInfo *MMI) const {
return Mang->getSymbol(GV);
}
-
-unsigned X8632_ELFTargetObjectFile::getPersonalityEncoding() const {
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
- else
- return DW_EH_PE_absptr;
-}
-
-unsigned X8632_ELFTargetObjectFile::getLSDAEncoding() const {
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
- else
- return DW_EH_PE_absptr;
-}
-
-unsigned X8632_ELFTargetObjectFile::getFDEEncoding(bool FDE) const {
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
- else
- return DW_EH_PE_absptr;
-}
-
-unsigned X8632_ELFTargetObjectFile::getTTypeEncoding() const {
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4;
- else
- return DW_EH_PE_absptr;
-}
-
-unsigned X8664_ELFTargetObjectFile::getPersonalityEncoding() const {
- CodeModel::Model Model = TM.getCodeModel();
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | (Model == CodeModel::Small ||
- Model == CodeModel::Medium ?
- DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
-
- if (Model == CodeModel::Small || Model == CodeModel::Medium)
- return DW_EH_PE_udata4;
-
- return DW_EH_PE_absptr;
-}
-
-unsigned X8664_ELFTargetObjectFile::getLSDAEncoding() const {
- CodeModel::Model Model = TM.getCodeModel();
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_pcrel | (Model == CodeModel::Small ?
- DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
-
- if (Model == CodeModel::Small)
- return DW_EH_PE_udata4;
-
- return DW_EH_PE_absptr;
-}
-
-unsigned X8664_ELFTargetObjectFile::getFDEEncoding(bool CFI) const {
- if (CFI)
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_pcrel | DW_EH_PE_sdata4;
-
- return DW_EH_PE_udata4;
-}
-
-unsigned X8664_ELFTargetObjectFile::getTTypeEncoding() const {
- CodeModel::Model Model = TM.getCodeModel();
- if (TM.getRelocationModel() == Reloc::PIC_)
- return DW_EH_PE_indirect | DW_EH_PE_pcrel | (Model == CodeModel::Small ||
- Model == CodeModel::Medium ?
- DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
-
- if (Model == CodeModel::Small)
- return DW_EH_PE_udata4;
-
- return DW_EH_PE_absptr;
-}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index e21b5bf..d7adf27 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -33,28 +33,6 @@ namespace llvm {
MachineModuleInfo *MMI) const;
};
- class X8632_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
- const X86TargetMachine &TM;
- public:
- X8632_ELFTargetObjectFile(const X86TargetMachine &tm)
- :TM(tm) { }
- virtual unsigned getPersonalityEncoding() const;
- virtual unsigned getLSDAEncoding() const;
- virtual unsigned getFDEEncoding(bool CFI) const;
- virtual unsigned getTTypeEncoding() const;
- };
-
- class X8664_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
- const X86TargetMachine &TM;
- public:
- X8664_ELFTargetObjectFile(const X86TargetMachine &tm)
- :TM(tm) { }
- virtual unsigned getPersonalityEncoding() const;
- virtual unsigned getLSDAEncoding() const;
- virtual unsigned getFDEEncoding(bool CFI) const;
- virtual unsigned getTTypeEncoding() const;
- };
-
} // end namespace llvm
#endif
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 0000000..3958494
--- /dev/null
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,105 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE-encoded functions. This avoids the transition latency
+// penalty when transferring control between AVX-encoded instructions and
+// legacy SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+ struct VZeroUpperInserter : public MachineFunctionPass {
+ static char ID;
+ VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
+
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+ MachineBasicBlock *MBB; // Current basic block
+ };
+ char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+ return new VZeroUpperInserter();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzeroupper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getTarget().getInstrInfo();
+ bool Changed = false;
+
+ // Process every basic block in the function.
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+ Changed |= processBasicBlock(MF, *BB);
+
+ return Changed;
+}
+
+static bool isCallToModuleFn(const MachineInstr *MI) {
+ assert(MI->getDesc().isCall() && "Isn't a call instruction");
+
+ for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ if (!MO.isGlobal())
+ continue;
+
+ const GlobalValue *GV = MO.getGlobal();
+ GlobalValue::LinkageTypes LT = GV->getLinkage();
+ if (GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) ||
+ (GV->isExternalLinkage(LT) && !GV->isDeclaration()))
+ return true;
+
+ return false;
+ }
+ return false;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzeroupper instructions before function calls.
+bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+ MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ DebugLoc dl = I->getDebugLoc();
+
+ // Insert a vzeroupper instruction before each control transfer
+ // to functions outside this module
+ if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) {
+ BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+ ++NumVZU;
+ }
+ }
+
+ return Changed;
+}
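To make the pass concrete, a hypothetical source-level scenario (the function is illustrative, not from the patch): AVX code dirties the upper ymm state and then calls out of the module, which is exactly where processBasicBlock above inserts a vzeroupper. With this patch the pass is opt-in, enabled via the -x86-use-vzeroupper flag registered in X86TargetMachine.cpp.

    // If the loop vectorizes to 256-bit AVX ops, the upper ymm halves are
    // dirty at the call. sinf() may live in an SSE-compiled libm, so a
    // vzeroupper before the call avoids the AVX<->SSE transition stall.
    extern "C" float sinf(float);   // external: not a "module" function

    void axpy_then_call(float *out, const float *a, const float *b, float *s) {
      for (int i = 0; i < 256; ++i)
        out[i] = a[i] + b[i];
      *s = sinf(out[0]);            // vzeroupper inserted before this call
    }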
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index a1d73c6..3dc51e1 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -1,11 +1,12 @@
set(LLVM_TARGET_DEFINITIONS XCore.td)
-tablegen(XCoreGenRegisterInfo.inc -gen-register-info)
-tablegen(XCoreGenInstrInfo.inc -gen-instr-info)
-tablegen(XCoreGenAsmWriter.inc -gen-asm-writer)
-tablegen(XCoreGenDAGISel.inc -gen-dag-isel)
-tablegen(XCoreGenCallingConv.inc -gen-callingconv)
-tablegen(XCoreGenSubtargetInfo.inc -gen-subtarget)
+llvm_tablegen(XCoreGenRegisterInfo.inc -gen-register-info)
+llvm_tablegen(XCoreGenInstrInfo.inc -gen-instr-info)
+llvm_tablegen(XCoreGenAsmWriter.inc -gen-asm-writer)
+llvm_tablegen(XCoreGenDAGISel.inc -gen-dag-isel)
+llvm_tablegen(XCoreGenCallingConv.inc -gen-callingconv)
+llvm_tablegen(XCoreGenSubtargetInfo.inc -gen-subtarget)
+add_public_tablegen_target(XCoreCommonTableGen)
add_llvm_target(XCoreCodeGen
XCoreAsmPrinter.cpp
@@ -20,5 +21,17 @@ add_llvm_target(XCoreCodeGen
XCoreSelectionDAGInfo.cpp
)
+add_llvm_library_dependencies(LLVMXCoreCodeGen
+ LLVMAsmPrinter
+ LLVMCodeGen
+ LLVMCore
+ LLVMMC
+ LLVMSelectionDAG
+ LLVMSupport
+ LLVMTarget
+ LLVMXCoreDesc
+ LLVMXCoreInfo
+ )
+
add_subdirectory(TargetInfo)
add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
index c3b3dc9..269822d 100644
--- a/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/XCore/MCTargetDesc/CMakeLists.txt
@@ -3,5 +3,12 @@ add_llvm_library(LLVMXCoreDesc
XCoreMCAsmInfo.cpp
)
+add_llvm_library_dependencies(LLVMXCoreDesc
+ LLVMMC
+ LLVMXCoreInfo
+ )
+
+add_dependencies(LLVMXCoreDesc XCoreCommonTableGen)
+
# Hack: we need to include 'main' target directory to grab private headers
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 939d97c..276e841 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -13,10 +13,11 @@
#include "XCoreMCTargetDesc.h"
#include "XCoreMCAsmInfo.h"
+#include "llvm/MC/MCCodeGenInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_MC_DESC
#include "XCoreGenInstrInfo.inc"
@@ -35,8 +36,10 @@ static MCInstrInfo *createXCoreMCInstrInfo() {
return X;
}
-extern "C" void LLVMInitializeXCoreMCInstrInfo() {
- TargetRegistry::RegisterMCInstrInfo(TheXCoreTarget, createXCoreMCInstrInfo);
+static MCRegisterInfo *createXCoreMCRegisterInfo(StringRef TT) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitXCoreMCRegisterInfo(X, XCore::LR);
+ return X;
}
static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
@@ -46,11 +49,40 @@ static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
return X;
}
-extern "C" void LLVMInitializeXCoreMCSubtargetInfo() {
- TargetRegistry::RegisterMCSubtargetInfo(TheXCoreTarget,
- createXCoreMCSubtargetInfo);
+static MCAsmInfo *createXCoreMCAsmInfo(const Target &T, StringRef TT) {
+ MCAsmInfo *MAI = new XCoreMCAsmInfo(T, TT);
+
+ // Initial state of the frame pointer is SP.
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(XCore::SP, 0);
+ MAI->addInitialFrameState(0, Dst, Src);
+
+ return MAI;
+}
+
+static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+ CodeModel::Model CM) {
+ MCCodeGenInfo *X = new MCCodeGenInfo();
+ X->InitMCCodeGenInfo(RM, CM);
+ return X;
}
-extern "C" void LLVMInitializeXCoreMCAsmInfo() {
- RegisterMCAsmInfo<XCoreMCAsmInfo> X(TheXCoreTarget);
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreTargetMC() {
+ // Register the MC asm info.
+ RegisterMCAsmInfoFn X(TheXCoreTarget, createXCoreMCAsmInfo);
+
+ // Register the MC codegen info.
+ TargetRegistry::RegisterMCCodeGenInfo(TheXCoreTarget,
+ createXCoreMCCodeGenInfo);
+
+ // Register the MC instruction info.
+ TargetRegistry::RegisterMCInstrInfo(TheXCoreTarget, createXCoreMCInstrInfo);
+
+ // Register the MC register info.
+ TargetRegistry::RegisterMCRegInfo(TheXCoreTarget, createXCoreMCRegisterInfo);
+
+ // Register the MC subtarget info.
+ TargetRegistry::RegisterMCSubtargetInfo(TheXCoreTarget,
+ createXCoreMCSubtargetInfo);
}
diff --git a/lib/Target/XCore/TargetInfo/CMakeLists.txt b/lib/Target/XCore/TargetInfo/CMakeLists.txt
index c147b8a..7f84f69 100644
--- a/lib/Target/XCore/TargetInfo/CMakeLists.txt
+++ b/lib/Target/XCore/TargetInfo/CMakeLists.txt
@@ -4,4 +4,10 @@ add_llvm_library(LLVMXCoreInfo
XCoreTargetInfo.cpp
)
-add_dependencies(LLVMXCoreInfo XCoreCodeGenTable_gen)
+add_llvm_library_dependencies(LLVMXCoreInfo
+ LLVMMC
+ LLVMSupport
+ LLVMTarget
+ )
+
+add_dependencies(LLVMXCoreInfo XCoreCommonTableGen)
diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
index 7aa8965..9a0971d 100644
--- a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
+++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -9,7 +9,7 @@
#include "XCore.h"
#include "llvm/Module.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
Target llvm::TheXCoreTarget;
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 1a43714..8906b24 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -20,6 +20,7 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
+#include "llvm/Analysis/DebugInfo.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,11 +33,11 @@
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cctype>
@@ -51,6 +52,7 @@ static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional,
namespace {
class XCoreAsmPrinter : public AsmPrinter {
const XCoreSubtarget &Subtarget;
+ void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
public:
explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
: AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()){}
@@ -79,6 +81,7 @@ namespace {
void EmitFunctionEntryLabel();
void EmitInstruction(const MachineInstr *MI);
void EmitFunctionBodyEnd();
+ virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
};
} // end of anonymous namespace
@@ -88,7 +91,7 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
assert(((GV->hasExternalLinkage() ||
GV->hasWeakLinkage()) ||
GV->hasLinkOnceLinkage()) && "Unexpected linkage");
- if (const ArrayType *ATy = dyn_cast<ArrayType>(
+ if (ArrayType *ATy = dyn_cast<ArrayType>(
cast<PointerType>(GV->getType())->getElementType())) {
OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
// FIXME: MCStreamerize.
@@ -261,16 +264,57 @@ bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
+void XCoreAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+ raw_ostream &OS) {
+ unsigned NOps = MI->getNumOperands();
+ assert(NOps == 4);
+ OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+ // cast away const; DIetc do not take const operands for some reason.
+ DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
+ OS << V.getName();
+ OS << " <- ";
+ // Frame address. Currently handles register +- offset only.
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+ OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
+ OS << ']';
+ OS << "+";
+ printOperand(MI, NOps-2, OS);
+}
+
+MachineLocation XCoreAsmPrinter::
+getDebugValueLocation(const MachineInstr *MI) const {
+ // Handles frame addresses emitted in XCoreInstrInfo::emitFrameIndexDebugValue.
+ assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+ assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
+ "Unexpected MachineOperand types");
+ return MachineLocation(MI->getOperand(0).getReg(),
+ MI->getOperand(1).getImm());
+}
+
void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallString<128> Str;
raw_svector_ostream O(Str);
- // Check for mov mnemonic
- if (MI->getOpcode() == XCore::ADD_2rus && !MI->getOperand(2).getImm())
- O << "\tmov " << getRegisterName(MI->getOperand(0).getReg()) << ", "
- << getRegisterName(MI->getOperand(1).getReg());
- else
- printInstruction(MI, O);
+ switch (MI->getOpcode()) {
+ case XCore::DBG_VALUE: {
+ if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+ SmallString<128> TmpStr;
+ raw_svector_ostream OS(TmpStr);
+ PrintDebugValueComment(MI, OS);
+ OutStreamer.EmitRawText(StringRef(OS.str()));
+ }
+ return;
+ }
+ case XCore::ADD_2rus:
+ if (MI->getOperand(2).getImm() == 0) {
+ O << "\tmov " << getRegisterName(MI->getOperand(0).getReg()) << ", "
+ << getRegisterName(MI->getOperand(1).getReg());
+ OutStreamer.EmitRawText(O.str());
+ return;
+ }
+ break;
+ }
+ printInstruction(MI, O);
OutStreamer.EmitRawText(O.str());
}
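Both XCore debug hooks above assume the four-operand DBG_VALUE built by XCoreInstrInfo::emitFrameIndexDebugValue later in this patch. A sketch of that layout, with hypothetical names:

    // Operand layout of the frame-index DBG_VALUE as built in
    // XCoreInstrInfo::emitFrameIndexDebugValue (enum is illustrative):
    enum XCoreDbgValueOperand {
      DV_Addr   = 0, // frame index; a register after frame-index elimination
      DV_Zero   = 1, // immediate 0, selecting the reg+offset form
      DV_Offset = 2, // byte offset from the base
      DV_Var    = 3  // MDNode for the source variable (wrapped as DIVariable)
    };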
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 0578220..7f8b169 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -100,7 +100,8 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
bool FP = hasFP(MF);
- bool Nested = MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::Nest);
+ bool Nested = MF.getFunction()->
+ getAttributes().hasAttrSomewhere(Attribute::Nest);
if (Nested) {
loadFromStack(MBB, MBBI, XCore::R11, 0, dl, TII);
@@ -270,14 +271,6 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-void XCoreFrameLowering::getInitialFrameState(std::vector<MachineMove> &Moves)
- const {
- // Initial state of the frame pointer is SP.
- MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(XCore::SP, 0);
- Moves.push_back(MachineMove(0, Dst, Src));
-}
-
bool XCoreFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 7da19f0..c591e93 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -42,8 +42,6 @@ namespace llvm {
bool hasFP(const MachineFunction &MF) const;
- void getInitialFrameState(std::vector<MachineMove> &Moves) const;
-
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index a8dd847..4dac1ce 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -169,9 +169,14 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
CurDAG->getTargetConstantPool(ConstantInt::get(
Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI.getPointerTy());
- return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
- MVT::Other, CPIdx,
- CurDAG->getEntryNode());
+ SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
+ MVT::Other, CPIdx,
+ CurDAG->getEntryNode());
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = MF->getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4);
+ cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1);
+ return node;
}
break;
}
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 6d040e0..2afe0e3 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -81,6 +81,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
// Use i32 for setcc operations results (slt, sgt, ...).
setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
// XCore does not have the NodeTypes below.
setOperationAction(ISD::BR_CC, MVT::Other, Expand);
@@ -147,7 +148,8 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
// TRAMPOLINE is custom lowered.
- setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+ setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
maxStoresPerMemset = maxStoresPerMemsetOptSize = 4;
maxStoresPerMemmove = maxStoresPerMemmoveOptSize
@@ -180,7 +182,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ADD:
case ISD::SUB: return ExpandADDSUB(Op.getNode(), DAG);
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
- case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG);
+ case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
+ case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
default:
llvm_unreachable("unimplemented operand");
return SDValue();
@@ -252,8 +255,8 @@ static inline SDValue BuildGetId(SelectionDAG &DAG, DebugLoc dl) {
DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
}
-static inline bool isZeroLengthArray(const Type *Ty) {
- const ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
+static inline bool isZeroLengthArray(Type *Ty) {
+ ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
return AT && (AT->getNumElements() == 0);
}
@@ -275,7 +278,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
llvm_unreachable("Thread local object not a GlobalVariable?");
return SDValue();
}
- const Type *Ty = cast<PointerType>(GV->getType())->getElementType();
+ Type *Ty = cast<PointerType>(GV->getType())->getElementType();
if (!Ty->isSized() || isZeroLengthArray(Ty)) {
#ifndef NDEBUG
errs() << "Size of thread local object " << GVar->getName()
@@ -465,7 +468,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
// Lower to a call to __misaligned_load(BasePtr).
- const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -524,7 +527,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
}
// Lower to a call to __misaligned_store(BasePtr, Value).
- const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -789,7 +792,12 @@ SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
}
SDValue XCoreTargetLowering::
-LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
+ return Op.getOperand(0);
+}
+
+SDValue XCoreTargetLowering::
+LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
SDValue Trmp = Op.getOperand(1); // trampoline
SDValue FPtr = Op.getOperand(2); // nested function
@@ -841,9 +849,7 @@ LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
MachinePointerInfo(TrmpAddr, 16), false, false,
0);
- SDValue Ops[] =
- { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5) };
- return DAG.getMergeValues(Ops, 2, dl);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5);
}
//===----------------------------------------------------------------------===//
@@ -1148,10 +1154,10 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
int offset = 0;
// Save remaining registers, storing higher register numbers at a higher
// address
- for (unsigned i = array_lengthof(ArgRegs) - 1; i >= FirstVAReg; --i) {
+ for (int i = array_lengthof(ArgRegs) - 1; i >= (int)FirstVAReg; --i) {
// Create a stack slot
int FI = MFI->CreateFixedObject(4, offset, true);
- if (i == FirstVAReg) {
+ if (i == (int)FirstVAReg) {
XFI->setVarArgsFrameIndex(FI);
}
offset -= StackSlotSize;
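The signedness change above guards against unsigned wrap-around: when FirstVAReg is 0, the old loop condition could never become false. A minimal reproduction of the bug pattern (illustrative only):

    // With unsigned i, "i >= 0" is a tautology: once i reaches 0, --i
    // wraps to UINT_MAX and the loop never terminates. Using int, as the
    // fix does, makes the descending bound check meaningful.
    for (unsigned i = 3; i >= 0u; --i) { /* never terminates */ }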
@@ -1409,7 +1415,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
// operands are constant canonicalize smallest to RHS.
if ((N0C && !N1C) ||
(N0C && N1C && N0C->getZExtValue() < N1C->getZExtValue()))
- return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT), N1, N0, N2, N3);
+ return DAG.getNode(XCoreISD::LMUL, dl, DAG.getVTList(VT, VT),
+ N1, N0, N2, N3);
// lmul(x, 0, a, b)
if (N1C && N1C->isNullValue()) {
@@ -1548,7 +1555,7 @@ static inline bool isImmUs4(int64_t val)
/// by AM is legal for this target, for a load/store of the specified type.
bool
XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const {
+ Type *Ty) const {
if (Ty->getTypeID() == Type::VoidTyID)
return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 9c803be..d6c5b32 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -101,7 +101,7 @@ namespace llvm {
MachineBasicBlock *MBB) const;
virtual bool isLegalAddressingMode(const AddrMode &AM,
- const Type *Ty) const;
+ Type *Ty) const;
private:
const XCoreTargetMachine &TM;
@@ -145,7 +145,8 @@ namespace llvm {
SDValue LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
// Inline asm support
std::pair<unsigned, const TargetRegisterClass*>
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index f90481f..a0946a1 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -17,11 +17,10 @@
#include "llvm/MC/MCContext.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
-#include "llvm/Target/TargetRegistry.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_INSTRINFO_CTOR
#include "XCoreGenInstrInfo.inc"
@@ -387,6 +386,15 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(0);
}
+MachineInstr*
+XCoreInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+ uint64_t Offset, const MDNode *MDPtr,
+ DebugLoc DL) const {
+ MachineInstrBuilder MIB = BuildMI(MF, DL, get(XCore::DBG_VALUE))
+ .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
+ return &*MIB;
+}
+
/// ReverseBranchCondition - Return the inverse opcode of the
/// specified Branch instruction.
bool XCoreInstrInfo::
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 840b1e1..d354802 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -78,6 +78,11 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const;
+ virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
+ int FrameIx,
+ uint64_t Offset,
+ const MDNode *MDPtr,
+ DebugLoc DL) const;
virtual bool ReverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const;
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 55c7527..4d2e93b 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -572,7 +572,7 @@ def STWDP_lru6 : _FLRU6<(outs), (ins GRRegs:$val, MEMii:$addr),
[(store GRRegs:$val, ADDRdpii:$addr)]>;
//let Uses = [CP] in ..
-let mayLoad = 1, isReMaterializable = 1 in
+let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in
defm LDWCP : FRU6_LRU6_cp<"ldw">;
let Uses = [SP] in {
@@ -739,7 +739,7 @@ def LDAP_lu10_ba : _FLU10<(outs),
let isCall=1,
// All calls clobber the link register and the non-callee-saved registers:
-Defs = [R0, R1, R2, R3, R11, LR] in {
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
def BL_u10 : _FU10<
(outs),
(ins calltarget:$target, variable_ops),
@@ -754,7 +754,7 @@ def BL_lu10 : _FLU10<
}
// Two operand short
-// TODO eet, eef, testwct, tsetmr, sext (reg), zext (reg)
+// TODO eet, eef, tsetmr
def NOT : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
"not $dst, $b",
[(set GRRegs:$dst, (not GRRegs:$b))]>;
@@ -764,15 +764,25 @@ def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
[(set GRRegs:$dst, (ineg GRRegs:$b))]>;
let Constraints = "$src1 = $dst" in {
-let neverHasSideEffects = 1 in
def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
- "sext $dst, $src2",
- []>;
+ "sext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+ immBitp:$src2))]>;
+
+def SEXT_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+ "sext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1,
+ GRRegs:$src2))]>;
-let neverHasSideEffects = 1 in
def ZEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
- "zext $dst, $src2",
- []>;
+ "zext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+ immBitp:$src2))]>;
+
+def ZEXT_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+ "zext $dst, $src2",
+ [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1,
+ GRRegs:$src2))]>;
def ANDNOT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
"andnot $dst, $src2",
@@ -819,7 +829,8 @@ def OUT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val),
let Constraints = "$src = $dst" in
def OUTSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src),
"outshr res[$r], $src",
- [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>;
+ [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r,
+ GRRegs:$src))]>;
def INCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r),
"inct $dst, res[$r]",
@@ -836,7 +847,8 @@ def IN_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r),
let Constraints = "$src = $dst" in
def INSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src),
"inshr $dst, res[$r]",
- [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>;
+ [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r,
+ GRRegs:$src))]>;
def CHKCT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val),
"chkct res[$r], $val",
@@ -846,6 +858,14 @@ def CHKCT_rus : _F2R<(outs), (ins GRRegs:$r, i32imm:$val),
"chkct res[$r], $val",
[(int_xcore_chkct GRRegs:$r, immUs:$val)]>;
+def TESTCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+ "testct $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_testct GRRegs:$src))]>;
+
+def TESTWCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+ "testwct $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_testwct GRRegs:$src))]>;
+
def SETD_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val),
"setd res[$r], $val",
[(int_xcore_setd GRRegs:$r, GRRegs:$val)]>;
@@ -871,7 +891,6 @@ def INITDP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src),
[(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>;
// Two operand long
-// TODO endin, peek,
// getd, testlcl
def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
"bitrev $dst, $src",
@@ -917,6 +936,14 @@ def SETPSC_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2),
"setpsc res[$src1], $src2",
[(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>;
+def PEEK_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+ "peek $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>;
+
+def ENDIN_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+ "endin $dst, res[$src]",
+ [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>;
+
// One operand short
// TODO edu, eeu, waitet, waitef, tstart, clrtp
// setdp, setcp, setev, kcall
@@ -960,7 +987,7 @@ def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src),
let isCall=1,
// All calls clobber the link register and the non-callee-saved registers:
-Defs = [R0, R1, R2, R3, R11, LR] in {
+Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops),
"bla $addr",
[(XCoreBranchLink GRRegs:$addr)]>;
@@ -974,10 +1001,15 @@ def FREER_1r : _F1R<(outs), (ins GRRegs:$r),
"freer res[$r]",
[(int_xcore_freer GRRegs:$r)]>;
-let Uses=[R11] in
+let Uses=[R11] in {
def SETV_1r : _F1R<(outs), (ins GRRegs:$r),
- "setv res[$r], r11",
- [(int_xcore_setv GRRegs:$r, R11)]>;
+ "setv res[$r], r11",
+ [(int_xcore_setv GRRegs:$r, R11)]>;
+
+def SETEV_1r : _F1R<(outs), (ins GRRegs:$r),
+ "setev res[$r], r11",
+ [(int_xcore_setev GRRegs:$r, R11)]>;
+}
def EEU_1r : _F1R<(outs), (ins GRRegs:$r),
"eeu res[$r]",
@@ -985,15 +1017,24 @@ def EEU_1r : _F1R<(outs), (ins GRRegs:$r),
// Zero operand short
// TODO freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
-// stet, geted, getet, getkep, getksp, setkep, getid, kret, dcall, dret,
+// stet, getkep, getksp, setkep, getid, kret, dcall, dret,
// dentsp, drestsp
def CLRE_0R : _F0R<(outs), (ins), "clre", [(int_xcore_clre)]>;
-let Defs = [R11] in
+let Defs = [R11] in {
def GETID_0R : _F0R<(outs), (ins),
- "get r11, id",
- [(set R11, (int_xcore_getid))]>;
+ "get r11, id",
+ [(set R11, (int_xcore_getid))]>;
+
+def GETED_0R : _F0R<(outs), (ins),
+ "get r11, ed",
+ [(set R11, (int_xcore_geted))]>;
+
+def GETET_0R : _F0R<(outs), (ins),
+ "get r11, et",
+ [(set R11, (int_xcore_getet))]>;
+}
def SSYNC_0r : _F0R<(outs), (ins),
"ssync",
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 357a4a0..1b78b37 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineLocation.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -39,7 +38,7 @@
using namespace llvm;
XCoreRegisterInfo::XCoreRegisterInfo(const TargetInstrInfo &tii)
- : XCoreGenRegisterInfo(), TII(tii) {
+ : XCoreGenRegisterInfo(XCore::LR), TII(tii) {
}
// helper functions
@@ -321,20 +320,8 @@ loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
}
-int XCoreRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
- return XCoreGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
-}
-
-int XCoreRegisterInfo::getLLVMRegNum(unsigned DwarfRegNo, bool isEH) const {
- return XCoreGenRegisterInfo::getLLVMRegNumFull(DwarfRegNo,0);
-}
-
unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
return TFI->hasFP(MF) ? XCore::R10 : XCore::SP;
}
-
-unsigned XCoreRegisterInfo::getRARegister() const {
- return XCore::LR;
-}
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 801d9eb..5c28f39 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -60,7 +60,6 @@ public:
int SPAdj, RegScavenger *RS = NULL) const;
// Debug information queries.
- unsigned getRARegister() const;
unsigned getFrameRegister(const MachineFunction &MF) const;
//! Return the array of argument passing registers
@@ -74,10 +73,6 @@ public:
//! Return whether to emit frame moves
static bool needsFrameMoves(const MachineFunction &MF);
-
- //! Get DWARF debugging register number
- int getDwarfRegNum(unsigned RegNum, bool isEH) const;
- int getLLVMRegNum(unsigned RegNum, bool isEH) const;
};
} // end namespace llvm
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index ad069bf..b4e9927 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -13,7 +13,7 @@
#include "XCoreSubtarget.h"
#include "XCore.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 342966a..fdc5d35 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -14,15 +14,15 @@
#include "XCore.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
/// XCoreTargetMachine ctor - Create an ILP32 architecture model
///
-XCoreTargetMachine::XCoreTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU,
- const std::string &FS)
- : LLVMTargetMachine(T, TT, CPU, FS),
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM)
+ : LLVMTargetMachine(T, TT, CPU, FS, RM, CM),
Subtarget(TT, CPU, FS),
DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
"i16:16:32-i32:32:32-i64:32:32-n32"),
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 6235ac3..83d09d6d 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -32,8 +32,9 @@ class XCoreTargetMachine : public LLVMTargetMachine {
XCoreTargetLowering TLInfo;
XCoreSelectionDAGInfo TSInfo;
public:
- XCoreTargetMachine(const Target &T, const std::string &TT,
- const std::string &CPU, const std::string &FS);
+ XCoreTargetMachine(const Target &T, StringRef TT,
+ StringRef CPU, StringRef FS,
+ Reloc::Model RM, CodeModel::Model CM);
virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
virtual const XCoreFrameLowering *getFrameLowering() const {
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index fa007cf..e160f63 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -155,12 +155,12 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
for (unsigned i = 0; i != PointerArgs.size(); ++i) {
bool isByVal = F->paramHasAttr(PointerArgs[i].second+1, Attribute::ByVal);
Argument *PtrArg = PointerArgs[i].first;
- const Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+ Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
// If this is a byval argument, and if the aggregate type is small, just
// pass the elements, which is always safe.
if (isByVal) {
- if (const StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
if (maxElements > 0 && STy->getNumElements() > maxElements) {
DEBUG(dbgs() << "argpromotion disable promoting argument '"
<< PtrArg->getName() << "' because it would require adding more"
@@ -190,7 +190,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
// If the argument is a recursive type and we're in a recursive
// function, we could end up infinitely peeling the function argument.
if (isSelfRecursive) {
- if (const StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
bool RecursiveType = false;
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
if (STy->getElementType(i) == PtrArg->getType()) {
@@ -382,7 +382,8 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
User *U = *UI;
Operands.clear();
if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (LI->isVolatile()) return false; // Don't hack volatile loads
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple()) return false;
Loads.push_back(LI);
// Direct loads are equivalent to a GEP with a zero index and then a load.
Operands.push_back(0);
@@ -410,7 +411,8 @@ bool ArgPromotion::isSafeToPromoteArgument(Argument *Arg, bool isByVal) const {
for (Value::use_iterator UI = GEP->use_begin(), E = GEP->use_end();
UI != E; ++UI)
if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
- if (LI->isVolatile()) return false; // Don't hack volatile loads
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple()) return false;
Loads.push_back(LI);
} else {
// Other uses than load?
@@ -492,7 +494,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// Start by computing a new prototype for the function, which is the same as
// the old function, but has modified arguments.
- const FunctionType *FTy = F->getFunctionType();
+ FunctionType *FTy = F->getFunctionType();
std::vector<Type*> Params;
typedef std::set<IndicesVector> ScalarizeTable;
@@ -527,8 +529,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
++I, ++ArgIndex) {
if (ByValArgsToTransform.count(I)) {
// Simple byval argument? Just add all the struct element types.
- const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- const StructType *STy = cast<StructType>(AgTy);
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
Params.push_back(STy->getElementType(i));
++NumByValArgsPromoted;
@@ -576,9 +578,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
for (ScalarizeTable::iterator SI = ArgIndices.begin(),
E = ArgIndices.end(); SI != E; ++SI) {
// not allowed to dereference ->begin() if size() is 0
- Params.push_back(GetElementPtrInst::getIndexedType(I->getType(),
- SI->begin(),
- SI->end()));
+ Params.push_back(GetElementPtrInst::getIndexedType(I->getType(), *SI));
assert(Params.back());
}
@@ -593,7 +593,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
if (Attributes attrs = PAL.getFnAttributes())
AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
- const Type *RetTy = FTy->getReturnType();
+ Type *RetTy = FTy->getReturnType();
// Work around LLVM bug PR56: the CWriter cannot emit varargs functions which
// have zero fixed arguments.
@@ -662,13 +662,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
} else if (ByValArgsToTransform.count(I)) {
// Emit a GEP and load for each element of the struct.
- const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- const StructType *STy = cast<StructType>(AgTy);
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
Value *Idxs[2] = {
ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(*AI, Idxs, Idxs+2,
+ Value *Idx = GetElementPtrInst::Create(*AI, Idxs,
(*AI)->getName()+"."+utostr(i),
Call);
// TODO: Tell AA about the new values?
@@ -686,12 +686,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
LoadInst *OrigLoad = OriginalLoads[*SI];
if (!SI->empty()) {
Ops.reserve(SI->size());
- const Type *ElTy = V->getType();
+ Type *ElTy = V->getType();
for (IndicesVector::const_iterator II = SI->begin(),
IE = SI->end(); II != IE; ++II) {
// Use i32 to index structs, and i64 for others (pointers/arrays).
// This satisfies GEP constraints.
- const Type *IdxTy = (ElTy->isStructTy() ?
+ Type *IdxTy = (ElTy->isStructTy() ?
Type::getInt32Ty(F->getContext()) :
Type::getInt64Ty(F->getContext()));
Ops.push_back(ConstantInt::get(IdxTy, *II));
@@ -699,8 +699,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(*II);
}
// And create a GEP to extract those indices.
- V = GetElementPtrInst::Create(V, Ops.begin(), Ops.end(),
- V->getName()+".idx", Call);
+ V = GetElementPtrInst::Create(V, Ops, V->getName()+".idx", Call);
Ops.clear();
AA.copyValue(OrigLoad->getOperand(0), V);
}
@@ -792,16 +791,16 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
Instruction *InsertPt = NF->begin()->begin();
// Just add all the struct element types.
- const Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
Value *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt);
- const StructType *STy = cast<StructType>(AgTy);
+ StructType *STy = cast<StructType>(AgTy);
Value *Idxs[2] = {
ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), 0 };
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
Value *Idx =
- GetElementPtrInst::Create(TheAlloca, Idxs, Idxs+2,
+ GetElementPtrInst::Create(TheAlloca, Idxs,
TheAlloca->getName()+"."+Twine(i),
InsertPt);
I2->setName(I->getName()+"."+Twine(i));
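
The recurring isVolatile() to isSimple() substitution in this file tracks the new atomic load/store support; roughly, a sketch (helper name assumed):

#include "llvm/Instructions.h"
using namespace llvm;

// Sketch: with atomics in the IR, legality checks that used to reject only
// volatile accesses now require "simple" ones. isSimple() is
// !isVolatile() && !isAtomic(), so atomic loads are excluded as well.
static bool safeToPromote(const LoadInst *LI) {
  return LI->isSimple();
}
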
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 3de7bfc..4d8dbc2 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -13,10 +13,20 @@ add_llvm_library(LLVMipo
Inliner.cpp
Internalize.cpp
LoopExtractor.cpp
- LowerSetJmp.cpp
MergeFunctions.cpp
PartialInlining.cpp
+ PassManagerBuilder.cpp
PruneEH.cpp
StripDeadPrototypes.cpp
StripSymbols.cpp
)
+
+add_llvm_library_dependencies(LLVMipo
+ LLVMAnalysis
+ LLVMCore
+ LLVMScalarOpts
+ LLVMSupport
+ LLVMTarget
+ LLVMTransformUtils
+ LLVMipa
+ )
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index a21efce..c3ecb7a 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -23,7 +23,9 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -37,10 +39,18 @@ namespace {
initializeConstantMergePass(*PassRegistry::getPassRegistry());
}
- // run - For this pass, process all of the globals in the module,
- // eliminating duplicate constants.
- //
+ // For this pass, process all of the globals in the module, eliminating
+ // duplicate constants.
bool runOnModule(Module &M);
+
+ // Return true iff we can determine the alignment of this global variable.
+ bool hasKnownAlignment(GlobalVariable *GV) const;
+
+ // Return the alignment of the global, including converting the default
+ // alignment to a concrete value.
+ unsigned getAlignment(GlobalVariable *GV) const;
+
+ const TargetData *TD;
};
}
@@ -77,15 +87,28 @@ static bool IsBetterCannonical(const GlobalVariable &A,
return A.hasUnnamedAddr();
}
+bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
+ return TD || GV->getAlignment() != 0;
+}
+
+unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+ if (TD)
+ return TD->getPreferredAlignment(GV);
+ return GV->getAlignment();
+}
+
bool ConstantMerge::runOnModule(Module &M) {
+ TD = getAnalysisIfAvailable<TargetData>();
+
// Find all the globals that are marked "used". These cannot be merged.
SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
- // Map unique constant/section pairs to globals. We don't want to merge
- // globals in different sections.
- DenseMap<Constant*, GlobalVariable*> CMap;
+  // Map unique <constant, has-unknown-alignment> pairs to globals. We don't
+ // want to merge globals of unknown alignment with those of explicit
+ // alignment. If we have TargetData, we always know the alignment.
+ DenseMap<PointerIntPair<Constant*, 1, bool>, GlobalVariable*> CMap;
// Replacements - This vector contains a list of replacements to perform.
SmallVector<std::pair<GlobalVariable*, GlobalVariable*>, 32> Replacements;
@@ -120,7 +143,8 @@ bool ConstantMerge::runOnModule(Module &M) {
Constant *Init = GV->getInitializer();
// Check to see if the initializer is already known.
- GlobalVariable *&Slot = CMap[Init];
+ PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
+ GlobalVariable *&Slot = CMap[Pair];
// If this is the first constant we find or if the old one is local,
// replace with the current one. If the current is externally visible
@@ -152,7 +176,8 @@ bool ConstantMerge::runOnModule(Module &M) {
Constant *Init = GV->getInitializer();
// Check to see if the initializer is already known.
- GlobalVariable *Slot = CMap[Init];
+ PointerIntPair<Constant*, 1, bool> Pair(Init, hasKnownAlignment(GV));
+ GlobalVariable *Slot = CMap[Pair];
if (!Slot || Slot == GV)
continue;
@@ -175,6 +200,14 @@ bool ConstantMerge::runOnModule(Module &M) {
// now. This avoids invalidating the pointers in CMap, which are unneeded
// now.
for (unsigned i = 0, e = Replacements.size(); i != e; ++i) {
+ // Bump the alignment if necessary.
+ if (Replacements[i].first->getAlignment() ||
+ Replacements[i].second->getAlignment()) {
+ Replacements[i].second->setAlignment(std::max(
+ Replacements[i].first->getAlignment(),
+ Replacements[i].second->getAlignment()));
+ }
+
// Eliminate any uses of the dead global.
Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
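
A compact illustration of the new map key (identifier names assumed): PointerIntPair packs the flag into the low, alignment-guaranteed bits of the pointer, so keying on an <initializer, known-alignment> pair costs no extra space.

#include "llvm/Constants.h"
#include "llvm/GlobalVariable.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerIntPair.h"
using namespace llvm;

// Sketch: globals of unknown alignment land in different slots from globals
// with a known alignment, even when their initializers are identical.
typedef PointerIntPair<Constant*, 1, bool> InitAlignKey;

void recordCanonical(DenseMap<InitAlignKey, GlobalVariable*> &CMap,
                     Constant *Init, bool KnownAlign, GlobalVariable *GV) {
  CMap[InitAlignKey(Init, KnownAlign)] = GV;
}
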
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 1517765..4bb6f7a 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -206,7 +206,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
// Start by computing a new prototype for the function, which is the same as
// the old function, but doesn't have isVarArg set.
- const FunctionType *FTy = Fn.getFunctionType();
+ FunctionType *FTy = Fn.getFunctionType();
std::vector<Type*> Params(FTy->param_begin(), FTy->param_end());
FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
@@ -344,7 +344,7 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
static unsigned NumRetVals(const Function *F) {
if (F->getReturnType()->isVoidTy())
return 0;
- else if (const StructType *STy = dyn_cast<StructType>(F->getReturnType()))
+ else if (StructType *STy = dyn_cast<StructType>(F->getReturnType()))
return STy->getNumElements();
else
return 1;
@@ -491,7 +491,7 @@ void DAE::SurveyFunction(const Function &F) {
// Keep track of the number of live retvals, so we can skip checks once all
// of them turn out to be live.
unsigned NumLiveRetVals = 0;
- const Type *STy = dyn_cast<StructType>(F.getReturnType());
+ Type *STy = dyn_cast<StructType>(F.getReturnType());
// Loop all uses of the function.
for (Value::const_use_iterator I = F.use_begin(), E = F.use_end();
I != E; ++I) {
@@ -646,7 +646,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// Start by computing a new prototype for the function, which is the same as
// the old function, but has fewer arguments and a different return type.
- const FunctionType *FTy = F->getFunctionType();
+ FunctionType *FTy = F->getFunctionType();
std::vector<Type*> Params;
// Set up to build a new list of parameter attributes.
@@ -660,7 +660,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// Find out the new return value.
Type *RetTy = FTy->getReturnType();
- const Type *NRetTy = NULL;
+ Type *NRetTy = NULL;
unsigned RetCount = NumRetVals(F);
// -1 means unused, other numbers are the new index
@@ -669,7 +669,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
if (RetTy->isVoidTy()) {
NRetTy = RetTy;
} else {
- const StructType *STy = dyn_cast<StructType>(RetTy);
+ StructType *STy = dyn_cast<StructType>(RetTy);
if (STy)
// Look at each of the original return values individually.
for (unsigned i = 0; i != RetCount; ++i) {
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 95decec..0edf342 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -163,14 +163,14 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
ReadsMemory = true;
continue;
} else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- // Ignore non-volatile loads from local memory.
+ // Ignore non-volatile loads from local memory. (Atomic is okay here.)
if (!LI->isVolatile()) {
AliasAnalysis::Location Loc = AA->getLocation(LI);
if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
continue;
}
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Ignore non-volatile stores to local memory.
+ // Ignore non-volatile stores to local memory. (Atomic is okay here.)
if (!SI->isVolatile()) {
AliasAnalysis::Location Loc = AA->getLocation(SI);
if (AA->pointsToConstantMemory(Loc, /*OrLocal=*/true))
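
The "(Atomic is okay here.)" remarks record a deliberate asymmetry; approximately (helper name assumed):

#include "llvm/Instructions.h"
#include "llvm/Analysis/AliasAnalysis.h"
using namespace llvm;

// Sketch: for readonly/readnone inference only volatility is disqualifying;
// an atomic but non-volatile load of constant or provably-local memory still
// does not count as reading externally visible state.
static bool countsAsRead(AliasAnalysis &AA, LoadInst *LI) {
  if (LI->isVolatile())
    return true;
  AliasAnalysis::Location Loc = AA.getLocation(LI);
  return !AA.pointsToConstantMemory(Loc, /*OrLocal=*/true);
}
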
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 4ac721d..3552d03 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -195,12 +195,14 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
}
if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
GS.isLoaded = true;
- if (LI->isVolatile()) return true; // Don't hack on volatile loads.
+ // Don't hack on volatile/atomic loads.
+ if (!LI->isSimple()) return true;
} else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
// Don't allow a store OF the address, only stores TO the address.
if (SI->getOperand(0) == V) return true;
- if (SI->isVolatile()) return true; // Don't hack on volatile stores.
+ // Don't hack on volatile/atomic stores.
+ if (!SI->isSimple()) return true;
// If this is a direct store to the global (i.e., the global is a scalar
// value, not an aggregate), keep more specific information about
@@ -281,18 +283,18 @@ static Constant *getAggregateConstantElement(Constant *Agg, Constant *Idx) {
} else if (ConstantVector *CP = dyn_cast<ConstantVector>(Agg)) {
if (IdxV < CP->getNumOperands()) return CP->getOperand(IdxV);
} else if (isa<ConstantAggregateZero>(Agg)) {
- if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+ if (StructType *STy = dyn_cast<StructType>(Agg->getType())) {
if (IdxV < STy->getNumElements())
return Constant::getNullValue(STy->getElementType(IdxV));
- } else if (const SequentialType *STy =
+ } else if (SequentialType *STy =
dyn_cast<SequentialType>(Agg->getType())) {
return Constant::getNullValue(STy->getElementType());
}
} else if (isa<UndefValue>(Agg)) {
- if (const StructType *STy = dyn_cast<StructType>(Agg->getType())) {
+ if (StructType *STy = dyn_cast<StructType>(Agg->getType())) {
if (IdxV < STy->getNumElements())
return UndefValue::get(STy->getElementType(IdxV));
- } else if (const SequentialType *STy =
+ } else if (SequentialType *STy =
dyn_cast<SequentialType>(Agg->getType())) {
return UndefValue::get(STy->getElementType());
}
@@ -430,7 +432,7 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
++GEPI; // Skip over the pointer index.
// If this is a use of an array allocation, do a bit more checking for sanity.
- if (const ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(*GEPI)) {
uint64_t NumElements = AT->getNumElements();
ConstantInt *Idx = cast<ConstantInt>(U->getOperand(2));
@@ -451,9 +453,9 @@ static bool IsUserOfGlobalSafeForSRA(User *U, GlobalValue *GV) {
GEPI != E;
++GEPI) {
uint64_t NumElements;
- if (const ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
+ if (ArrayType *SubArrayTy = dyn_cast<ArrayType>(*GEPI))
NumElements = SubArrayTy->getNumElements();
- else if (const VectorType *SubVectorTy = dyn_cast<VectorType>(*GEPI))
+ else if (VectorType *SubVectorTy = dyn_cast<VectorType>(*GEPI))
NumElements = SubVectorTy->getNumElements();
else {
assert((*GEPI)->isStructTy() &&
@@ -498,7 +500,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
assert(GV->hasLocalLinkage() && !GV->isConstant());
Constant *Init = GV->getInitializer();
- const Type *Ty = Init->getType();
+ Type *Ty = Init->getType();
std::vector<GlobalVariable*> NewGlobals;
Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
@@ -508,7 +510,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
if (StartAlignment == 0)
StartAlignment = TD.getABITypeAlignment(GV->getType());
- if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
NewGlobals.reserve(STy->getNumElements());
const StructLayout &Layout = *TD.getStructLayout(STy);
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
@@ -531,9 +533,9 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
if (NewAlign > TD.getABITypeAlignment(STy->getElementType(i)))
NGV->setAlignment(NewAlign);
}
- } else if (const SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+ } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
unsigned NumElements = 0;
- if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+ if (ArrayType *ATy = dyn_cast<ArrayType>(STy))
NumElements = ATy->getNumElements();
else
NumElements = cast<VectorType>(STy)->getNumElements();
@@ -596,15 +598,14 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
Idxs.push_back(NullInt);
for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
Idxs.push_back(CE->getOperand(i));
- NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr),
- &Idxs[0], Idxs.size());
+ NewPtr = ConstantExpr::getGetElementPtr(cast<Constant>(NewPtr), Idxs);
} else {
GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
SmallVector<Value*, 8> Idxs;
Idxs.push_back(NullInt);
for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
Idxs.push_back(GEPI->getOperand(i));
- NewPtr = GetElementPtrInst::Create(NewPtr, Idxs.begin(), Idxs.end(),
+ NewPtr = GetElementPtrInst::Create(NewPtr, Idxs,
GEPI->getName()+"."+Twine(Val),GEPI);
}
}
@@ -753,8 +754,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
break;
if (Idxs.size() == GEPI->getNumOperands()-1)
Changed |= OptimizeAwayTrappingUsesOfValue(GEPI,
- ConstantExpr::getGetElementPtr(NewV, &Idxs[0],
- Idxs.size()));
+ ConstantExpr::getGetElementPtr(NewV, Idxs));
if (GEPI->use_empty()) {
Changed = true;
GEPI->eraseFromParent();
@@ -846,12 +846,12 @@ static void ConstantPropUsersOf(Value *V) {
/// malloc into a global, and any loads of GV as uses of the new global.
static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
CallInst *CI,
- const Type *AllocTy,
+ Type *AllocTy,
ConstantInt *NElements,
TargetData* TD) {
DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n');
- const Type *GlobalType;
+ Type *GlobalType;
if (NElements->getZExtValue() == 1)
GlobalType = AllocTy;
else
@@ -1192,7 +1192,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
} else if (PHINode *PN = dyn_cast<PHINode>(V)) {
// PN's type is pointer to struct. Make a new PHI of pointer to struct
// field.
- const StructType *ST =
+ StructType *ST =
cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
PHINode *NewPN =
@@ -1245,8 +1245,7 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
GEPIdx.push_back(GEPI->getOperand(1));
GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
- Value *NGEPI = GetElementPtrInst::Create(NewPtr,
- GEPIdx.begin(), GEPIdx.end(),
+ Value *NGEPI = GetElementPtrInst::Create(NewPtr, GEPIdx,
GEPI->getName(), GEPI);
GEPI->replaceAllUsesWith(NGEPI);
GEPI->eraseFromParent();
@@ -1260,11 +1259,9 @@ static void RewriteHeapSROALoadUser(Instruction *LoadUser,
// already been seen first by another load, so its uses have already been
// processed.
PHINode *PN = cast<PHINode>(LoadUser);
- bool Inserted;
- DenseMap<Value*, std::vector<Value*> >::iterator InsertPos;
- tie(InsertPos, Inserted) =
- InsertedScalarizedValues.insert(std::make_pair(PN, std::vector<Value*>()));
- if (!Inserted) return;
+ if (!InsertedScalarizedValues.insert(std::make_pair(PN,
+ std::vector<Value*>())).second)
+ return;
// If this is the first time we've seen this PHI, recursively process all
// users.
@@ -1298,8 +1295,8 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
Value* NElems, TargetData *TD) {
DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
- const Type* MAT = getMallocAllocatedType(CI);
- const StructType *STy = cast<StructType>(MAT);
+ Type* MAT = getMallocAllocatedType(CI);
+ StructType *STy = cast<StructType>(MAT);
// There is guaranteed to be at least one use of the malloc (storing
// it into GV). If there are other uses, change them to be uses of
@@ -1313,8 +1310,8 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
std::vector<Value*> FieldMallocs;
for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
- const Type *FieldTy = STy->getElementType(FieldNo);
- const PointerType *PFieldTy = PointerType::getUnqual(FieldTy);
+ Type *FieldTy = STy->getElementType(FieldNo);
+ PointerType *PFieldTy = PointerType::getUnqual(FieldTy);
GlobalVariable *NGV =
new GlobalVariable(*GV->getParent(),
@@ -1325,9 +1322,9 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
FieldGlobals.push_back(NGV);
unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
- if (const StructType *ST = dyn_cast<StructType>(FieldTy))
+ if (StructType *ST = dyn_cast<StructType>(FieldTy))
TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
- const Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
ConstantInt::get(IntPtrTy, TypeSize),
NElems, 0,
@@ -1379,8 +1376,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
Value *GVVal = new LoadInst(FieldGlobals[i], "tmp", NullPtrBlock);
Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
- Constant::getNullValue(GVVal->getType()),
- "tmp");
+ Constant::getNullValue(GVVal->getType()));
BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
OrigBB->getParent());
BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
@@ -1428,7 +1424,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
// Insert a store of null into each global.
for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
- const PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
+ PointerType *PT = cast<PointerType>(FieldGlobals[i]->getType());
Constant *Null = Constant::getNullValue(PT->getElementType());
new StoreInst(Null, FieldGlobals[i], SI);
}
@@ -1485,7 +1481,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
/// cast of malloc.
static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
CallInst *CI,
- const Type *AllocTy,
+ Type *AllocTy,
Module::global_iterator &GVI,
TargetData *TD) {
if (!TD)
@@ -1538,10 +1534,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// If this is an allocation of a fixed size array of structs, analyze as a
// variable size array. malloc [100 x struct],1 -> malloc struct, 100
if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
- if (const ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+ if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
AllocTy = AT->getElementType();
- const StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
+ StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
if (!AllocSTy)
return false;
@@ -1552,8 +1548,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// If this is a fixed size array, transform the Malloc to be an alloc of
// structs. malloc [100 x struct],1 -> malloc struct, 100
- if (const ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
- const Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+ if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
+ Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
@@ -1596,7 +1592,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC))
return true;
} else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
- const Type* MallocType = getMallocAllocatedType(CI);
+ Type* MallocType = getMallocAllocatedType(CI);
if (MallocType && TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
GVI, TD))
return true;
@@ -1611,7 +1607,7 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
/// can shrink the global into a boolean and select between the two values
/// whenever it is used. This exposes the values to other scalar optimizations.
static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
- const Type *GVElType = GV->getType()->getElementType();
+ Type *GVElType = GV->getType()->getElementType();
// If GVElType is already i1, it is already shrunk. If the type of the GV is
// an FP value, pointer or vector, don't do this optimization because a select
@@ -1761,7 +1757,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV);
Instruction& FirstI = const_cast<Instruction&>(*GS.AccessingFunction
->getEntryBlock().begin());
- const Type* ElemTy = GV->getType()->getElementType();
+ Type* ElemTy = GV->getType()->getElementType();
// FIXME: Pass Global's alignment when globals have alignment
AllocaInst* Alloca = new AllocaInst(ElemTy, NULL, GV->getName(), &FirstI);
if (!isa<UndefValue>(GV->getInitializer()))
@@ -2003,7 +1999,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()), 65535);
CSVals[1] = 0;
- const StructType *StructTy =
+ StructType *StructTy =
cast <StructType>(
cast<ArrayType>(GCL->getType()->getElementType())->getElementType());
@@ -2013,9 +2009,9 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
if (Ctors[i]) {
CSVals[1] = Ctors[i];
} else {
- const Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()),
+ Type *FTy = FunctionType::get(Type::getVoidTy(GCL->getContext()),
false);
- const PointerType *PFTy = PointerType::getUnqual(FTy);
+ PointerType *PFTy = PointerType::getUnqual(FTy);
CSVals[1] = Constant::getNullValue(PFTy);
CSVals[0] = ConstantInt::get(Type::getInt32Ty(GCL->getContext()),
0x7fffffff);
@@ -2196,7 +2192,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
}
std::vector<Constant*> Elts;
- if (const StructType *STy = dyn_cast<StructType>(Init->getType())) {
+ if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
// Break up the constant into its elements.
if (ConstantStruct *CS = dyn_cast<ConstantStruct>(Init)) {
@@ -2224,10 +2220,10 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
}
ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
- const SequentialType *InitTy = cast<SequentialType>(Init->getType());
+ SequentialType *InitTy = cast<SequentialType>(Init->getType());
uint64_t NumElts;
- if (const ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
+ if (ArrayType *ATy = dyn_cast<ArrayType>(InitTy))
NumElts = ATy->getNumElements();
else
NumElts = cast<VectorType>(InitTy)->getNumElements();
@@ -2338,7 +2334,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
Constant *InstResult = 0;
if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
- if (SI->isVolatile()) return false; // no volatile accesses.
+ if (!SI->isSimple()) return false; // no volatile/atomic accesses.
Constant *Ptr = getVal(Values, SI->getOperand(1));
if (!isSimpleEnoughPointerToCommit(Ptr))
// If this is too complex for us to commit, reject it.
@@ -2358,7 +2354,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
// stored value.
Ptr = CE->getOperand(0);
- const Type *NewTy=cast<PointerType>(Ptr->getType())->getElementType();
+ Type *NewTy=cast<PointerType>(Ptr->getType())->getElementType();
// In order to push the bitcast onto the stored value, a bitcast
// from NewTy to Val's type must be legal. If it's not, we can try
@@ -2367,14 +2363,14 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
// If NewTy is a struct, we can convert the pointer to the struct
// into a pointer to its first member.
// FIXME: This could be extended to support arrays as well.
- if (const StructType *STy = dyn_cast<StructType>(NewTy)) {
+ if (StructType *STy = dyn_cast<StructType>(NewTy)) {
NewTy = STy->getTypeAtIndex(0U);
- const IntegerType *IdxTy =IntegerType::get(NewTy->getContext(), 32);
+ IntegerType *IdxTy =IntegerType::get(NewTy->getContext(), 32);
Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
Constant * const IdxList[] = {IdxZero, IdxZero};
- Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList, 2);
+ Ptr = ConstantExpr::getGetElementPtr(Ptr, IdxList);
// If we can't improve the situation by introspecting NewTy,
// we have to give up.
@@ -2411,17 +2407,17 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
i != e; ++i)
GEPOps.push_back(getVal(Values, *i));
- InstResult = cast<GEPOperator>(GEP)->isInBounds() ?
- ConstantExpr::getInBoundsGetElementPtr(P, &GEPOps[0], GEPOps.size()) :
- ConstantExpr::getGetElementPtr(P, &GEPOps[0], GEPOps.size());
+ InstResult =
+ ConstantExpr::getGetElementPtr(P, GEPOps,
+ cast<GEPOperator>(GEP)->isInBounds());
} else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
- if (LI->isVolatile()) return false; // no volatile accesses.
+ if (!LI->isSimple()) return false; // no volatile/atomic accesses.
InstResult = ComputeLoadResult(getVal(Values, LI->getOperand(0)),
MutatedMemory);
if (InstResult == 0) return false; // Could not evaluate load.
} else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
if (AI->isArrayAllocation()) return false; // Cannot handle array allocs.
- const Type *Ty = AI->getType()->getElementType();
+ Type *Ty = AI->getType()->getElementType();
AllocaTmps.push_back(new GlobalVariable(Ty, false,
GlobalValue::InternalLinkage,
UndefValue::get(Ty),
@@ -2465,8 +2461,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
- if (Constant *C = ConstantFoldCall(Callee, Formals.data(),
- Formals.size())) {
+ if (Constant *C = ConstantFoldCall(Callee, Formals)) {
InstResult = C;
} else {
return false;
@@ -2512,7 +2507,7 @@ static bool EvaluateFunction(Function *F, Constant *&RetVal,
CallStack.pop_back(); // return from fn.
return true; // We succeeded at evaluating this ctor!
} else {
- // invoke, unwind, unreachable.
+ // invoke, unwind, resume, unreachable.
return false; // Cannot handle this terminator.
}
@@ -2711,7 +2706,7 @@ static Function *FindCXAAtExit(Module &M) {
if (!Fn)
return 0;
- const FunctionType *FTy = Fn->getFunctionType();
+ FunctionType *FTy = Fn->getFunctionType();
// Checking that the function has the right return type, the right number of
// parameters and that they all have pointer types should be enough.
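
The GEP-related churn in this file is the 3.0 migration from (pointer, count) index arrays to ArrayRef, with in-bounds folded into a flag; a sketch (helper name assumed):

#include "llvm/Constants.h"
using namespace llvm;

// Sketch: one entry point now covers both plain and in-bounds constant GEPs.
static Constant *firstField(Constant *Ptr, Constant *Zero, bool InBounds) {
  Constant *Idxs[] = { Zero, Zero };  // ArrayRef binds directly to C arrays
  return ConstantExpr::getGetElementPtr(Ptr, Idxs, InBounds);
}
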
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
index 25c0134..d757e1f 100644
--- a/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -167,7 +167,7 @@ bool IPCP::PropagateConstantReturn(Function &F) {
// Check to see if this function returns a constant.
SmallVector<Value *,4> RetVals;
- const StructType *STy = dyn_cast<StructType>(F.getReturnType());
+ StructType *STy = dyn_cast<StructType>(F.getReturnType());
if (STy)
for (unsigned i = 0, e = STy->getNumElements(); i < e; ++i)
RetVals.push_back(UndefValue::get(STy->getElementType(i)));
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index 31ce95f..6233922 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -13,6 +13,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/IPO.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassManager.h"
@@ -35,7 +36,6 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeLoopExtractorPass(Registry);
initializeBlockExtractorPassPass(Registry);
initializeSingleLoopExtractorPass(Registry);
- initializeLowerSetJmpPass(Registry);
initializeMergeFunctionsPass(Registry);
initializePartialInlinerPass(Registry);
initializePruneEHPass(Registry);
@@ -70,6 +70,10 @@ void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createFunctionInliningPass());
}
+void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(llvm::createAlwaysInlinerPass());
+}
+
void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createGlobalDCEPass());
}
@@ -82,10 +86,6 @@ void LLVMAddIPConstantPropagationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createIPConstantPropagationPass());
}
-void LLVMAddLowerSetJmpPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerSetJmpPass());
-}
-
void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createPruneEHPass());
}
@@ -98,11 +98,6 @@ void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
unwrap(PM)->add(createInternalizePass(AllButMain != 0));
}
-
-void LLVMAddRaiseAllocationsPass(LLVMPassManagerRef PM) {
- // FIXME: Remove in LLVM 3.0.
-}
-
void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createStripDeadPrototypesPass());
}
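
Usage of the freshly exposed C binding, pass manager creation elided (LLVMAddFunctionInliningPass and LLVMAddGlobalDCEPass predate this import):

#include "llvm-c/Transforms/IPO.h"

/* Sketch: the new binding mirrors llvm::createAlwaysInlinerPass(), while the
   removed LLVMAddLowerSetJmpPass/LLVMAddRaiseAllocationsPass simply go away. */
void addIPOPasses(LLVMPassManagerRef PM) {
  LLVMAddAlwaysInlinerPass(PM);
  LLVMAddFunctionInliningPass(PM);
  LLVMAddGlobalDCEPass(PM);
}
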
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index ce795b7..c0426da 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/CallSite.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Target/TargetData.h"
#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
@@ -32,10 +33,10 @@ namespace {
// AlwaysInliner only inlines functions that are marked as "always inline".
class AlwaysInliner : public Inliner {
// Functions that are never inlined
- SmallPtrSet<const Function*, 16> NeverInline;
+ SmallPtrSet<const Function*, 16> NeverInline;
InlineCostAnalyzer CA;
public:
- // Use extremely low threshold.
+ // Use extremely low threshold.
AlwaysInliner() : Inliner(ID, -2000000000) {
initializeAlwaysInlinerPass(*PassRegistry::getPassRegistry());
}
@@ -52,8 +53,8 @@ namespace {
void growCachedCostInfo(Function* Caller, Function* Callee) {
CA.growCachedCostInfo(Caller, Callee);
}
- virtual bool doFinalization(CallGraph &CG) {
- return removeDeadFunctions(CG, &NeverInline);
+ virtual bool doFinalization(CallGraph &CG) {
+ return removeDeadFunctions(CG, &NeverInline);
}
virtual bool doInitialization(CallGraph &CG);
void releaseMemory() {
@@ -71,11 +72,13 @@ INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
Pass *llvm::createAlwaysInlinerPass() { return new AlwaysInliner(); }
-// doInitialization - Initializes the vector of functions that have not
+// doInitialization - Initializes the vector of functions that have not
// been annotated with the "always inline" attribute.
bool AlwaysInliner::doInitialization(CallGraph &CG) {
+ CA.setTargetData(getAnalysisIfAvailable<TargetData>());
+
Module &M = CG.getModule();
-
+
for (Module::iterator I = M.begin(), E = M.end();
I != E; ++I)
if (!I->isDeclaration() && !I->hasFnAttr(Attribute::AlwaysInline))
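
Both inliners now seed the cost analyzer the same way; in sketch form (MyInliner is a hypothetical subclass, class context elided):

#include "llvm/Target/TargetData.h"
using namespace llvm;

// Sketch: TargetData is an optional immutable pass;
// getAnalysisIfAvailable<TargetData>() returns null when no data layout is
// registered, and the cost analyzer is expected to cope with that.
bool MyInliner::doInitialization(CallGraph &CG) {
  CA.setTargetData(getAnalysisIfAvailable<TargetData>());
  return false;
}
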
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 0c5b3be..84dd4fd 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/CallSite.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/Target/TargetData.h"
#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
@@ -30,7 +31,7 @@ namespace {
class SimpleInliner : public Inliner {
// Functions that are never inlined
- SmallPtrSet<const Function*, 16> NeverInline;
+ SmallPtrSet<const Function*, 16> NeverInline;
InlineCostAnalyzer CA;
public:
SimpleInliner() : Inliner(ID) {
@@ -68,16 +69,17 @@ INITIALIZE_PASS_END(SimpleInliner, "inline",
Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
-Pass *llvm::createFunctionInliningPass(int Threshold) {
+Pass *llvm::createFunctionInliningPass(int Threshold) {
return new SimpleInliner(Threshold);
}
// doInitialization - Initializes the vector of functions that have been
// annotated with the noinline attribute.
bool SimpleInliner::doInitialization(CallGraph &CG) {
-
+ CA.setTargetData(getAnalysisIfAvailable<TargetData>());
+
Module &M = CG.getModule();
-
+
for (Module::iterator I = M.begin(), E = M.end();
I != E; ++I)
if (!I->isDeclaration() && I->hasFnAttr(Attribute::NoInline))
@@ -85,34 +87,34 @@ bool SimpleInliner::doInitialization(CallGraph &CG) {
// Get llvm.noinline
GlobalVariable *GV = M.getNamedGlobal("llvm.noinline");
-
+
if (GV == 0)
return false;
// Don't crash on invalid code
if (!GV->hasDefinitiveInitializer())
return false;
-
+
const ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
-
+
if (InitList == 0)
return false;
// Iterate over each element and add to the NeverInline set
for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
-
+
// Get Source
const Constant *Elt = InitList->getOperand(i);
-
+
if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(Elt))
- if (CE->getOpcode() == Instruction::BitCast)
+ if (CE->getOpcode() == Instruction::BitCast)
Elt = CE->getOperand(0);
-
+
// Insert into set of functions to never inline
if (const Function *F = dyn_cast<Function>(Elt))
NeverInline.insert(F);
}
-
+
return false;
}
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 57f3e77..f00935b 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -62,7 +62,7 @@ void Inliner::getAnalysisUsage(AnalysisUsage &Info) const {
}
-typedef DenseMap<const ArrayType*, std::vector<AllocaInst*> >
+typedef DenseMap<ArrayType*, std::vector<AllocaInst*> >
InlinedArrayAllocasTy;
/// InlineCallIfPossible - If it is possible to inline the specified call site,
@@ -139,7 +139,7 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
// Don't bother trying to merge array allocations (they will usually be
// canonicalized to be an allocation *of* an array), or allocations whose
// type is not itself an array (because we're afraid of pessimizing SRoA).
- const ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
+ ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
if (ATy == 0 || AI->isArrayAllocation())
continue;
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 848944d..4f96afe4 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/FunctionUtils.h"
#include "llvm/ADT/Statistic.h"
#include <fstream>
@@ -53,12 +54,12 @@ namespace {
char LoopExtractor::ID = 0;
INITIALIZE_PASS_BEGIN(LoopExtractor, "loop-extract",
- "Extract loops into new functions", false, false)
+ "Extract loops into new functions", false, false)
INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_END(LoopExtractor, "loop-extract",
- "Extract loops into new functions", false, false)
+ "Extract loops into new functions", false, false)
namespace {
/// SingleLoopExtractor - For bugpoint.
@@ -100,9 +101,9 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
L->getHeader()->getParent()->getEntryBlock().getTerminator();
if (!isa<BranchInst>(EntryTI) ||
!cast<BranchInst>(EntryTI)->isUnconditional() ||
- EntryTI->getSuccessor(0) != L->getHeader())
+ EntryTI->getSuccessor(0) != L->getHeader()) {
ShouldExtractLoop = true;
- else {
+ } else {
// Check to see if any exits from the loop are more than just return
// blocks.
SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -113,6 +114,21 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
break;
}
}
+
+ if (ShouldExtractLoop) {
+      // We must omit landing pads, since a landing pad must accompany its
+      // invoke instruction. Extracting one here would create a loop in the
+      // extracted function, and an infinite cycle occurs when the extractor
+      // then tries to extract that loop as well.
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getExitBlocks(ExitBlocks);
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (ExitBlocks[i]->isLandingPad()) {
+ ShouldExtractLoop = false;
+ break;
+ }
+ }
+
if (ShouldExtractLoop) {
if (NumLoops == 0) return Changed;
--NumLoops;
@@ -149,6 +165,7 @@ namespace {
/// BlocksToNotExtract list.
class BlockExtractorPass : public ModulePass {
void LoadFile(const char *Filename);
+ void SplitLandingPadPreds(Function *F);
std::vector<BasicBlock*> BlocksToNotExtract;
std::vector<std::pair<std::string, std::string> > BlocksToNotExtractByName;
@@ -171,8 +188,7 @@ INITIALIZE_PASS(BlockExtractorPass, "extract-blocks",
// createBlockExtractorPass - This pass extracts all blocks (except those
// specified in the argument list) from the functions in the module.
//
-ModulePass *llvm::createBlockExtractorPass()
-{
+ModulePass *llvm::createBlockExtractorPass() {
return new BlockExtractorPass();
}
@@ -194,6 +210,37 @@ void BlockExtractorPass::LoadFile(const char *Filename) {
}
}
+/// SplitLandingPadPreds - The landing pad needs to be extracted with the invoke
+/// instruction. The critical edge breaker will refuse to break critical edges
+/// to a landing pad. So split them here. After this method runs, all landing pads
+/// should have only one predecessor.
+void BlockExtractorPass::SplitLandingPadPreds(Function *F) {
+ for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+ InvokeInst *II = dyn_cast<InvokeInst>(I);
+ if (!II) continue;
+ BasicBlock *Parent = II->getParent();
+ BasicBlock *LPad = II->getUnwindDest();
+
+ // Look through the landing pad's predecessors. If one of them ends in an
+ // 'invoke', then we want to split the landing pad.
+ bool Split = false;
+ for (pred_iterator
+ PI = pred_begin(LPad), PE = pred_end(LPad); PI != PE; ++PI) {
+ BasicBlock *BB = *PI;
+ if (BB->isLandingPad() && BB != Parent &&
+ isa<InvokeInst>(Parent->getTerminator())) {
+ Split = true;
+ break;
+ }
+ }
+
+ if (!Split) continue;
+
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", 0, NewBBs);
+ }
+}
+
bool BlockExtractorPass::runOnModule(Module &M) {
std::set<BasicBlock*> TranslatedBlocksToNotExtract;
for (unsigned i = 0, e = BlocksToNotExtract.size(); i != e; ++i) {
@@ -236,13 +283,21 @@ bool BlockExtractorPass::runOnModule(Module &M) {
// Now that we know which blocks to not extract, figure out which ones we WANT
// to extract.
std::vector<BasicBlock*> BlocksToExtract;
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ SplitLandingPadPreds(&*F);
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
if (!TranslatedBlocksToNotExtract.count(BB))
BlocksToExtract.push_back(BB);
+ }
- for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i)
- ExtractBasicBlock(BlocksToExtract[i]);
+ for (unsigned i = 0, e = BlocksToExtract.size(); i != e; ++i) {
+ SmallVector<BasicBlock*, 2> BlocksToExtractVec;
+ BlocksToExtractVec.push_back(BlocksToExtract[i]);
+ if (const InvokeInst *II =
+ dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator()))
+ BlocksToExtractVec.push_back(II->getUnwindDest());
+ ExtractBasicBlock(BlocksToExtractVec);
+ }
return !BlocksToExtract.empty();
}
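
The helper above leans on SplitLandingPadPredecessors from BasicBlockUtils; a standalone sketch of the call (wrapper name assumed):

#include "llvm/BasicBlock.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Sketch: rewire LPad so the predecessor ending in the invoke reaches it
// through its own new block, separate from all other predecessors; ".1"/".2"
// suffix the two new blocks, and no pass analysis is updated (P == 0).
static void isolate(BasicBlock *LPad, BasicBlock *InvokePred) {
  SmallVector<BasicBlock*, 2> NewBBs;
  SplitLandingPadPredecessors(LPad, InvokePred, ".1", ".2", /*P=*/0, NewBBs);
}
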
diff --git a/lib/Transforms/IPO/LowerSetJmp.cpp b/lib/Transforms/IPO/LowerSetJmp.cpp
deleted file mode 100644
index 659476b..0000000
--- a/lib/Transforms/IPO/LowerSetJmp.cpp
+++ /dev/null
@@ -1,547 +0,0 @@
-//===- LowerSetJmp.cpp - Code pertaining to lowering set/long jumps -------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the lowering of setjmp and longjmp to use the
-// LLVM invoke and unwind instructions as necessary.
-//
-// Lowering of longjmp is fairly trivial. We replace the call with a
-// call to the LLVM library function "__llvm_sjljeh_throw_longjmp()".
-// This unwinds the stack for us calling all of the destructors for
-// objects allocated on the stack.
-//
-// At a setjmp call, the basic block is split and the setjmp removed.
-// The calls in a function that have a setjmp are converted to invoke
-// where the except part checks to see if it's a longjmp exception and,
-// if so, if it's handled in the function. If it is, then it gets the
-// value returned by the longjmp and goes to where the basic block was
-// split. Invoke instructions are handled in a similar fashion with the
-// original except block being executed if it isn't a longjmp except
-// that is handled by that function.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// FIXME: This pass doesn't deal with PHI statements just yet. That is,
-// we expect this to occur before SSAification is done. This would seem
-// to make sense, but in general, it might be a good idea to make this
-// pass invokable via the "opt" command at will.
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "lowersetjmp"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/InstVisitor.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include <map>
-using namespace llvm;
-
-STATISTIC(LongJmpsTransformed, "Number of longjmps transformed");
-STATISTIC(SetJmpsTransformed , "Number of setjmps transformed");
-STATISTIC(CallsTransformed , "Number of calls invokified");
-STATISTIC(InvokesTransformed , "Number of invokes modified");
-
-namespace {
- //===--------------------------------------------------------------------===//
- // LowerSetJmp pass implementation.
- class LowerSetJmp : public ModulePass, public InstVisitor<LowerSetJmp> {
- // LLVM library functions...
- Constant *InitSJMap; // __llvm_sjljeh_init_setjmpmap
- Constant *DestroySJMap; // __llvm_sjljeh_destroy_setjmpmap
- Constant *AddSJToMap; // __llvm_sjljeh_add_setjmp_to_map
- Constant *ThrowLongJmp; // __llvm_sjljeh_throw_longjmp
- Constant *TryCatchLJ; // __llvm_sjljeh_try_catching_longjmp_exception
- Constant *IsLJException; // __llvm_sjljeh_is_longjmp_exception
- Constant *GetLJValue; // __llvm_sjljeh_get_longjmp_value
-
- typedef std::pair<SwitchInst*, CallInst*> SwitchValuePair;
-
- // Keep track of those basic blocks reachable via a depth-first search of
- // the CFG from a setjmp call. We only need to transform those "call" and
- // "invoke" instructions that are reachable from the setjmp call site.
- std::set<BasicBlock*> DFSBlocks;
-
- // The setjmp map is going to hold information about which setjmps
- // were called (each setjmp gets its own number) and with which
- // buffer it was called.
- std::map<Function*, AllocaInst*> SJMap;
-
- // The rethrow basic block map holds the basic block to branch to if
- // the exception isn't handled in the current function and needs to
- // be rethrown.
- std::map<const Function*, BasicBlock*> RethrowBBMap;
-
- // The preliminary basic block map holds a basic block that grabs the
- // exception and determines if it's handled by the current function.
- std::map<const Function*, BasicBlock*> PrelimBBMap;
-
- // The switch/value map holds a switch inst/call inst pair. The
- // switch inst controls which handler (if any) gets called and the
- // value is the value returned to that handler by the call to
- // __llvm_sjljeh_get_longjmp_value.
- std::map<const Function*, SwitchValuePair> SwitchValMap;
-
- // A map of which setjmps we've seen so far in a function.
- std::map<const Function*, unsigned> SetJmpIDMap;
-
- AllocaInst* GetSetJmpMap(Function* Func);
- BasicBlock* GetRethrowBB(Function* Func);
- SwitchValuePair GetSJSwitch(Function* Func, BasicBlock* Rethrow);
-
- void TransformLongJmpCall(CallInst* Inst);
- void TransformSetJmpCall(CallInst* Inst);
-
- bool IsTransformableFunction(StringRef Name);
- public:
- static char ID; // Pass identification, replacement for typeid
- LowerSetJmp() : ModulePass(ID) {
- initializeLowerSetJmpPass(*PassRegistry::getPassRegistry());
- }
-
- void visitCallInst(CallInst& CI);
- void visitInvokeInst(InvokeInst& II);
- void visitReturnInst(ReturnInst& RI);
- void visitUnwindInst(UnwindInst& UI);
-
- bool runOnModule(Module& M);
- bool doInitialization(Module& M);
- };
-} // end anonymous namespace
-
-char LowerSetJmp::ID = 0;
-INITIALIZE_PASS(LowerSetJmp, "lowersetjmp", "Lower Set Jump", false, false)
-
-// run - Run the transformation on the program. We grab the function
-// prototypes for longjmp and setjmp. If they are used in the program,
-// then we can go directly to the places they're at and transform them.
-bool LowerSetJmp::runOnModule(Module& M) {
- bool Changed = false;
-
- // These are what the functions are called.
- Function* SetJmp = M.getFunction("llvm.setjmp");
- Function* LongJmp = M.getFunction("llvm.longjmp");
-
- // This program doesn't have longjmp and setjmp calls.
- if ((!LongJmp || LongJmp->use_empty()) &&
- (!SetJmp || SetJmp->use_empty())) return false;
-
- // Initialize some values and functions we'll need to transform the
- // setjmp/longjmp functions.
- doInitialization(M);
-
- if (SetJmp) {
- for (Value::use_iterator B = SetJmp->use_begin(), E = SetJmp->use_end();
- B != E; ++B) {
- BasicBlock* BB = cast<Instruction>(*B)->getParent();
- for (df_ext_iterator<BasicBlock*> I = df_ext_begin(BB, DFSBlocks),
- E = df_ext_end(BB, DFSBlocks); I != E; ++I)
- /* empty */;
- }
-
- while (!SetJmp->use_empty()) {
- assert(isa<CallInst>(SetJmp->use_back()) &&
- "User of setjmp intrinsic not a call?");
- TransformSetJmpCall(cast<CallInst>(SetJmp->use_back()));
- Changed = true;
- }
- }
-
- if (LongJmp)
- while (!LongJmp->use_empty()) {
- assert(isa<CallInst>(LongJmp->use_back()) &&
- "User of longjmp intrinsic not a call?");
- TransformLongJmpCall(cast<CallInst>(LongJmp->use_back()));
- Changed = true;
- }
-
- // Now go through the affected functions and convert calls and invokes
- // to new invokes...
- for (std::map<Function*, AllocaInst*>::iterator
- B = SJMap.begin(), E = SJMap.end(); B != E; ++B) {
- Function* F = B->first;
- for (Function::iterator BB = F->begin(), BE = F->end(); BB != BE; ++BB)
- for (BasicBlock::iterator IB = BB->begin(), IE = BB->end(); IB != IE; ) {
- visit(*IB++);
- if (IB != BB->end() && IB->getParent() != BB)
- break; // The next instruction got moved to a different block!
- }
- }
-
- DFSBlocks.clear();
- SJMap.clear();
- RethrowBBMap.clear();
- PrelimBBMap.clear();
- SwitchValMap.clear();
- SetJmpIDMap.clear();
-
- return Changed;
-}
-
-// doInitialization - For the lower long/setjmp pass, this ensures that a
-// module contains a declaration for the intrisic functions we are going
-// to call to convert longjmp and setjmp calls.
-//
-// This function is always successful, unless it isn't.
-bool LowerSetJmp::doInitialization(Module& M)
-{
- const Type *SBPTy = Type::getInt8PtrTy(M.getContext());
- const Type *SBPPTy = PointerType::getUnqual(SBPTy);
-
- // N.B. See llvm/runtime/GCCLibraries/libexception/SJLJ-Exception.h for
- // a description of the following library functions.
-
- // void __llvm_sjljeh_init_setjmpmap(void**)
- InitSJMap = M.getOrInsertFunction("__llvm_sjljeh_init_setjmpmap",
- Type::getVoidTy(M.getContext()),
- SBPPTy, (Type *)0);
- // void __llvm_sjljeh_destroy_setjmpmap(void**)
- DestroySJMap = M.getOrInsertFunction("__llvm_sjljeh_destroy_setjmpmap",
- Type::getVoidTy(M.getContext()),
- SBPPTy, (Type *)0);
-
- // void __llvm_sjljeh_add_setjmp_to_map(void**, void*, unsigned)
- AddSJToMap = M.getOrInsertFunction("__llvm_sjljeh_add_setjmp_to_map",
- Type::getVoidTy(M.getContext()),
- SBPPTy, SBPTy,
- Type::getInt32Ty(M.getContext()),
- (Type *)0);
-
- // void __llvm_sjljeh_throw_longjmp(int*, int)
- ThrowLongJmp = M.getOrInsertFunction("__llvm_sjljeh_throw_longjmp",
- Type::getVoidTy(M.getContext()), SBPTy,
- Type::getInt32Ty(M.getContext()),
- (Type *)0);
-
- // unsigned __llvm_sjljeh_try_catching_longjmp_exception(void **)
- TryCatchLJ =
- M.getOrInsertFunction("__llvm_sjljeh_try_catching_longjmp_exception",
- Type::getInt32Ty(M.getContext()), SBPPTy, (Type *)0);
-
- // bool __llvm_sjljeh_is_longjmp_exception()
- IsLJException = M.getOrInsertFunction("__llvm_sjljeh_is_longjmp_exception",
- Type::getInt1Ty(M.getContext()),
- (Type *)0);
-
- // int __llvm_sjljeh_get_longjmp_value()
- GetLJValue = M.getOrInsertFunction("__llvm_sjljeh_get_longjmp_value",
- Type::getInt32Ty(M.getContext()),
- (Type *)0);
- return true;
-}
-
-// IsTransformableFunction - Return true if the function is safe to transform.
-// Currently we don't transform the "llvm.{setjmp,longjmp}" intrinsics or any
-// of the setjmp/longjmp error handling functions (those beginning with
-// "__llvm_sjljeh_"; they don't throw exceptions).
-bool LowerSetJmp::IsTransformableFunction(StringRef Name) {
- return !Name.startswith("__llvm_sjljeh_");
-}
-
-// TransformLongJmpCall - Transform a longjmp call into a call to the
-// internal __llvm_sjljeh_throw_longjmp function. It then takes care of
-// throwing the exception for us.
-void LowerSetJmp::TransformLongJmpCall(CallInst* Inst)
-{
- const Type* SBPTy = Type::getInt8PtrTy(Inst->getContext());
-
- // Create the call to "__llvm_sjljeh_throw_longjmp". This takes the
- // same parameters as "longjmp", except that the buffer is cast to a
- // char*. It returns "void", so it doesn't need to replace any of
- // Inst's uses and doesn't get a name.
- CastInst* CI =
- new BitCastInst(Inst->getArgOperand(0), SBPTy, "LJBuf", Inst);
- Value *Args[] = { CI, Inst->getArgOperand(1) };
- CallInst::Create(ThrowLongJmp, Args, "", Inst);
-
- SwitchValuePair& SVP = SwitchValMap[Inst->getParent()->getParent()];
-
- // If the function has a setjmp call in it (they are transformed first),
- // we should branch to the basic block that determines if this longjmp
- // is applicable here. Otherwise, issue an unwind.
- if (SVP.first)
- BranchInst::Create(SVP.first->getParent(), Inst);
- else
- new UnwindInst(Inst->getContext(), Inst);
-
- // Remove all insts after the branch/unwind inst. Walk from back to front
- // so that users are deleted before their definitions, avoiding
- // replaceAllUsesWith where possible.
- BasicBlock *BB = Inst->getParent();
- Instruction *Removed;
- do {
- Removed = &BB->back();
- // If the removed instructions have any users, replace them now.
- if (!Removed->use_empty())
- Removed->replaceAllUsesWith(UndefValue::get(Removed->getType()));
- Removed->eraseFromParent();
- } while (Removed != Inst);
-
- ++LongJmpsTransformed;
-}
-
-// GetSetJmpMap - Retrieve (create and initialize, if necessary) the
-// setjmp map. This map holds information about which setjmps were called
-// (each setjmp gets its own number) and with which buffer each was called.
-// There is exactly one map per function.
-AllocaInst* LowerSetJmp::GetSetJmpMap(Function* Func)
-{
- if (SJMap[Func]) return SJMap[Func];
-
- // Insert the setjmp map initialization before the first instruction in
- // the function.
- Instruction* Inst = Func->getEntryBlock().begin();
- assert(Inst && "Couldn't find even ONE instruction in entry block!");
-
- // Fill in the alloca and call to initialize the SJ map.
- const Type *SBPTy =
- Type::getInt8PtrTy(Func->getContext());
- AllocaInst* Map = new AllocaInst(SBPTy, 0, "SJMap", Inst);
- CallInst::Create(InitSJMap, Map, "", Inst);
- return SJMap[Func] = Map;
-}
-
-// GetRethrowBB - Only one rethrow basic block is needed per function.
-// If a longjmp exception is not handled in this function, this BB
-// performs the rethrow.
-BasicBlock* LowerSetJmp::GetRethrowBB(Function* Func)
-{
- if (RethrowBBMap[Func]) return RethrowBBMap[Func];
-
- // The basic block we're going to jump to if we need to rethrow the
- // exception.
- BasicBlock* Rethrow =
- BasicBlock::Create(Func->getContext(), "RethrowExcept", Func);
-
- // Fill in the "Rethrow" BB with a call to rethrow the exception. This
- // is the last instruction in the BB since at this point the runtime
- // should exit this function and go to the next function.
- new UnwindInst(Func->getContext(), Rethrow);
- return RethrowBBMap[Func] = Rethrow;
-}
-
-// GetSJSwitch - Return the switch statement that controls which handler
-// (if any) gets called and the value returned to that handler.
-LowerSetJmp::SwitchValuePair LowerSetJmp::GetSJSwitch(Function* Func,
- BasicBlock* Rethrow)
-{
- if (SwitchValMap[Func].first) return SwitchValMap[Func];
-
- BasicBlock* LongJmpPre =
- BasicBlock::Create(Func->getContext(), "LongJmpBlkPre", Func);
-
- // Keep track of the preliminary basic block for some of the other
- // transformations.
- PrelimBBMap[Func] = LongJmpPre;
-
- // Grab the exception.
- CallInst* Cond = CallInst::Create(IsLJException, "IsLJExcept", LongJmpPre);
-
- // The "decision basic block" gets the number associated with the
- // setjmp call returning to switch on and the value returned by
- // longjmp.
- BasicBlock* DecisionBB =
- BasicBlock::Create(Func->getContext(), "LJDecisionBB", Func);
-
- BranchInst::Create(DecisionBB, Rethrow, Cond, LongJmpPre);
-
- // Fill in the "decision" basic block.
- CallInst* LJVal = CallInst::Create(GetLJValue, "LJVal", DecisionBB);
- CallInst* SJNum = CallInst::Create(TryCatchLJ, GetSetJmpMap(Func), "SJNum",
- DecisionBB);
-
- SwitchInst* SI = SwitchInst::Create(SJNum, Rethrow, 0, DecisionBB);
- return SwitchValMap[Func] = SwitchValuePair(SI, LJVal);
-}
-
-// TransformSetJmpCall - The setjmp call is a bit trickier to transform.
-// We're going to convert all setjmp calls to nops. Then all "call" and
-// "invoke" instructions in the function are converted to "invoke"
-// instructions whose exception edge is taken when returning from a longjmp.
-void LowerSetJmp::TransformSetJmpCall(CallInst* Inst)
-{
- BasicBlock* ABlock = Inst->getParent();
- Function* Func = ABlock->getParent();
-
- // Add this setjmp to the setjmp map.
- const Type* SBPTy =
- Type::getInt8PtrTy(Inst->getContext());
- CastInst* BufPtr =
- new BitCastInst(Inst->getArgOperand(0), SBPTy, "SBJmpBuf", Inst);
- Value *Args[] = {
- GetSetJmpMap(Func), BufPtr,
- ConstantInt::get(Type::getInt32Ty(Inst->getContext()), SetJmpIDMap[Func]++)
- };
- CallInst::Create(AddSJToMap, Args, "", Inst);
-
- // We are guaranteed that there are no values live across basic blocks
- // (because we are "not in SSA form" yet), but there can still be values live
- // in basic blocks. Because of this, splitting the setjmp block can cause
- // values above the setjmp to not dominate uses which are after the setjmp
- // call. In all of these cases, we must spill the value to the stack.
- //
- std::set<Instruction*> InstrsAfterCall;
-
- // The call is probably very close to the end of the basic block, given the
- // common usage pattern 'if (setjmp(...))', so keep track of the
- // instructions after the call.
- for (BasicBlock::iterator I = ++BasicBlock::iterator(Inst), E = ABlock->end();
- I != E; ++I)
- InstrsAfterCall.insert(I);
-
- for (BasicBlock::iterator II = ABlock->begin();
- II != BasicBlock::iterator(Inst); ++II)
- // Loop over all of the uses of instruction. If any of them are after the
- // call, "spill" the value to the stack.
- for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
- UI != E; ++UI) {
- User *U = *UI;
- if (cast<Instruction>(U)->getParent() != ABlock ||
- InstrsAfterCall.count(cast<Instruction>(U))) {
- DemoteRegToStack(*II);
- break;
- }
- }
- InstrsAfterCall.clear();
-
- // Change the setjmp call into a branch statement. The setjmp call
- // itself is removed a little later, below.
- BasicBlock* SetJmpContBlock = ABlock->splitBasicBlock(Inst);
- assert(SetJmpContBlock && "Couldn't split setjmp BB!!");
-
- SetJmpContBlock->setName(ABlock->getName()+"SetJmpCont");
-
- // Add the SetJmpContBlock to the set of blocks reachable from a setjmp.
- DFSBlocks.insert(SetJmpContBlock);
-
- // This PHI node will be in the new block created from the
- // splitBasicBlock call.
- PHINode* PHI = PHINode::Create(Type::getInt32Ty(Inst->getContext()), 2,
- "SetJmpReturn", Inst);
-
- // Coming from a call to setjmp, the return is 0.
- PHI->addIncoming(Constant::getNullValue(Type::getInt32Ty(Inst->getContext())),
- ABlock);
-
- // Add the case for this setjmp's number...
- SwitchValuePair SVP = GetSJSwitch(Func, GetRethrowBB(Func));
- SVP.first->addCase(ConstantInt::get(Type::getInt32Ty(Inst->getContext()),
- SetJmpIDMap[Func] - 1),
- SetJmpContBlock);
-
- // Value coming from the handling of the exception.
- PHI->addIncoming(SVP.second, SVP.second->getParent());
-
- // Replace all uses of this instruction with the PHI node created by
- // the eradication of setjmp.
- Inst->replaceAllUsesWith(PHI);
- Inst->eraseFromParent();
-
- ++SetJmpsTransformed;
-}
-
-// visitCallInst - This converts all LLVM call instructions into invoke
-// instructions. The exception edge of the invoke goes to the "LongJmpBlkPre"
-// block, which grabs the exception and determines whether it is a longjmp
-// exception.
-void LowerSetJmp::visitCallInst(CallInst& CI)
-{
- if (CI.getCalledFunction())
- if (!IsTransformableFunction(CI.getCalledFunction()->getName()) ||
- CI.getCalledFunction()->isIntrinsic()) return;
-
- BasicBlock* OldBB = CI.getParent();
-
- // If not reachable from a setjmp call, don't transform.
- if (!DFSBlocks.count(OldBB)) return;
-
- BasicBlock* NewBB = OldBB->splitBasicBlock(CI);
- assert(NewBB && "Couldn't split BB of \"call\" instruction!!");
- DFSBlocks.insert(NewBB);
- NewBB->setName("Call2Invoke");
-
- Function* Func = OldBB->getParent();
-
- // Construct the new "invoke" instruction.
- TerminatorInst* Term = OldBB->getTerminator();
- CallSite CS(&CI);
- std::vector<Value*> Params(CS.arg_begin(), CS.arg_end());
- InvokeInst* II =
- InvokeInst::Create(CI.getCalledValue(), NewBB, PrelimBBMap[Func],
- Params, CI.getName(), Term);
- II->setCallingConv(CI.getCallingConv());
- II->setAttributes(CI.getAttributes());
-
- // Replace the old call inst with the invoke inst and remove the call.
- CI.replaceAllUsesWith(II);
- CI.eraseFromParent();
-
- // The old terminator is useless now that we have the invoke inst.
- Term->eraseFromParent();
- ++CallsTransformed;
-}
-
-// visitInvokeInst - Converting the "invoke" instruction is fairly
-// straightforward. The old exception destination is replaced by a query
-// asking if this is a longjmp exception. If it is, control goes to the
-// longjmp exception blocks. Otherwise, control passes to the old exception
-// destination.
-void LowerSetJmp::visitInvokeInst(InvokeInst& II)
-{
- if (II.getCalledFunction())
- if (!IsTransformableFunction(II.getCalledFunction()->getName()) ||
- II.getCalledFunction()->isIntrinsic()) return;
-
- BasicBlock* BB = II.getParent();
-
- // If not reachable from a setjmp call, don't transform.
- if (!DFSBlocks.count(BB)) return;
-
- BasicBlock* ExceptBB = II.getUnwindDest();
-
- Function* Func = BB->getParent();
- BasicBlock* NewExceptBB = BasicBlock::Create(II.getContext(),
- "InvokeExcept", Func);
-
- // If this is a longjmp exception, then branch to the preliminary BB of
- // the longjmp exception handling. Otherwise, go to the old exception.
- CallInst* IsLJExcept = CallInst::Create(IsLJException, "IsLJExcept",
- NewExceptBB);
-
- BranchInst::Create(PrelimBBMap[Func], ExceptBB, IsLJExcept, NewExceptBB);
-
- II.setUnwindDest(NewExceptBB);
- ++InvokesTransformed;
-}
-
-// visitReturnInst - We want to destroy the setjmp map upon exit from the
-// function.
-void LowerSetJmp::visitReturnInst(ReturnInst &RI) {
- Function* Func = RI.getParent()->getParent();
- CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &RI);
-}
-
-// visitUnwindInst - We want to destroy the setjmp map upon exit from the
-// function.
-void LowerSetJmp::visitUnwindInst(UnwindInst &UI) {
- Function* Func = UI.getParent()->getParent();
- CallInst::Create(DestroySJMap, GetSetJmpMap(Func), "", &UI);
-}
-
-ModulePass *llvm::createLowerSetJmpPass() {
- return new LowerSetJmp();
-}
-
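
The pass deleted above was reached through the createLowerSetJmpPass entry
point kept at the end of the file. As a minimal sketch, assuming a pre-3.0
tree where llvm/Transforms/IPO.h still declares that function, a client
scheduled it like this (the Module M is hypothetical):

#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO.h"

void lowerSetJmp(llvm::Module &M) {
  llvm::PassManager PM;
  PM.add(llvm::createLowerSetJmpPass()); // rewrites llvm.setjmp/llvm.longjmp
  PM.run(M);
}
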
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 7796d05..0b01c38 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -76,7 +76,7 @@ STATISTIC(NumDoubleWeak, "Number of new functions created");
/// functions that will compare equal, without looking at the instructions
/// inside the function.
static unsigned profileFunction(const Function *F) {
- const FunctionType *FTy = F->getFunctionType();
+ FunctionType *FTy = F->getFunctionType();
FoldingSetNodeID ID;
ID.AddInteger(F->size());
@@ -185,7 +185,7 @@ private:
}
/// Compare two Types, treating all pointer types as equal.
- bool isEquivalentType(const Type *Ty1, const Type *Ty2) const;
+ bool isEquivalentType(Type *Ty1, Type *Ty2) const;
// The two functions undergoing comparison.
const Function *F1, *F2;
@@ -200,8 +200,8 @@ private:
// Any two pointers in the same address space are equivalent; intptr_t and
// pointers are equivalent. Otherwise, standard type equivalence rules apply.
-bool FunctionComparator::isEquivalentType(const Type *Ty1,
- const Type *Ty2) const {
+bool FunctionComparator::isEquivalentType(Type *Ty1,
+ Type *Ty2) const {
if (Ty1 == Ty2)
return true;
if (Ty1->getTypeID() != Ty2->getTypeID()) {
@@ -233,14 +233,14 @@ bool FunctionComparator::isEquivalentType(const Type *Ty1,
return true;
case Type::PointerTyID: {
- const PointerType *PTy1 = cast<PointerType>(Ty1);
- const PointerType *PTy2 = cast<PointerType>(Ty2);
+ PointerType *PTy1 = cast<PointerType>(Ty1);
+ PointerType *PTy2 = cast<PointerType>(Ty2);
return PTy1->getAddressSpace() == PTy2->getAddressSpace();
}
case Type::StructTyID: {
- const StructType *STy1 = cast<StructType>(Ty1);
- const StructType *STy2 = cast<StructType>(Ty2);
+ StructType *STy1 = cast<StructType>(Ty1);
+ StructType *STy2 = cast<StructType>(Ty2);
if (STy1->getNumElements() != STy2->getNumElements())
return false;
@@ -255,8 +255,8 @@ bool FunctionComparator::isEquivalentType(const Type *Ty1,
}
case Type::FunctionTyID: {
- const FunctionType *FTy1 = cast<FunctionType>(Ty1);
- const FunctionType *FTy2 = cast<FunctionType>(Ty2);
+ FunctionType *FTy1 = cast<FunctionType>(Ty1);
+ FunctionType *FTy2 = cast<FunctionType>(Ty2);
if (FTy1->getNumParams() != FTy2->getNumParams() ||
FTy1->isVarArg() != FTy2->isVarArg())
return false;
@@ -272,8 +272,8 @@ bool FunctionComparator::isEquivalentType(const Type *Ty1,
}
case Type::ArrayTyID: {
- const ArrayType *ATy1 = cast<ArrayType>(Ty1);
- const ArrayType *ATy2 = cast<ArrayType>(Ty2);
+ ArrayType *ATy1 = cast<ArrayType>(Ty1);
+ ArrayType *ATy2 = cast<ArrayType>(Ty2);
return ATy1->getNumElements() == ATy2->getNumElements() &&
isEquivalentType(ATy1->getElementType(), ATy2->getElementType());
}
@@ -305,10 +305,14 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1,
// Check special state that is a part of some instructions.
if (const LoadInst *LI = dyn_cast<LoadInst>(I1))
return LI->isVolatile() == cast<LoadInst>(I2)->isVolatile() &&
- LI->getAlignment() == cast<LoadInst>(I2)->getAlignment();
+ LI->getAlignment() == cast<LoadInst>(I2)->getAlignment() &&
+ LI->getOrdering() == cast<LoadInst>(I2)->getOrdering() &&
+ LI->getSynchScope() == cast<LoadInst>(I2)->getSynchScope();
if (const StoreInst *SI = dyn_cast<StoreInst>(I1))
return SI->isVolatile() == cast<StoreInst>(I2)->isVolatile() &&
- SI->getAlignment() == cast<StoreInst>(I2)->getAlignment();
+ SI->getAlignment() == cast<StoreInst>(I2)->getAlignment() &&
+ SI->getOrdering() == cast<StoreInst>(I2)->getOrdering() &&
+ SI->getSynchScope() == cast<StoreInst>(I2)->getSynchScope();
if (const CmpInst *CI = dyn_cast<CmpInst>(I1))
return CI->getPredicate() == cast<CmpInst>(I2)->getPredicate();
if (const CallInst *CI = dyn_cast<CallInst>(I1))
@@ -317,22 +321,22 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1,
if (const InvokeInst *CI = dyn_cast<InvokeInst>(I1))
return CI->getCallingConv() == cast<InvokeInst>(I2)->getCallingConv() &&
CI->getAttributes() == cast<InvokeInst>(I2)->getAttributes();
- if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1)) {
- if (IVI->getNumIndices() != cast<InsertValueInst>(I2)->getNumIndices())
- return false;
- for (unsigned i = 0, e = IVI->getNumIndices(); i != e; ++i)
- if (IVI->idx_begin()[i] != cast<InsertValueInst>(I2)->idx_begin()[i])
- return false;
- return true;
- }
- if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1)) {
- if (EVI->getNumIndices() != cast<ExtractValueInst>(I2)->getNumIndices())
- return false;
- for (unsigned i = 0, e = EVI->getNumIndices(); i != e; ++i)
- if (EVI->idx_begin()[i] != cast<ExtractValueInst>(I2)->idx_begin()[i])
- return false;
- return true;
- }
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(I1))
+ return IVI->getIndices() == cast<InsertValueInst>(I2)->getIndices();
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I1))
+ return EVI->getIndices() == cast<ExtractValueInst>(I2)->getIndices();
+ if (const FenceInst *FI = dyn_cast<FenceInst>(I1))
+ return FI->getOrdering() == cast<FenceInst>(I2)->getOrdering() &&
+ FI->getSynchScope() == cast<FenceInst>(I2)->getSynchScope();
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I1))
+ return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I2)->isVolatile() &&
+ CXI->getOrdering() == cast<AtomicCmpXchgInst>(I2)->getOrdering() &&
+ CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I2)->getSynchScope();
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I1))
+ return RMWI->getOperation() == cast<AtomicRMWInst>(I2)->getOperation() &&
+ RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() &&
+ RMWI->getOrdering() == cast<AtomicRMWInst>(I2)->getOrdering() &&
+ RMWI->getSynchScope() == cast<AtomicRMWInst>(I2)->getSynchScope();
return true;
}
@@ -346,9 +350,9 @@ bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1,
SmallVector<Value *, 8> Indices1(GEP1->idx_begin(), GEP1->idx_end());
SmallVector<Value *, 8> Indices2(GEP2->idx_begin(), GEP2->idx_end());
uint64_t Offset1 = TD->getIndexedOffset(GEP1->getPointerOperandType(),
- Indices1.data(), Indices1.size());
+ Indices1);
uint64_t Offset2 = TD->getIndexedOffset(GEP2->getPointerOperandType(),
- Indices2.data(), Indices2.size());
+ Indices2);
return Offset1 == Offset2;
}
@@ -725,7 +729,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
SmallVector<Value *, 16> Args;
unsigned i = 0;
- const FunctionType *FFTy = F->getFunctionType();
+ FunctionType *FFTy = F->getFunctionType();
for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end();
AI != AE; ++AI) {
Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i)));
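
For context on the first hunk above: profileFunction builds a cheap,
comparison-invariant hash so only functions with identical profiles are ever
compared instruction by instruction. A minimal sketch of the idea, using only
properties visible in the hunk (the real pass hashes more fields):

#include "llvm/ADT/FoldingSet.h"
#include "llvm/Function.h"

// Any two functions that could merge must hash to the same bucket here.
static unsigned profileSketch(const llvm::Function *F) {
  llvm::FoldingSetNodeID ID;
  ID.AddInteger(F->size());            // basic block count
  ID.AddInteger(F->getCallingConv());  // calling conventions must agree
  ID.AddInteger(F->arg_size());        // as must the arity
  return ID.ComputeHash();
}
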
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
new file mode 100644
index 0000000..8fdfd72
--- /dev/null
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -0,0 +1,343 @@
+//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PassManagerBuilder class, which is used to set up a
+// "standard" optimization sequence suitable for languages like C and C++.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+
+#include "llvm/PassManager.h"
+#include "llvm/DefaultPasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+
+PassManagerBuilder::PassManagerBuilder() {
+ OptLevel = 2;
+ SizeLevel = 0;
+ LibraryInfo = 0;
+ Inliner = 0;
+ DisableSimplifyLibCalls = false;
+ DisableUnitAtATime = false;
+ DisableUnrollLoops = false;
+}
+
+PassManagerBuilder::~PassManagerBuilder() {
+ delete LibraryInfo;
+ delete Inliner;
+}
+
+/// Set of global extensions, automatically added as part of the standard set.
+static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy,
+ PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions;
+
+void PassManagerBuilder::addGlobalExtension(
+ PassManagerBuilder::ExtensionPointTy Ty,
+ PassManagerBuilder::ExtensionFn Fn) {
+ GlobalExtensions->push_back(std::make_pair(Ty, Fn));
+}
+
+void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
+ Extensions.push_back(std::make_pair(Ty, Fn));
+}
+
+void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
+ PassManagerBase &PM) const {
+ for (unsigned i = 0, e = GlobalExtensions->size(); i != e; ++i)
+ if ((*GlobalExtensions)[i].first == ETy)
+ (*GlobalExtensions)[i].second(*this, PM);
+ for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
+ if (Extensions[i].first == ETy)
+ Extensions[i].second(*this, PM);
+}
+
+void
+PassManagerBuilder::addInitialAliasAnalysisPasses(PassManagerBase &PM) const {
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ PM.add(createTypeBasedAliasAnalysisPass());
+ PM.add(createBasicAliasAnalysisPass());
+}
+
+void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) {
+ addExtensionsToPM(EP_EarlyAsPossible, FPM);
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo) FPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+ if (OptLevel == 0) return;
+
+ addInitialAliasAnalysisPasses(FPM);
+
+ FPM.add(createCFGSimplificationPass());
+ FPM.add(createScalarReplAggregatesPass());
+ FPM.add(createEarlyCSEPass());
+ FPM.add(createLowerExpectIntrinsicPass());
+}
+
+void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
+ // If all optimizations are disabled, just run the always-inline pass.
+ if (OptLevel == 0) {
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = 0;
+ }
+ return;
+ }
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo) MPM.add(new TargetLibraryInfo(*LibraryInfo));
+
+ addInitialAliasAnalysisPasses(MPM);
+
+ if (!DisableUnitAtATime) {
+ MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
+
+ MPM.add(createIPSCCPPass()); // IP SCCP
+ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+
+ MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
+ MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
+ }
+
+ // Start of CallGraph SCC passes.
+ if (!DisableUnitAtATime)
+ MPM.add(createPruneEHPass()); // Remove dead EH info
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = 0;
+ }
+ if (!DisableUnitAtATime)
+ MPM.add(createFunctionAttrsPass()); // Set readonly/readnone attrs
+ if (OptLevel > 2)
+ MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
+
+ // Start of function pass.
+ // Break up aggregate allocas, using SSAUpdater.
+ MPM.add(createScalarReplAggregatesPass(-1, false));
+ MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ if (!DisableSimplifyLibCalls)
+ MPM.add(createSimplifyLibCallsPass()); // Library Call Optimizations
+ MPM.add(createJumpThreadingPass()); // Thread jumps.
+ MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Combine silly seq's
+
+ MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createReassociatePass()); // Reassociate expressions
+ MPM.add(createLoopRotatePass()); // Rotate Loop
+ MPM.add(createLICMPass()); // Hoist loop invariants
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+ MPM.add(createInstructionCombiningPass());
+ MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+ MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
+ MPM.add(createLoopDeletionPass()); // Delete dead loops
+ if (!DisableUnrollLoops)
+ MPM.add(createLoopUnrollPass()); // Unroll small loops
+ addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+
+ if (OptLevel > 1)
+ MPM.add(createGVNPass()); // Remove redundancies
+ MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
+ MPM.add(createSCCPPass()); // Constant prop with SCCP
+
+ // Run instcombine after the redundancy elimination passes to exploit the
+ // opportunities they open up.
+ MPM.add(createInstructionCombiningPass());
+ MPM.add(createJumpThreadingPass()); // Thread jumps
+ MPM.add(createCorrelatedValuePropagationPass());
+ MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
+
+ addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
+
+ MPM.add(createAggressiveDCEPass()); // Delete dead instructions
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Clean up after everything.
+
+ if (!DisableUnitAtATime) {
+ // FIXME: We shouldn't bother with this anymore.
+ MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+
+ // GlobalOpt already deletes dead functions and globals; at -O3, try a
+ // late pass of GlobalDCE. It is capable of deleting dead cycles.
+ if (OptLevel > 2)
+ MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
+
+ if (OptLevel > 1)
+ MPM.add(createConstantMergePass()); // Merge dup global constants
+ }
+}
+
+void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
+ bool Internalize,
+ bool RunInliner) {
+ // Provide AliasAnalysis services for optimizations.
+ addInitialAliasAnalysisPasses(PM);
+
+ // Now that the composite module has been compiled, scan through it, looking
+ // for a main function. If main is defined, mark all other functions
+ // internal.
+ if (Internalize)
+ PM.add(createInternalizePass(true));
+
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ PM.add(createIPSCCPPass());
+
+ // Now that we internalized some globals, see if we can hack on them!
+ PM.add(createGlobalOptimizerPass());
+
+ // Linking modules together can lead to duplicated global constants, only
+ // keep one copy of each constant.
+ PM.add(createConstantMergePass());
+
+ // Remove unused arguments from functions.
+ PM.add(createDeadArgEliminationPass());
+
+ // Reduce the code after globalopt and ipsccp. Both can open up significant
+ // simplification opportunities, and both can propagate functions through
+ // function pointers. When this happens, we often have to resolve varargs
+ // calls, etc, so let instcombine do this.
+ PM.add(createInstructionCombiningPass());
+
+ // Inline small functions
+ if (RunInliner)
+ PM.add(createFunctionInliningPass());
+
+ PM.add(createPruneEHPass()); // Remove dead EH info.
+
+ // Optimize globals again if we ran the inliner.
+ if (RunInliner)
+ PM.add(createGlobalOptimizerPass());
+ PM.add(createGlobalDCEPass()); // Remove dead functions.
+
+ // If we didn't decide to inline a function, check to see if we can
+ // transform it to pass arguments by value instead of by reference.
+ PM.add(createArgumentPromotionPass());
+
+ // The IPO passes may leave cruft around. Clean up after them.
+ PM.add(createInstructionCombiningPass());
+ PM.add(createJumpThreadingPass());
+ // Break up allocas
+ PM.add(createScalarReplAggregatesPass());
+
+ // Run a few AA driven optimizations here and now, to cleanup the code.
+ PM.add(createFunctionAttrsPass()); // Add nocapture.
+ PM.add(createGlobalsModRefPass()); // IP alias analysis.
+
+ PM.add(createLICMPass()); // Hoist loop invariants.
+ PM.add(createGVNPass()); // Remove redundancies.
+ PM.add(createMemCpyOptPass()); // Remove dead memcpys.
+ // Nuke dead stores.
+ PM.add(createDeadStoreEliminationPass());
+
+ // Cleanup and simplify the code after the scalar optimizations.
+ PM.add(createInstructionCombiningPass());
+
+ PM.add(createJumpThreadingPass());
+
+ // Delete basic blocks, which optimization passes may have killed.
+ PM.add(createCFGSimplificationPass());
+
+ // Now that we have optimized the program, discard unreachable functions.
+ PM.add(createGlobalDCEPass());
+}
+
+LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate(void) {
+ PassManagerBuilder *PMB = new PassManagerBuilder();
+ return wrap(PMB);
+}
+
+void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ delete Builder;
+}
+
+void
+LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned OptLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->OptLevel = OptLevel;
+}
+
+void
+LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned SizeLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->SizeLevel = SizeLevel;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->DisableUnitAtATime = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->DisableUnrollLoops = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->DisableSimplifyLibCalls = Value;
+}
+
+void
+LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB,
+ unsigned Threshold) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->Inliner = createFunctionInliningPass(Threshold);
+}
+
+void
+LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ FunctionPassManager *FPM = unwrap<FunctionPassManager>(PM);
+ Builder->populateFunctionPassManager(*FPM);
+}
+
+void
+LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ PassManagerBase *MPM = unwrap(PM);
+ Builder->populateModulePassManager(*MPM);
+}
+
+void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM,
+ bool Internalize,
+ bool RunInliner) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ PassManagerBase *LPM = unwrap(PM);
+ Builder->populateLTOPassManager(*LPM, Internalize, RunInliner);
+}
+
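
Taken together, the file added above packages the pass lists that tools such
as opt previously hard-coded. A hedged usage sketch against the 3.0 C++ API;
the module M, the inliner threshold of 225, and the split into two managers
are illustrative choices, not requirements of the patch:

#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
using namespace llvm;

void optimizeModule(Module &M) {
  PassManagerBuilder Builder;
  Builder.OptLevel = 2;                              // roughly -O2
  Builder.Inliner = createFunctionInliningPass(225); // threshold illustrative

  FunctionPassManager FPM(&M);  // early per-function cleanups
  Builder.populateFunctionPassManager(FPM);
  PassManager MPM;              // the main module-level pipeline
  Builder.populateModulePassManager(MPM);

  FPM.doInitialization();
  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
    FPM.run(*F);
  FPM.doFinalization();
  MPM.run(M);
}

The C functions at the end of the file expose the same flow one call at a
time: create the builder, set the knobs, populate a pass manager, dispose.
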
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index b7e63dc..cbb80f0 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -101,8 +101,9 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
// Check to see if this function performs an unwind or calls an
// unwinding function.
for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- if (CheckUnwind && isa<UnwindInst>(BB->getTerminator())) {
- // Uses unwind!
+ if (CheckUnwind && (isa<UnwindInst>(BB->getTerminator()) ||
+ isa<ResumeInst>(BB->getTerminator()))) {
+ // Uses unwind / resume!
SCCMightUnwind = true;
} else if (CheckReturn && isa<ReturnInst>(BB->getTerminator())) {
SCCMightReturn = true;
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 0fbaff1..b5caa9a 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -180,7 +180,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
StructType *STy = StructTypes[i];
- if (STy->isAnonymous() || STy->getName().empty()) continue;
+ if (STy->isLiteral() || STy->getName().empty()) continue;
if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
continue;
diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt
index d070ccc..a46d5ad 100644
--- a/lib/Transforms/InstCombine/CMakeLists.txt
+++ b/lib/Transforms/InstCombine/CMakeLists.txt
@@ -13,3 +13,11 @@ add_llvm_library(LLVMInstCombine
InstCombineSimplifyDemanded.cpp
InstCombineVectorOps.cpp
)
+
+add_llvm_library_dependencies(LLVMInstCombine
+ LLVMAnalysis
+ LLVMCore
+ LLVMSupport
+ LLVMTarget
+ LLVMTransformUtils
+ )
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 8257d6b..3808278 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -11,6 +11,7 @@
#define INSTCOMBINE_INSTCOMBINE_H
#include "InstCombineWorklist.h"
+#include "llvm/IntrinsicInst.h"
#include "llvm/Operator.h"
#include "llvm/Pass.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -103,7 +104,7 @@ public:
//
Instruction *visitAdd(BinaryOperator &I);
Instruction *visitFAdd(BinaryOperator &I);
- Value *OptimizePointerDifference(Value *LHS, Value *RHS, const Type *Ty);
+ Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty);
Instruction *visitSub(BinaryOperator &I);
Instruction *visitFSub(BinaryOperator &I);
Instruction *visitMul(BinaryOperator &I);
@@ -192,15 +193,16 @@ public:
Instruction *visitExtractElementInst(ExtractElementInst &EI);
Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
Instruction *visitExtractValueInst(ExtractValueInst &EV);
+ Instruction *visitLandingPadInst(LandingPadInst &LI);
// visitInstruction - Specify what to return for unhandled instructions...
Instruction *visitInstruction(Instruction &I) { return 0; }
private:
- bool ShouldChangeType(const Type *From, const Type *To) const;
+ bool ShouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
Value *dyn_castFNegVal(Value *V) const;
- const Type *FindElementAtOffset(const Type *Ty, int64_t Offset,
+ Type *FindElementAtOffset(Type *Ty, int64_t Offset,
SmallVectorImpl<Value*> &NewIndices);
Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
@@ -209,12 +211,13 @@ private:
/// the cast can be eliminated by some other simple transformation, we prefer
/// to do the simplification first.
bool ShouldOptimizeCast(Instruction::CastOps opcode,const Value *V,
- const Type *Ty);
+ Type *Ty);
Instruction *visitCallSite(CallSite CS);
Instruction *tryOptimizeCall(CallInst *CI, const TargetData *TD);
bool transformConstExprCastCall(CallSite CS);
- Instruction *transformCallThroughTrampoline(CallSite CS);
+ Instruction *transformCallThroughTrampoline(CallSite CS,
+ IntrinsicInst *Tramp);
Instruction *transformZExtICmp(ICmpInst *ICI, Instruction &CI,
bool DoXform = true);
Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
@@ -357,7 +360,7 @@ private:
Instruction *SimplifyMemSet(MemSetInst *MI);
- Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned);
+ Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
};
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index c36a955..d10046c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -188,7 +188,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
return BinaryOperator::CreateMul(LHS, AddOne(C2));
// A+B --> A|B iff A and B have no bits set in common.
- if (const IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
+ if (IntegerType *IT = dyn_cast<IntegerType>(I.getType())) {
APInt Mask = APInt::getAllOnesValue(IT->getBitWidth());
APInt LHSKnownOne(IT->getBitWidth(), 0);
APInt LHSKnownZero(IT->getBitWidth(), 0);
@@ -401,7 +401,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
Value *InstCombiner::EmitGEPOffset(User *GEP) {
TargetData &TD = *getTargetData();
gep_type_iterator GTI = gep_type_begin(GEP);
- const Type *IntPtrTy = TD.getIntPtrType(GEP->getContext());
+ Type *IntPtrTy = TD.getIntPtrType(GEP->getContext());
Value *Result = Constant::getNullValue(IntPtrTy);
// If the GEP is inbounds, we know that none of the addressing operations will
@@ -420,7 +420,7 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) {
if (OpC->isZero()) continue;
// Handle a struct index, which adds its field offset to the pointer.
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
if (Size)
@@ -460,7 +460,7 @@ Value *InstCombiner::EmitGEPOffset(User *GEP) {
/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
///
Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
- const Type *Ty) {
+ Type *Ty) {
assert(TD && "Must have target data info for this");
// If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
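
A worked instance of the A+B --> A|B rewrite in the first hunk of this file's
diff: once the known-bits computation proves A & B == 0, no bit column can
generate a carry, so addition and bitwise or coincide. For example,
0b0101 + 0b1010 = 0b1111 = 0b0101 | 0b1010, whereas 0b0011 + 0b0001 produces
a carry and the rewrite does not fire.
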
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 64ea36f..5e0bfe8 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1174,30 +1174,31 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
((A == C && B == D) || (A == D && B == C)))
return BinaryOperator::CreateXor(A, B);
- if (Op0->hasOneUse() &&
- match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
- if (A == Op1) { // (A^B)&A -> A&(A^B)
- I.swapOperands(); // Simplify below
- std::swap(Op0, Op1);
- } else if (B == Op1) { // (A^B)&B -> B&(B^A)
- cast<BinaryOperator>(Op0)->swapOperands();
- I.swapOperands(); // Simplify below
- std::swap(Op0, Op1);
+ // A&(A^B) => A & ~B
+ {
+ Value *tmpOp0 = Op0;
+ Value *tmpOp1 = Op1;
+ if (Op0->hasOneUse() &&
+ match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1 || B == Op1 ) {
+ tmpOp1 = Op0;
+ tmpOp0 = Op1;
+ // Simplify below
+ }
}
- }
- if (Op1->hasOneUse() &&
- match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
- if (B == Op0) { // B&(A^B) -> B&(B^A)
- cast<BinaryOperator>(Op1)->swapOperands();
- std::swap(A, B);
+ if (tmpOp1->hasOneUse() &&
+ match(tmpOp1, m_Xor(m_Value(A), m_Value(B)))) {
+ if (B == tmpOp0) {
+ std::swap(A, B);
+ }
+ // Notice that the pattern (A&(~B)) is actually (A&(-1^B)), so if
+ // A is originally -1 (or a vector of -1 and undefs), then we enter
+ // an endless loop. By checking that A is non-constant we ensure that
+ // we will never get to the loop.
+ if (A == tmpOp0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B
+ return BinaryOperator::CreateAnd(A, Builder->CreateNot(B));
}
- // Notice that the pattern (A&(~B)) is actually (A&(-1^B)), so if
- // A is originally -1 (or a vector of -1 and undefs), then we enter
- // an endless loop. By checking that A is non-constant we ensure that
- // we will never get to the loop.
- if (A == Op0 && !isa<Constant>(A)) // A&(A^B) -> A & ~B
- return BinaryOperator::CreateAnd(A, Builder->CreateNot(B, "tmp"));
}
// (A&((~A)|B)) -> A&B
@@ -1224,7 +1225,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
// fold (and (cast A), (cast B)) -> (cast (and A, B))
if (CastInst *Op0C = dyn_cast<CastInst>(Op0))
if (CastInst *Op1C = dyn_cast<CastInst>(Op1)) {
- const Type *SrcTy = Op0C->getOperand(0)->getType();
+ Type *SrcTy = Op0C->getOperand(0)->getType();
if (Op0C->getOpcode() == Op1C->getOpcode() && // same cast kind ?
SrcTy == Op1C->getOperand(0)->getType() &&
SrcTy->isIntOrIntVectorTy()) {
@@ -2008,7 +2009,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
CastInst *Op1C = dyn_cast<CastInst>(Op1);
if (Op1C && Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ?
- const Type *SrcTy = Op0C->getOperand(0)->getType();
+ Type *SrcTy = Op0C->getOperand(0)->getType();
if (SrcTy == Op1C->getOperand(0)->getType() &&
SrcTy->isIntOrIntVectorTy()) {
Value *Op0COp = Op0C->getOperand(0), *Op1COp = Op1C->getOperand(0);
@@ -2227,14 +2228,14 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (A == Op1) // (B|A)^B == (A|B)^B
std::swap(A, B);
if (B == Op1) // (A|B)^B == A & ~B
- return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1, "tmp"));
+ return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1));
} else if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
Op0I->hasOneUse()){
if (A == Op1) // (A&B)^A -> (B&A)^A
std::swap(A, B);
if (B == Op1 && // (B&A)^A == ~B & A
!isa<ConstantInt>(Op1)) { // Canonical form is (B&C)^C
- return BinaryOperator::CreateAnd(Builder->CreateNot(A, "tmp"), Op1);
+ return BinaryOperator::CreateAnd(Builder->CreateNot(A), Op1);
}
}
}
@@ -2288,7 +2289,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind?
- const Type *SrcTy = Op0C->getOperand(0)->getType();
+ Type *SrcTy = Op0C->getOperand(0)->getType();
if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isIntegerTy() &&
// Only do this if the casts both really cause code to be generated.
ShouldOptimizeCast(Op0C->getOpcode(), Op0C->getOperand(0),
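
The restructured combine above rests on the identity A & (A ^ B) = A & ~B:
expanding the xor, A & ((A & ~B) | (~A & B)) = (A & ~B) | (A & ~A & B)
= A & ~B. The non-constant check on A matters because with A = -1 the result
~B is exactly -1 ^ B, which re-matches the m_Xor pattern and would cycle
forever, as the comment notes.
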
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 537f2b3..c7b3ff8 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Analysis/MemoryBuiltins.h"
@@ -22,8 +21,8 @@ using namespace llvm;
/// getPromotedType - Return the specified type promoted as it would be to pass
/// through a va_arg area.
-static const Type *getPromotedType(const Type *Ty) {
- if (const IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
+static Type *getPromotedType(Type *Ty) {
+ if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
if (ITy->getBitWidth() < 32)
return Type::getInt32Ty(Ty->getContext());
}
@@ -64,7 +63,7 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
unsigned DstAddrSp =
cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
- const IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
+ IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
@@ -76,18 +75,18 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
// integer datatype.
Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
if (StrippedDest != MI->getArgOperand(0)) {
- const Type *SrcETy = cast<PointerType>(StrippedDest->getType())
+ Type *SrcETy = cast<PointerType>(StrippedDest->getType())
->getElementType();
if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
// The SrcETy might be something like {{{double}}} or [1 x double]. Rip
// down through these levels if so.
while (!SrcETy->isSingleValueType()) {
- if (const StructType *STy = dyn_cast<StructType>(SrcETy)) {
+ if (StructType *STy = dyn_cast<StructType>(SrcETy)) {
if (STy->getNumElements() == 1)
SrcETy = STy->getElementType(0);
else
break;
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) {
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) {
if (ATy->getNumElements() == 1)
SrcETy = ATy->getElementType();
else
@@ -142,7 +141,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
// memset(s,c,n) -> store s, c (for n=1,2,4,8)
if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
- const Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8.
+ Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8.
Value *Dest = MI->getDest();
unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
@@ -250,7 +249,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// We need target data for just about everything so depend on it.
if (!TD) break;
- const Type *ReturnTy = CI.getType();
+ Type *ReturnTy = CI.getType();
uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL;
// Get to the real allocated thing and offset as fast as possible.
@@ -266,8 +265,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Get the current byte offset into the thing. Use the original
// operand in case we're looking through a bitcast.
SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end());
- Offset = TD->getIndexedOffset(GEP->getPointerOperandType(),
- Ops.data(), Ops.size());
+ Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops);
Op1 = GEP->getPointerOperand()->stripPointerCasts();
@@ -300,7 +298,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
} else if (CallInst *MI = extractMallocCall(Op1)) {
// Get allocation size.
- const Type* MallocType = getMallocAllocatedType(MI);
+ Type* MallocType = getMallocAllocatedType(MI);
if (MallocType && MallocType->isSized())
if (Value *NElems = getMallocArraySize(MI, TD, true))
if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
@@ -355,7 +353,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::cttz: {
// If all bits below the first known one are known zero,
// this value is constant.
- const IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
+ IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
// FIXME: Try to simplify vectors of integers.
if (!IT) break;
uint32_t BitWidth = IT->getBitWidth();
@@ -374,7 +372,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ctlz: {
// If all bits above the first known one are known zero,
// this value is constant.
- const IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
+ IntegerType *IT = dyn_cast<IntegerType>(II->getArgOperand(0)->getType());
// FIXME: Try to simplify vectors of integers.
if (!IT) break;
uint32_t BitWidth = IT->getBitWidth();
@@ -392,7 +390,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::uadd_with_overflow: {
Value *LHS = II->getArgOperand(0), *RHS = II->getArgOperand(1);
- const IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType());
+ IntegerType *IT = cast<IntegerType>(II->getArgOperand(0)->getType());
uint32_t BitWidth = IT->getBitWidth();
APInt Mask = APInt::getSignBit(BitWidth);
APInt LHSKnownZero(BitWidth, 0);
@@ -416,7 +414,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
UndefValue::get(LHS->getType()),
ConstantInt::getTrue(II->getContext())
};
- const StructType *ST = cast<StructType>(II->getType());
+ StructType *ST = cast<StructType>(II->getType());
Constant *Struct = ConstantStruct::get(ST, V);
return InsertValueInst::Create(Struct, Add, 0);
}
@@ -430,7 +428,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
UndefValue::get(LHS->getType()),
ConstantInt::getFalse(II->getContext())
};
- const StructType *ST = cast<StructType>(II->getType());
+ StructType *ST = cast<StructType>(II->getType());
Constant *Struct = ConstantStruct::get(ST, V);
return InsertValueInst::Create(Struct, Add, 0);
}
@@ -559,7 +557,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_altivec_stvxl:
// Turn stvx -> store if the pointer is known aligned.
if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, TD) >= 16) {
- const Type *OpPtrTy =
+ Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(1), OpPtrTy);
return new StoreInst(II->getArgOperand(0), Ptr);
@@ -570,7 +568,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_sse2_storeu_dq:
// Turn X86 storeu -> store if the pointer is known aligned.
if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, TD) >= 16) {
- const Type *OpPtrTy =
+ Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(1)->getType());
Value *Ptr = Builder->CreateBitCast(II->getArgOperand(0), OpPtrTy);
return new StoreInst(II->getArgOperand(1), Ptr);
@@ -656,15 +654,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (ExtractedElts[Idx] == 0) {
ExtractedElts[Idx] =
- Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1,
- ConstantInt::get(Type::getInt32Ty(II->getContext()),
- Idx&15, false), "tmp");
+ Builder->CreateExtractElement(Idx < 16 ? Op0 : Op1,
+ Builder->getInt32(Idx&15));
}
// Insert this value into the result vector.
Result = Builder->CreateInsertElement(Result, ExtractedElts[Idx],
- ConstantInt::get(Type::getInt32Ty(II->getContext()),
- i, false), "tmp");
+ Builder->getInt32(i));
}
return CastInst::Create(Instruction::BitCast, Result, CI.getType());
}
@@ -733,9 +729,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
}
- // If the stack restore is in a return/unwind block and if there are no
- // allocas or calls between the restore and the return, nuke the restore.
- if (!CannotRemove && (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)))
+ // If the stack restore is in a return, resume, or unwind block and if there
+ // are no allocas or calls between the restore and the return, nuke the
+ // restore.
+ if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI) ||
+ isa<UnwindInst>(TI)))
return EraseInstFromFunction(CI);
break;
}
@@ -765,9 +763,9 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
if (!CS.paramHasAttr(ix, Attribute::ByVal))
return true;
- const Type* SrcTy =
+ Type* SrcTy =
cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
- const Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
+ Type* DstTy = cast<PointerType>(CI->getType())->getElementType();
if (!SrcTy->isSized() || !DstTy->isSized())
return false;
if (!TD || TD->getTypeAllocSize(SrcTy) != TD->getTypeAllocSize(DstTy))
@@ -820,6 +818,83 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) {
return Simplifier.NewInstruction;
}
+static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) {
+ // Strip off at most one level of pointer casts, looking for an alloca. This
+ // is good enough in practice and simpler than handling any number of casts.
+ Value *Underlying = TrampMem->stripPointerCasts();
+ if (Underlying != TrampMem &&
+ (!Underlying->hasOneUse() || *Underlying->use_begin() != TrampMem))
+ return 0;
+ if (!isa<AllocaInst>(Underlying))
+ return 0;
+
+ IntrinsicInst *InitTrampoline = 0;
+ for (Value::use_iterator I = TrampMem->use_begin(), E = TrampMem->use_end();
+ I != E; I++) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(*I);
+ if (!II)
+ return 0;
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
+ if (InitTrampoline)
+ // More than one init_trampoline writes to this value. Give up.
+ return 0;
+ InitTrampoline = II;
+ continue;
+ }
+ if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
+ // Allow any number of calls to adjust.trampoline.
+ continue;
+ return 0;
+ }
+
+ // No call to init.trampoline found.
+ if (!InitTrampoline)
+ return 0;
+
+ // Check that the alloca is being used in the expected way.
+ if (InitTrampoline->getOperand(0) != TrampMem)
+ return 0;
+
+ return InitTrampoline;
+}
+
+static IntrinsicInst *FindInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
+ Value *TrampMem) {
+ // Visit all the previous instructions in the basic block, and try to find an
+ // init.trampoline which has a direct path to the adjust.trampoline.
+ for (BasicBlock::iterator I = AdjustTramp,
+ E = AdjustTramp->getParent()->begin(); I != E; ) {
+ Instruction *Inst = --I;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
+ II->getOperand(0) == TrampMem)
+ return II;
+ if (Inst->mayWriteToMemory())
+ return 0;
+ }
+ return 0;
+}
+
+// Given a call to llvm.adjust.trampoline, find and return the corresponding
+// call to llvm.init.trampoline if the call to the trampoline can be optimized
+// to a direct call to a function. Otherwise return NULL.
+//
+static IntrinsicInst *FindInitTrampoline(Value *Callee) {
+ Callee = Callee->stripPointerCasts();
+ IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
+ if (!AdjustTramp ||
+ AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
+ return 0;
+
+ Value *TrampMem = AdjustTramp->getOperand(0);
+
+ if (IntrinsicInst *IT = FindInitTrampolineFromAlloca(TrampMem))
+ return IT;
+ if (IntrinsicInst *IT = FindInitTrampolineFromBB(AdjustTramp, TrampMem))
+ return IT;
+ return 0;
+}
+
// visitCallSite - Improvements for call and invoke instructions.
//
Instruction *InstCombiner::visitCallSite(CallSite CS) {
@@ -879,13 +954,11 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
return EraseInstFromFunction(*CS.getInstruction());
}
- if (BitCastInst *BC = dyn_cast<BitCastInst>(Callee))
- if (IntrinsicInst *In = dyn_cast<IntrinsicInst>(BC->getOperand(0)))
- if (In->getIntrinsicID() == Intrinsic::init_trampoline)
- return transformCallThroughTrampoline(CS);
+ if (IntrinsicInst *II = FindInitTrampoline(Callee))
+ return transformCallThroughTrampoline(CS, II);
- const PointerType *PTy = cast<PointerType>(Callee->getType());
- const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
if (FTy->isVarArg()) {
int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1);
// See if we can optimize any arguments passed through the varargs area of
@@ -934,9 +1007,9 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// would cause a type conversion of one of our arguments, change this call to
// be a direct call with arguments cast to the appropriate types.
//
- const FunctionType *FT = Callee->getFunctionType();
- const Type *OldRetTy = Caller->getType();
- const Type *NewRetTy = FT->getReturnType();
+ FunctionType *FT = Callee->getFunctionType();
+ Type *OldRetTy = Caller->getType();
+ Type *NewRetTy = FT->getReturnType();
if (NewRetTy->isStructTy())
return false; // TODO: Handle multiple return values.
@@ -982,8 +1055,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
CallSite::arg_iterator AI = CS.arg_begin();
for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
- const Type *ParamTy = FT->getParamType(i);
- const Type *ActTy = (*AI)->getType();
+ Type *ParamTy = FT->getParamType(i);
+ Type *ActTy = (*AI)->getType();
if (!CastInst::isCastable(ActTy, ParamTy))
return false; // Cannot transform this parameter value.
@@ -995,11 +1068,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// If the parameter is passed as a byval argument, then we have to have a
// sized type and the sized type has to have the same size as the old type.
if (ParamTy != ActTy && (Attrs & Attribute::ByVal)) {
- const PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
+ PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
return false;
- const Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
+ Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
if (TD->getTypeAllocSize(CurElTy) !=
TD->getTypeAllocSize(ParamPTy->getElementType()))
return false;
@@ -1023,7 +1096,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
// If the callee is just a declaration, don't change the varargsness of the
// call. We don't want to introduce a varargs call where one doesn't
// already exist.
- const PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType());
+ PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType());
if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
return false;
}
@@ -1062,13 +1135,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
AI = CS.arg_begin();
for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
- const Type *ParamTy = FT->getParamType(i);
+ Type *ParamTy = FT->getParamType(i);
if ((*AI)->getType() == ParamTy) {
Args.push_back(*AI);
} else {
Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
false, ParamTy, false);
- Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy, "tmp"));
+ Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy));
}
// Add any parameter attributes.
@@ -1089,12 +1162,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
} else {
// Add all of the arguments in their promoted form to the arg list.
for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
- const Type *PTy = getPromotedType((*AI)->getType());
+ Type *PTy = getPromotedType((*AI)->getType());
if (PTy != (*AI)->getType()) {
// Must promote to pass through va_arg area!
Instruction::CastOps opcode =
CastInst::getCastOpcode(*AI, false, PTy, false);
- Args.push_back(Builder->CreateCast(opcode, *AI, PTy, "tmp"));
+ Args.push_back(Builder->CreateCast(opcode, *AI, PTy));
} else {
Args.push_back(*AI);
}
@@ -1138,13 +1211,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (!NV->getType()->isVoidTy()) {
Instruction::CastOps opcode =
CastInst::getCastOpcode(NC, false, OldRetTy, false);
- NV = NC = CastInst::Create(opcode, NC, OldRetTy, "tmp");
+ NV = NC = CastInst::Create(opcode, NC, OldRetTy);
NC->setDebugLoc(Caller->getDebugLoc());
// If this is an invoke instruction, we should insert it after the first
 // non-PHI instruction in the normal successor block.
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
- BasicBlock::iterator I = II->getNormalDest()->getFirstNonPHI();
+ BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
InsertNewInstBefore(NC, *I);
} else {
// Otherwise, it's a call, just insert cast right after the call.
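// Editor's note: illustrative sketch, not part of the imported diff. The
// getFirstNonPHI -> getFirstInsertionPt switch above matters under the new
// LLVM 3.0 exception handling: a landingpad instruction is pinned to the
// head of its block, so a safe insertion point must skip it along with the
// PHIs. A minimal stand-in for that scan, over opcode names:
#include <cassert>
#include <string>
#include <vector>

static size_t firstInsertionPt(const std::vector<std::string> &BB) {
  size_t i = 0;
  while (i < BB.size() && (BB[i] == "phi" || BB[i] == "landingpad"))
    ++i; // PHIs and a landingpad stay grouped at the block head.
  return i;
}

int main() {
  std::vector<std::string> BB = {"phi", "landingpad", "add", "ret"};
  assert(firstInsertionPt(BB) == 2); // first slot where the cast may go
  return 0;
}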
@@ -1163,13 +1236,16 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
return true;
}
-// transformCallThroughTrampoline - Turn a call to a function created by the
-// init_trampoline intrinsic into a direct call to the underlying function.
+// transformCallThroughTrampoline - Turn a call to a function created by
+// init_trampoline / adjust_trampoline intrinsic pair into a direct call to the
+// underlying function.
//
-Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
+Instruction *
+InstCombiner::transformCallThroughTrampoline(CallSite CS,
+ IntrinsicInst *Tramp) {
Value *Callee = CS.getCalledValue();
- const PointerType *PTy = cast<PointerType>(Callee->getType());
- const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
const AttrListPtr &Attrs = CS.getAttributes();
// If the call already has the 'nest' attribute somewhere then give up -
@@ -1177,12 +1253,12 @@ Instruction *InstCombiner::transformCallThroughTrampoline(CallSite CS) {
if (Attrs.hasAttrSomewhere(Attribute::Nest))
return 0;
- IntrinsicInst *Tramp =
- cast<IntrinsicInst>(cast<BitCastInst>(Callee)->getOperand(0));
+ assert(Tramp &&
+ "transformCallThroughTrampoline called with incorrect CallSite.");
Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
- const PointerType *NestFPTy = cast<PointerType>(NestF->getType());
- const FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
+ PointerType *NestFPTy = cast<PointerType>(NestF->getType());
+ FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
const AttrListPtr &NestAttrs = NestF->getAttributes();
if (!NestAttrs.isEmpty()) {
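// Editor's note: illustrative analogy in plain C++, not the LLVM API. A
// trampoline built by init_trampoline wraps a function taking a hidden
// 'nest' pointer; transformCallThroughTrampoline replaces the indirect call
// through the trampoline with a direct call that passes the nest argument
// explicitly. The same rewrite, modeled with a closure:
#include <cassert>
#include <functional>

static int underlying(int *nest, int x) { return *nest + x; }

int main() {
  int chain = 40;
  // Before: an opaque callable hiding the static chain (the "trampoline").
  std::function<int(int)> tramp =
      [&chain](int x) { return underlying(&chain, x); };
  assert(tramp(2) == 42);
  // After: a direct call with the nest argument made explicit.
  assert(underlying(&chain, 2) == 42);
  return 0;
}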
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 82c734e..f10e48a 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "InstCombine.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Target/TargetData.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
@@ -79,14 +80,14 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
// This requires TargetData to get the alloca alignment and size information.
if (!TD) return 0;
- const PointerType *PTy = cast<PointerType>(CI.getType());
+ PointerType *PTy = cast<PointerType>(CI.getType());
BuilderTy AllocaBuilder(*Builder);
AllocaBuilder.SetInsertPoint(AI.getParent(), &AI);
// Get the type really allocated and the type casted to.
- const Type *AllocElTy = AI.getAllocatedType();
- const Type *CastElTy = PTy->getElementType();
+ Type *AllocElTy = AI.getAllocatedType();
+ Type *CastElTy = PTy->getElementType();
if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0;
unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy);
@@ -121,13 +122,13 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
} else {
Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
// Insert before the alloca, not before the cast.
- Amt = AllocaBuilder.CreateMul(Amt, NumElements, "tmp");
+ Amt = AllocaBuilder.CreateMul(Amt, NumElements);
}
if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
Offset, true);
- Amt = AllocaBuilder.CreateAdd(Amt, Off, "tmp");
+ Amt = AllocaBuilder.CreateAdd(Amt, Off);
}
AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
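// Editor's note: the arithmetic behind the rewrite above, with assumed
// sizes. Turning "bitcast (alloca AllocElTy, N) to CastElTy*" into
// "alloca CastElTy, M" must preserve the byte count exactly; e.g. N i32
// elements viewed as i8 become M = N*4 elements:
#include <cassert>
int main() {
  unsigned AllocElTySize = 4, CastElTySize = 1, N = 10;
  assert(AllocElTySize % CastElTySize == 0); // only when sizes divide evenly
  unsigned Scale = AllocElTySize / CastElTySize;
  unsigned M = N * Scale;                        // Amt = NumElements * Scale
  assert(M * CastElTySize == N * AllocElTySize); // same allocation in bytes
  return 0;
}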
@@ -151,7 +152,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
/// EvaluateInDifferentType - Given an expression that
/// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually
/// insert the code to evaluate the expression.
-Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty,
+Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
bool isSigned) {
if (Constant *C = dyn_cast<Constant>(V)) {
C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
@@ -229,12 +230,12 @@ static Instruction::CastOps
isEliminableCastPair(
const CastInst *CI, ///< The first cast instruction
unsigned opcode, ///< The opcode of the second cast instruction
- const Type *DstTy, ///< The target type for the second cast instruction
+ Type *DstTy, ///< The target type for the second cast instruction
TargetData *TD ///< The target data for pointer size
) {
- const Type *SrcTy = CI->getOperand(0)->getType(); // A from above
- const Type *MidTy = CI->getType(); // B from above
+ Type *SrcTy = CI->getOperand(0)->getType(); // A from above
+ Type *MidTy = CI->getType(); // B from above
// Get the opcodes of the two Cast instructions
Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
@@ -260,7 +261,7 @@ isEliminableCastPair(
/// the cast can be eliminated by some other simple transformation, we prefer
/// to do the simplification first.
bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V,
- const Type *Ty) {
+ Type *Ty) {
// Noop casts and casts of constants should be eliminated trivially.
if (V->getType() == Ty || isa<Constant>(V)) return false;
@@ -324,7 +325,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
///
/// This function works on both vectors and scalars.
///
-static bool CanEvaluateTruncated(Value *V, const Type *Ty) {
+static bool CanEvaluateTruncated(Value *V, Type *Ty) {
// We can always evaluate constants in another type.
if (isa<Constant>(V))
return true;
@@ -332,7 +333,7 @@ static bool CanEvaluateTruncated(Value *V, const Type *Ty) {
Instruction *I = dyn_cast<Instruction>(V);
if (!I) return false;
- const Type *OrigTy = V->getType();
+ Type *OrigTy = V->getType();
// If this is an extension from the dest type, we can eliminate it, even if it
// has multiple uses.
@@ -435,7 +436,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return &CI;
Value *Src = CI.getOperand(0);
- const Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+ Type *DestTy = CI.getType(), *SrcTy = Src->getType();
// Attempt to truncate the entire input expression tree to the destination
// type. Only do this if the dest type is a simple type, don't convert the
@@ -456,7 +457,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// Canonicalize trunc x to i1 -> (icmp ne (and x, 1), 0), likewise for vector.
if (DestTy->getScalarSizeInBits() == 1) {
Constant *One = ConstantInt::get(Src->getType(), 1);
- Src = Builder->CreateAnd(Src, One, "tmp");
+ Src = Builder->CreateAnd(Src, One);
Value *Zero = Constant::getNullValue(Src->getType());
return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero);
}
@@ -518,7 +519,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
In->getType()->getScalarSizeInBits()-1);
In = Builder->CreateLShr(In, Sh, In->getName()+".lobit");
if (In->getType() != CI.getType())
- In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/, "tmp");
+ In = Builder->CreateIntCast(In, CI.getType(), false/*ZExt*/);
if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
Constant *One = ConstantInt::get(In->getType(), 1);
@@ -572,7 +573,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
if ((Op1CV != 0) == isNE) { // Toggle the low bit.
Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder->CreateXor(In, One, "tmp");
+ In = Builder->CreateXor(In, One);
}
if (CI.getType() == In->getType())
@@ -586,7 +587,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
// It is also profitable to transform icmp eq into not(xor(A, B)) because that
// may lead to additional simplifications.
if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) {
- if (const IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(CI.getType())) {
uint32_t BitWidth = ITy->getBitWidth();
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
@@ -644,7 +645,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI,
/// clear the top bits anyway, doing this has no extra cost.
///
/// This function works on both vectors and scalars.
-static bool CanEvaluateZExtd(Value *V, const Type *Ty, unsigned &BitsToClear) {
+static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
BitsToClear = 0;
if (isa<Constant>(V))
return true;
@@ -758,7 +759,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
return &CI;
Value *Src = CI.getOperand(0);
- const Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
// Attempt to extend the entire input expression tree to the destination
// type. Only do this if the dest type is a simple type, don't convert the
@@ -820,7 +821,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
AndValue));
}
if (SrcSize > DstSize) {
- Value *Trunc = Builder->CreateTrunc(A, CI.getType(), "tmp");
+ Value *Trunc = Builder->CreateTrunc(A, CI.getType());
APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
return BinaryOperator::CreateAnd(Trunc,
ConstantInt::get(Trunc->getType(),
@@ -867,7 +868,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
Value *TI0 = TI->getOperand(0);
if (TI0->getType() == CI.getType()) {
Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
- Value *NewAnd = Builder->CreateAnd(TI0, ZC, "tmp");
+ Value *NewAnd = Builder->CreateAnd(TI0, ZC);
return BinaryOperator::CreateXor(NewAnd, ZC);
}
}
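// Editor's note: standalone check of the identity these zext hunks lean on:
// zext(trunc(x)) only clears high bits, so it equals a mask; e.g.
// trunc i32->i8 followed by zext i8->i32 is "and x, 255".
#include <cassert>
#include <cstdint>
int main() {
  for (uint32_t x = 0; x < 100000; x += 37)
    assert((uint32_t)(uint8_t)x == (x & 0xffu));
  return 0;
}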
@@ -900,7 +901,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
Op0->getType()->getScalarSizeInBits()-1);
Value *In = Builder->CreateAShr(Op0, Sh, Op0->getName()+".lobit");
if (In->getType() != CI.getType())
- In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/, "tmp");
+ In = Builder->CreateIntCast(In, CI.getType(), true/*SExt*/);
if (Pred == ICmpInst::ICMP_SGT)
In = Builder->CreateNot(In, In->getName()+".not");
@@ -965,10 +966,10 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
}
// vector (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if signed.
- if (const VectorType *VTy = dyn_cast<VectorType>(CI.getType())) {
+ if (VectorType *VTy = dyn_cast<VectorType>(CI.getType())) {
if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_Zero()) &&
Op0->getType() == CI.getType()) {
- const Type *EltTy = VTy->getElementType();
+ Type *EltTy = VTy->getElementType();
// splat the shift constant to a constant vector.
Constant *VSh = ConstantInt::get(VTy, EltTy->getScalarSizeInBits()-1);
@@ -988,7 +989,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) {
///
/// This function works on both vectors and scalars.
///
-static bool CanEvaluateSExtd(Value *V, const Type *Ty) {
+static bool CanEvaluateSExtd(Value *V, Type *Ty) {
assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
"Can't sign extend type to a smaller type");
// If this is a constant, it can be trivially promoted.
@@ -1063,7 +1064,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
return &CI;
Value *Src = CI.getOperand(0);
- const Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
// Attempt to extend the entire input expression tree to the destination
// type. Only do this if the dest type is a simple type, don't convert the
@@ -1192,7 +1193,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
- const Type *SrcTy = OpI->getType();
+ Type *SrcTy = OpI->getType();
Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0));
Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1));
if (LHSTrunc->getType() != SrcTy &&
@@ -1306,13 +1307,13 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
if (CI.getOperand(0)->getType()->getScalarSizeInBits() >
TD->getPointerSizeInBits()) {
Value *P = Builder->CreateTrunc(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()), "tmp");
+ TD->getIntPtrType(CI.getContext()));
return new IntToPtrInst(P, CI.getType());
}
if (CI.getOperand(0)->getType()->getScalarSizeInBits() <
TD->getPointerSizeInBits()) {
Value *P = Builder->CreateZExt(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()), "tmp");
+ TD->getIntPtrType(CI.getContext()));
return new IntToPtrInst(P, CI.getType());
}
}
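// Editor's note: illustrative sketch of the normalization above, assuming a
// 32-bit pointer. An integer fed to inttoptr is first brought to pointer
// width: truncated if wider, zero-extended if narrower.
#include <cassert>
#include <cstdint>
int main() {
  uint64_t wide = 0x1122334455667788ULL;
  assert((uint32_t)wide == 0x55667788u);   // trunc to intptr
  uint16_t narrow = 0xBEEF;
  assert((uint32_t)narrow == 0x0000BEEFu); // zext to intptr
  return 0;
}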
@@ -1351,7 +1352,7 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
// Get the base pointer input of the bitcast, and the type it points to.
Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
- const Type *GEPIdxTy =
+ Type *GEPIdxTy =
cast<PointerType>(OrigBase->getType())->getElementType();
SmallVector<Value*, 8> NewIndices;
if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices)) {
@@ -1359,9 +1360,8 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
// and bitcast the result. This eliminates one bitcast, potentially
// two.
Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
- Builder->CreateInBoundsGEP(OrigBase,
- NewIndices.begin(), NewIndices.end()) :
- Builder->CreateGEP(OrigBase, NewIndices.begin(), NewIndices.end());
+ Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
+ Builder->CreateGEP(OrigBase, NewIndices);
NGEP->takeName(GEP);
if (isa<BitCastInst>(CI))
@@ -1382,14 +1382,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
if (TD) {
if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits()) {
Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()),
- "tmp");
+ TD->getIntPtrType(CI.getContext()));
return new TruncInst(P, CI.getType());
}
if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits()) {
Value *P = Builder->CreatePtrToInt(CI.getOperand(0),
- TD->getIntPtrType(CI.getContext()),
- "tmp");
+ TD->getIntPtrType(CI.getContext()));
return new ZExtInst(P, CI.getType());
}
}
@@ -1402,12 +1400,12 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
/// replace it with a shuffle (and vector/vector bitcast) if possible.
///
/// The source and destination vector types may have different element types.
-static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy,
+static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy,
InstCombiner &IC) {
// We can only do this optimization if the output is a multiple of the input
// element size, or the input is a multiple of the output element size.
// Convert the input type to have the same element type as the output.
- const VectorType *SrcTy = cast<VectorType>(InVal->getType());
+ VectorType *SrcTy = cast<VectorType>(InVal->getType());
if (SrcTy->getElementType() != DestTy->getElementType()) {
// The input types don't need to be identical, but for now they must be the
@@ -1427,7 +1425,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy,
// size of the input.
SmallVector<Constant*, 16> ShuffleMask;
Value *V2;
- const IntegerType *Int32Ty = Type::getInt32Ty(SrcTy->getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(SrcTy->getContext());
if (SrcTy->getNumElements() > DestTy->getNumElements()) {
// If we're shrinking the number of elements, just shuffle in the low
@@ -1453,11 +1451,11 @@ static Instruction *OptimizeVectorResize(Value *InVal, const VectorType *DestTy,
return new ShuffleVectorInst(InVal, V2, ConstantVector::get(ShuffleMask));
}
-static bool isMultipleOfTypeSize(unsigned Value, const Type *Ty) {
+static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
return Value % Ty->getPrimitiveSizeInBits() == 0;
}
-static unsigned getTypeSizeIndex(unsigned Value, const Type *Ty) {
+static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
return Value / Ty->getPrimitiveSizeInBits();
}
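// Editor's note: standalone illustration of the two helpers above. A bit
// offset inside a vector maps to an element index by dividing by the
// element width, and is a clean element boundary only when it is an exact
// multiple of that width.
#include <cassert>
static bool isMultipleOfSize(unsigned Bits, unsigned EltBits) {
  return Bits % EltBits == 0;
}
static unsigned sizeIndex(unsigned Bits, unsigned EltBits) {
  return Bits / EltBits;
}
int main() {
  assert(isMultipleOfSize(64, 32));  // bit 64 starts an i32 element...
  assert(sizeIndex(64, 32) == 2);    // ...namely element #2
  assert(!isMultipleOfSize(40, 32)); // bit 40 falls mid-element
  return 0;
}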
@@ -1471,7 +1469,7 @@ static unsigned getTypeSizeIndex(unsigned Value, const Type *Ty) {
/// filling in Elements with the elements found here.
static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
SmallVectorImpl<Value*> &Elements,
- const Type *VecEltTy) {
+ Type *VecEltTy) {
// Undef values never contribute useful bits to the result.
if (isa<UndefValue>(V)) return true;
@@ -1508,7 +1506,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
C->getType()->getPrimitiveSizeInBits()));
unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
- const Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
+ Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
@@ -1572,7 +1570,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
/// Into two insertelements that do "buildvector{%inc, %inc5}".
static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
InstCombiner &IC) {
- const VectorType *DestVecTy = cast<VectorType>(CI.getType());
+ VectorType *DestVecTy = cast<VectorType>(CI.getType());
Value *IntInput = CI.getOperand(0);
SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
@@ -1599,7 +1597,7 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
/// bitcast. The various long double bitcasts can't get in here.
static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
Value *Src = CI.getOperand(0);
- const Type *DestTy = CI.getType();
+ Type *DestTy = CI.getType();
// If this is a bitcast from int to float, check to see if the int is an
// extraction from a vector.
@@ -1607,7 +1605,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
// bitcast(trunc(bitcast(somevector)))
if (match(Src, m_Trunc(m_BitCast(m_Value(VecInput)))) &&
isa<VectorType>(VecInput->getType())) {
- const VectorType *VecTy = cast<VectorType>(VecInput->getType());
+ VectorType *VecTy = cast<VectorType>(VecInput->getType());
unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0) {
@@ -1628,7 +1626,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){
if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)),
m_ConstantInt(ShAmt)))) &&
isa<VectorType>(VecInput->getType())) {
- const VectorType *VecTy = cast<VectorType>(VecInput->getType());
+ VectorType *VecTy = cast<VectorType>(VecInput->getType());
unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
if (VecTy->getPrimitiveSizeInBits() % DestWidth == 0 &&
ShAmt->getZExtValue() % DestWidth == 0) {
@@ -1651,18 +1649,18 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// If the operands are integer typed then apply the integer transforms,
// otherwise just apply the common ones.
Value *Src = CI.getOperand(0);
- const Type *SrcTy = Src->getType();
- const Type *DestTy = CI.getType();
+ Type *SrcTy = Src->getType();
+ Type *DestTy = CI.getType();
// Get rid of casts from one type to the same type. These are useless and can
// be replaced by the operand.
if (DestTy == Src->getType())
return ReplaceInstUsesWith(CI, Src);
- if (const PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
- const PointerType *SrcPTy = cast<PointerType>(SrcTy);
- const Type *DstElTy = DstPTy->getElementType();
- const Type *SrcElTy = SrcPTy->getElementType();
+ if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
+ PointerType *SrcPTy = cast<PointerType>(SrcTy);
+ Type *DstElTy = DstPTy->getElementType();
+ Type *SrcElTy = SrcPTy->getElementType();
// If the address spaces don't match, don't eliminate the bitcast, which is
// required for changing types.
@@ -1693,7 +1691,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// If we found a path from the src to dest, create the getelementptr now.
if (SrcElTy == DstElTy) {
SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
- return GetElementPtrInst::CreateInBounds(Src, Idxs.begin(), Idxs.end());
+ return GetElementPtrInst::CreateInBounds(Src, Idxs);
}
}
@@ -1702,7 +1700,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this))
return I;
- if (const VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
+ if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {
Value *Elem = Builder->CreateBitCast(Src, DestVTy->getElementType());
return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
@@ -1731,7 +1729,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
}
}
- if (const VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
+ if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) {
Value *Elem =
Builder->CreateExtractElement(Src,
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index c78760b..bb1cbfa 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -13,6 +13,7 @@
#include "InstCombine.h"
#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Target/TargetData.h"
@@ -56,7 +57,7 @@ static bool AddWithOverflow(Constant *&Result, Constant *In1,
Constant *In2, bool IsSigned = false) {
Result = ConstantExpr::getAdd(In1, In2);
- if (const VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+ if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
if (HasAddOverflow(ExtractElement(Result, Idx),
@@ -78,7 +79,7 @@ static bool HasSubOverflow(ConstantInt *Result,
bool IsSigned) {
if (!IsSigned)
return Result->getValue().ugt(In1->getValue());
-
+
if (In2->isNegative())
return Result->getValue().slt(In1->getValue());
@@ -91,7 +92,7 @@ static bool SubWithOverflow(Constant *&Result, Constant *In1,
Constant *In2, bool IsSigned = false) {
Result = ConstantExpr::getSub(In1, In2);
- if (const VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
+ if (VectorType *VTy = dyn_cast<VectorType>(In1->getType())) {
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
Constant *Idx = ConstantInt::get(Type::getInt32Ty(In1->getContext()), i);
if (HasSubOverflow(ExtractElement(Result, Idx),
@@ -128,7 +129,7 @@ static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
// True if LHS u> RHS and RHS == high-bit-mask - 1
TrueIfSigned = true;
return RHS->isMaxValue(true);
- case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_UGE:
// True if LHS u>= RHS and RHS == high-bit-mask (2^7, 2^15, 2^31, etc)
TrueIfSigned = true;
return RHS->getValue().isSignBit();
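// Editor's note: exhaustive 8-bit check of the rewrites isSignBitCheck
// encodes (two's complement assumed): an unsigned compare against the
// high-bit mask is a sign test.
#include <cassert>
#include <cstdint>
int main() {
  for (unsigned v = 0; v < 256; ++v) {
    bool negative = (int8_t)v < 0;
    assert((v >= 0x80u) == negative); // X u>= 0x80  <=>  X s< 0
    assert((v > 0x7Fu) == negative);  // X u>  0x7F  <=>  X s< 0
  }
  return 0;
}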
@@ -143,7 +144,7 @@ static bool isHighOnes(const ConstantInt *CI) {
return (~CI->getValue() + 1).isPowerOf2();
}
-/// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a
+/// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a
/// set of known zero and one bits, compute the maximum and minimum values that
/// could have the specified known zero and known one bits, returning them in
/// min/max.
@@ -160,7 +161,7 @@ static void ComputeSignedMinMaxValuesFromKnownBits(const APInt& KnownZero,
// bit if it is unknown.
Min = KnownOne;
Max = KnownOne|UnknownBits;
-
+
if (UnknownBits.isNegative()) { // Sign bit is unknown
Min.setBit(Min.getBitWidth()-1);
Max.clearBit(Max.getBitWidth()-1);
@@ -179,7 +180,7 @@ static void ComputeUnsignedMinMaxValuesFromKnownBits(const APInt &KnownZero,
KnownZero.getBitWidth() == Max.getBitWidth() &&
"Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
APInt UnknownBits = ~(KnownZero|KnownOne);
-
+
// The minimum value is when the unknown bits are all zeros.
Min = KnownOne;
// The maximum value is when the unknown bits are all ones.
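// Editor's note: small standalone check of the unsigned rule above. With
// known-one bits K1 and known-zero bits K0, the unknown bits are
// U = ~(K0|K1); the minimum sets them all to 0, the maximum to 1.
#include <cassert>
#include <cstdint>
int main() {
  uint8_t K1 = 0x21, K0 = 0x84;
  uint8_t U = (uint8_t)~(K0 | K1);
  uint8_t Min = K1, Max = (uint8_t)(K1 | U);
  for (unsigned v = 0; v < 256; ++v)
    if ((v & K1) == K1 && (v & K0) == 0) // v consistent with the known bits
      assert(Min <= v && v <= Max);
  return 0;
}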
@@ -201,10 +202,10 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
CmpInst &ICI, ConstantInt *AndCst) {
// We need TD information to know the pointer size unless this is inbounds.
if (!GEP->isInBounds() && TD == 0) return 0;
-
+
ConstantArray *Init = dyn_cast<ConstantArray>(GV->getInitializer());
if (Init == 0 || Init->getNumOperands() > 1024) return 0;
-
+
 // There are many forms of this optimization we can handle; for now, just do
// the simple index into a single-dimensional array.
//
@@ -219,31 +220,31 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// type they index. Collect the indices. This is typically for arrays of
// structs.
SmallVector<unsigned, 4> LaterIndices;
-
- const Type *EltTy = cast<ArrayType>(Init->getType())->getElementType();
+
+ Type *EltTy = cast<ArrayType>(Init->getType())->getElementType();
for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
if (Idx == 0) return 0; // Variable index.
-
+
uint64_t IdxVal = Idx->getZExtValue();
if ((unsigned)IdxVal != IdxVal) return 0; // Too large array index.
-
- if (const StructType *STy = dyn_cast<StructType>(EltTy))
+
+ if (StructType *STy = dyn_cast<StructType>(EltTy))
EltTy = STy->getElementType(IdxVal);
- else if (const ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
if (IdxVal >= ATy->getNumElements()) return 0;
EltTy = ATy->getElementType();
} else {
return 0; // Unknown type.
}
-
+
LaterIndices.push_back(IdxVal);
}
-
+
enum { Overdefined = -3, Undefined = -2 };
// Variables for our state machines.
-
+
// FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
// "i == 47 | i == 87", where 47 is the first index the condition is true for,
// and 87 is the second (and last) index. FirstTrueElement is -2 when
@@ -254,7 +255,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
// form "i != 47 & i != 87". Same state transitions as for true elements.
int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
-
+
/// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
/// define a state machine that triggers for ranges of values that the index
/// is true or false for. This triggers on things like "abbbbc"[i] == 'b'.
@@ -262,25 +263,25 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
/// index in the range (inclusive). We use -2 for undefined here because we
/// use relative comparisons and don't want 0-1 to match -1.
int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
-
+
// MagicBitvector - This is a magic bitvector where we set a bit if the
// comparison is true for element 'i'. If there are 64 elements or less in
// the array, this will fully represent all the comparison results.
uint64_t MagicBitvector = 0;
-
-
+
+
// Scan the array and see if one of our patterns matches.
Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
for (unsigned i = 0, e = Init->getNumOperands(); i != e; ++i) {
Constant *Elt = Init->getOperand(i);
-
+
// If this is indexing an array of structures, get the structure element.
if (!LaterIndices.empty())
Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
-
+
// If the element is masked, handle it.
if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
-
+
// Find out if the comparison would be true or false for the i'th element.
Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
CompareRHS, TD);
@@ -294,15 +295,15 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
FalseRangeEnd = i;
continue;
}
-
+
// If we can't compute the result for any of the elements, we have to give
// up evaluating the entire conditional.
if (!isa<ConstantInt>(C)) return 0;
-
+
// Otherwise, we know if the comparison is true or false for this element,
// update our state machines.
bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
-
+
// State machine for single/double/range index comparison.
if (IsTrueForElt) {
// Update the TrueElement state machine.
@@ -314,7 +315,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
SecondTrueElement = i;
else
SecondTrueElement = Overdefined;
-
+
// Update range state machine.
if (TrueRangeEnd == (int)i-1)
TrueRangeEnd = i;
@@ -331,7 +332,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
SecondFalseElement = i;
else
SecondFalseElement = Overdefined;
-
+
// Update range state machine.
if (FalseRangeEnd == (int)i-1)
FalseRangeEnd = i;
@@ -339,12 +340,12 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
FalseRangeEnd = Overdefined;
}
}
-
-
+
+
// If this element is in range, update our magic bitvector.
if (i < 64 && IsTrueForElt)
MagicBitvector |= 1ULL << i;
-
+
// If all of our states become overdefined, bail out early. Since the
// predicate is expensive, only check it every 8 elements. This is only
 // really useful for very large arrays.
@@ -364,20 +365,20 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
if (!GEP->isInBounds() &&
Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits())
Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext()));
-
+
// If the comparison is only true for one or two elements, emit direct
// comparisons.
if (SecondTrueElement != Overdefined) {
// None true -> false.
if (FirstTrueElement == Undefined)
return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext()));
-
+
Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
-
+
// True for one element -> 'i == 47'.
if (SecondTrueElement == Undefined)
return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
-
+
// True for two elements -> 'i == 47 | i == 72'.
Value *C1 = Builder->CreateICmpEQ(Idx, FirstTrueIdx);
Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
@@ -391,36 +392,36 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
// None false -> true.
if (FirstFalseElement == Undefined)
return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext()));
-
+
Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
// False for one element -> 'i != 47'.
if (SecondFalseElement == Undefined)
return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
-
+
// False for two elements -> 'i != 47 & i != 72'.
Value *C1 = Builder->CreateICmpNE(Idx, FirstFalseIdx);
Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
Value *C2 = Builder->CreateICmpNE(Idx, SecondFalseIdx);
return BinaryOperator::CreateAnd(C1, C2);
}
-
+
// If the comparison can be replaced with a range comparison for the elements
// where it is true, emit the range check.
if (TrueRangeEnd != Overdefined) {
assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
-
+
// Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
if (FirstTrueElement) {
Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
Idx = Builder->CreateAdd(Idx, Offs);
}
-
+
Value *End = ConstantInt::get(Idx->getType(),
TrueRangeEnd-FirstTrueElement+1);
return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
}
-
+
// False range check.
if (FalseRangeEnd != Overdefined) {
assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
@@ -429,19 +430,19 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
Idx = Builder->CreateAdd(Idx, Offs);
}
-
+
Value *End = ConstantInt::get(Idx->getType(),
FalseRangeEnd-FirstFalseElement);
return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
}
-
-
+
+
// If a 32-bit or 64-bit magic bitvector captures the entire comparison state
// of this load, replace it with computation that does:
// ((magic_cst >> i) & 1) != 0
if (Init->getNumOperands() <= 32 ||
(TD && Init->getNumOperands() <= 64 && TD->isLegalInteger(64))) {
- const Type *Ty;
+ Type *Ty;
if (Init->getNumOperands() <= 32)
Ty = Type::getInt32Ty(Init->getContext());
else
@@ -451,7 +452,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V);
return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
}
-
+
return 0;
}
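// Editor's note: standalone sketch of the "magic bitvector" fallback above,
// under assumed data. For a small constant array, precompute one bit per
// element recording whether Init[i] matches; the load+compare then collapses
// to ((magic >> i) & 1) != 0.
#include <cassert>
#include <cstdint>
int main() {
  const int Init[8] = {3, 7, 7, 2, 9, 7, 0, 7};
  uint64_t magic = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (Init[i] == 7)                 // the comparison being folded
      magic |= 1ULL << i;
  for (unsigned i = 0; i != 8; ++i)
    assert(((magic >> i) & 1) == (unsigned)(Init[i] == 7));
  return 0;
}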
@@ -465,11 +466,11 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
/// to generate the first by knowing that pointer arithmetic doesn't overflow.
///
/// If we can't emit an optimized form for this expression, this returns null.
-///
+///
static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
TargetData &TD = *IC.getTargetData();
gep_type_iterator GTI = gep_type_begin(GEP);
-
+
// Check to see if this gep only has a single variable index. If so, and if
// any constant indices are a multiple of its scale, then we can compute this
// in terms of the scale of the variable index. For example, if the GEP
@@ -481,9 +482,9 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
// Compute the aggregate offset of constant indices.
if (CI->isZero()) continue;
-
+
// Handle a struct index, which adds its field offset to the pointer.
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
} else {
uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
@@ -494,33 +495,33 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
break;
}
}
-
+
// If there are no variable indices, we must have a constant offset, just
// evaluate it the general way.
if (i == e) return 0;
-
+
Value *VariableIdx = GEP->getOperand(i);
// Determine the scale factor of the variable element. For example, this is
// 4 if the variable index is into an array of i32.
uint64_t VariableScale = TD.getTypeAllocSize(GTI.getIndexedType());
-
+
// Verify that there are no other variable indices. If so, emit the hard way.
for (++i, ++GTI; i != e; ++i, ++GTI) {
ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
if (!CI) return 0;
-
+
// Compute the aggregate offset of constant indices.
if (CI->isZero()) continue;
-
+
// Handle a struct index, which adds its field offset to the pointer.
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
Offset += TD.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
} else {
uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType());
Offset += Size*CI->getSExtValue();
}
}
-
+
// Okay, we know we have a single variable index, which must be a
// pointer/array/vector index. If there is no offset, life is simple, return
// the index.
@@ -530,19 +531,19 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
// we don't need to bother extending: the extension won't affect where the
// computation crosses zero.
if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
- const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
+ Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
}
return VariableIdx;
}
-
+
// Otherwise, there is an index. The computation we will do will be modulo
// the pointer size, so get it.
uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
-
+
Offset &= PtrSizeMask;
VariableScale &= PtrSizeMask;
-
+
// To do this transformation, any constant index must be a multiple of the
// variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
// but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
@@ -550,9 +551,9 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
int64_t NewOffs = Offset / (int64_t)VariableScale;
if (Offset != NewOffs*(int64_t)VariableScale)
return 0;
-
+
// Okay, we can do this evaluation. Start by converting the index to intptr.
- const Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
+ Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
if (VariableIdx->getType() != IntPtrTy)
VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
true /*Signed*/);
@@ -576,7 +577,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// know pointers can't overflow since the gep is inbounds. See if we can
// output an optimized form.
Value *Offset = EvaluateGEPOffsetExpression(GEPLHS, *this);
-
+
// If not, synthesize the offset the hard way.
if (Offset == 0)
Offset = EmitGEPOffset(GEPLHS);
@@ -686,7 +687,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
bool isTrue = ICmpInst::isTrueWhenEqual(Pred);
return ReplaceInstUsesWith(ICI, ConstantInt::get(ICI.getType(), isTrue));
}
-
+
// (X+4) == X -> false.
if (Pred == ICmpInst::ICMP_EQ)
return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext()));
@@ -698,22 +699,22 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
// From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
// so the values can never be equal. Similarly for all other "or equals"
// operators.
-
+
// (X+1) <u X --> X >u (MAXUINT-1) --> X == 255
// (X+2) <u X --> X >u (MAXUINT-2) --> X > 253
// (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0
if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
- Value *R =
+ Value *R =
ConstantExpr::getSub(ConstantInt::getAllOnesValue(CI->getType()), CI);
return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
}
-
+
// (X+1) >u X --> X <u (0-1) --> X != 255
// (X+2) >u X --> X <u (0-2) --> X <u 254
// (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0
if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantExpr::getNeg(CI));
-
+
unsigned BitWidth = CI->getType()->getPrimitiveSizeInBits();
ConstantInt *SMax = ConstantInt::get(X->getContext(),
APInt::getSignedMaxValue(BitWidth));
@@ -726,14 +727,14 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
// (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127
if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
return new ICmpInst(ICmpInst::ICMP_SGT, X, ConstantExpr::getSub(SMax, CI));
-
+
// (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127
// (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126
// (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
// (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
// (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126
// (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128
-
+
assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1);
return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
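// Editor's note: exhaustive 8-bit check of two of the (X+C) pred X folds
// tabulated in the comments above.
#include <cassert>
#include <cstdint>
int main() {
  for (unsigned x = 0; x < 256; ++x) {
    uint8_t X = (uint8_t)x;
    // (X+1) <u X  <=>  X >u MAXUINT-1  <=>  X == 255
    assert(((uint8_t)(X + 1) < X) == (X == 255));
    // (X+1) >u X  <=>  X <u 0-1       <=>  X != 255
    assert(((uint8_t)(X + 1) > X) == (X != 255));
  }
  return 0;
}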
@@ -745,14 +746,14 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
ConstantInt *DivRHS) {
ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
const APInt &CmpRHSV = CmpRHS->getValue();
-
- // FIXME: If the operand types don't match the type of the divide
+
+ // FIXME: If the operand types don't match the type of the divide
// then don't attempt this transform. The code below doesn't have the
// logic to deal with a signed divide and an unsigned compare (and
- // vice versa). This is because (x /s C1) <s C2 produces different
+ // vice versa). This is because (x /s C1) <s C2 produces different
// results than (x /s C1) <u C2 or (x /u C1) <s C2 or even
- // (x /u C1) <u C2. Simply casting the operands and result won't
- // work. :( The if statement below tests that condition and bails
+ // (x /u C1) <u C2. Simply casting the operands and result won't
+ // work. :( The if statement below tests that condition and bails
// if it finds it.
bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
if (!ICI.isEquality() && DivIsSigned != ICI.isSigned())
@@ -768,14 +769,14 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
}
// Compute Prod = CI * DivRHS. We are essentially solving an equation
- // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and
- // C2 (CI). By solving for X we can turn this into a range check
- // instead of computing a divide.
+ // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and
+ // C2 (CI). By solving for X we can turn this into a range check
+ // instead of computing a divide.
Constant *Prod = ConstantExpr::getMul(CmpRHS, DivRHS);
// Determine if the product overflows by seeing if the product is
// not equal to the divide. Make sure we do the same kind of divide
- // as in the LHS instruction that we're folding.
+ // as in the LHS instruction that we're folding.
bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) :
ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;
@@ -785,9 +786,9 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
/// If the division is known to be exact, then there is no remainder from the
 /// divide, so the covered range size is one; otherwise it is the divisor.
ConstantInt *RangeSize = DivI->isExact() ? getOne(Prod) : DivRHS;
-
+
// Figure out the interval that is being checked. For example, a comparison
- // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
+ // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
// Compute this interval based on the constants involved and the signedness of
// the compare/divide. This computes a half-open interval, keeping track of
// whether either value in the interval overflows. After analysis each
@@ -805,7 +806,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
// to the same result value.
HiOverflow = AddWithOverflow(HiBound, LoBound, RangeSize, false);
}
-
+
} else if (DivRHS->getValue().isStrictlyPositive()) { // Divisor is > 0.
if (CmpRHSV == 0) { // (X / pos) op 0
// Can't overflow. e.g. X/2 op 0 --> [-1, 2)
@@ -848,7 +849,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
if (!HiOverflow)
HiOverflow = SubWithOverflow(HiBound, Prod, RangeSize, true);
}
-
+
// Dividing by a negative swaps the condition. LT <-> GT
Pred = ICmpInst::getSwappedPredicate(Pred);
}
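// Editor's note: exhaustive 8-bit check of the range-check idea above for an
// unsigned divide: "X /u 5 == 2" holds exactly when X lies in [10, 15), and
// the half-open interval test is one subtract plus one unsigned compare.
#include <cassert>
int main() {
  for (unsigned x = 0; x < 256; ++x)
    assert((x / 5 == 2) == (x - 10u < 5u)); // (X - LoBound) <u RangeSize
  return 0;
}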
@@ -901,7 +902,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
ConstantInt *ShAmt) {
const APInt &CmpRHSV = cast<ConstantInt>(ICI.getOperand(1))->getValue();
-
+
// Check that the shift amount is in range. If not, don't perform
// undefined shifts. When the shift is visited it will be
// simplified.
@@ -909,48 +910,48 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
if (ShAmtVal >= TypeBits || ShAmtVal == 0)
return 0;
-
+
if (!ICI.isEquality()) {
// If we have an unsigned comparison and an ashr, we can't simplify this.
// Similarly for signed comparisons with lshr.
if (ICI.isSigned() != (Shr->getOpcode() == Instruction::AShr))
return 0;
-
+
// Otherwise, all lshr and most exact ashr's are equivalent to a udiv/sdiv
// by a power of 2. Since we already have logic to simplify these,
// transform to div and then simplify the resultant comparison.
if (Shr->getOpcode() == Instruction::AShr &&
(!Shr->isExact() || ShAmtVal == TypeBits - 1))
return 0;
-
+
// Revisit the shift (to delete it).
Worklist.Add(Shr);
-
+
Constant *DivCst =
ConstantInt::get(Shr->getType(), APInt::getOneBitSet(TypeBits, ShAmtVal));
-
+
Value *Tmp =
Shr->getOpcode() == Instruction::AShr ?
Builder->CreateSDiv(Shr->getOperand(0), DivCst, "", Shr->isExact()) :
Builder->CreateUDiv(Shr->getOperand(0), DivCst, "", Shr->isExact());
-
+
ICI.setOperand(0, Tmp);
-
+
// If the builder folded the binop, just return it.
BinaryOperator *TheDiv = dyn_cast<BinaryOperator>(Tmp);
if (TheDiv == 0)
return &ICI;
-
+
// Otherwise, fold this div/compare.
assert(TheDiv->getOpcode() == Instruction::SDiv ||
TheDiv->getOpcode() == Instruction::UDiv);
-
+
Instruction *Res = FoldICmpDivCst(ICI, TheDiv, cast<ConstantInt>(DivCst));
assert(Res && "This div/cst should have folded!");
return Res;
}
-
-
+
+
// If we are comparing against bits always shifted out, the
// comparison cannot succeed.
APInt Comp = CmpRHSV << ShAmtVal;
@@ -959,25 +960,25 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
Comp = Comp.lshr(ShAmtVal);
else
Comp = Comp.ashr(ShAmtVal);
-
+
if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero.
bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
IsICMP_NE);
return ReplaceInstUsesWith(ICI, Cst);
}
-
+
// Otherwise, check to see if the bits shifted out are known to be zero.
// If so, we can compare against the unshifted value:
// (X & 4) >> 1 == 2 --> (X & 4) == 4.
if (Shr->hasOneUse() && Shr->isExact())
return new ICmpInst(ICI.getPredicate(), Shr->getOperand(0), ShiftedCmpRHS);
-
+
if (Shr->hasOneUse()) {
// Otherwise strength reduce the shift into an and.
APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
Constant *Mask = ConstantInt::get(ICI.getContext(), Val);
-
+
Value *And = Builder->CreateAnd(Shr->getOperand(0),
Mask, Shr->getName()+".mask");
return new ICmpInst(ICI.getPredicate(), And, ShiftedCmpRHS);
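// Editor's note: exhaustive 8-bit check of the shift-into-and strength
// reduction above: (X >>u 2) == C matches (X & ~3) == (C << 2) whenever
// C << 2 still fits in the type.
#include <cassert>
#include <cstdint>
int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned c = 0; c < 64; ++c) {
      bool viaShift = ((uint8_t)x >> 2) == c;
      bool viaMask = (x & 0xFCu) == (c << 2);
      assert(viaShift == viaMask);
    }
  return 0;
}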
@@ -992,7 +993,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
Instruction *LHSI,
ConstantInt *RHS) {
const APInt &RHSV = RHS->getValue();
-
+
switch (LHSI->getOpcode()) {
case Instruction::Trunc:
if (ICI.isEquality() && LHSI->hasOneUse()) {
@@ -1003,7 +1004,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
APInt Mask(APInt::getHighBitsSet(SrcBits, SrcBits-DstBits));
APInt KnownZero(SrcBits, 0), KnownOne(SrcBits, 0);
ComputeMaskedBits(LHSI->getOperand(0), Mask, KnownZero, KnownOne);
-
+
// If all the high bits are known, we can do this xform.
if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) {
// Pull in the high bits from known-ones set.
@@ -1014,7 +1015,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
}
break;
-
+
case Instruction::Xor: // (icmp pred (xor X, XorCST), CI)
if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
// If this is a comparison that tests the signbit (X < 0) or (x > -1),
@@ -1022,7 +1023,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
if ((ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0) ||
(ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue())) {
Value *CompareVal = LHSI->getOperand(0);
-
+
// If the sign bit of the XorCST is not set, there is no change to
// the operation, just stop using the Xor.
if (!XorCST->isNegative()) {
@@ -1030,13 +1031,13 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
Worklist.Add(LHSI);
return &ICI;
}
-
+
// Was the old condition true if the operand is positive?
bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT;
-
+
// If so, the new one isn't.
isTrueIfPositive ^= true;
-
+
if (isTrueIfPositive)
return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal,
SubOne(RHS));
@@ -1075,13 +1076,13 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) &&
LHSI->getOperand(0)->hasOneUse()) {
ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));
-
+
// If the LHS is an AND of a truncating cast, we can widen the
// and/compare to be the input width without changing the value
// produced, eliminating a cast.
if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) {
// We can do this transformation if either the AND constant does not
- // have its sign bit set or if it is an equality comparison.
+ // have its sign bit set or if it is an equality comparison.
// Extending a relational comparison when we're checking the sign
// bit would not work.
if (ICI.isEquality() ||
@@ -1098,7 +1099,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
// If the LHS is an AND of a zext, and we have an equality compare, we can
// shrink the and/compare to the smaller type, eliminating the cast.
if (ZExtInst *Cast = dyn_cast<ZExtInst>(LHSI->getOperand(0))) {
- const IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy());
+ IntegerType *Ty = cast<IntegerType>(Cast->getSrcTy());
// Make sure we don't compare the upper bits, SimplifyDemandedBits
// should fold the icmp to true/false in that case.
if (ICI.isEquality() && RHSV.getActiveBits() <= Ty->getBitWidth()) {
@@ -1118,12 +1119,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0));
if (Shift && !Shift->isShift())
Shift = 0;
-
+
ConstantInt *ShAmt;
ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
- const Type *Ty = Shift ? Shift->getType() : 0; // Type of the shift.
- const Type *AndTy = AndCST->getType(); // Type of the and.
-
+ Type *Ty = Shift ? Shift->getType() : 0; // Type of the shift.
+ Type *AndTy = AndCST->getType(); // Type of the and.
+
// We can fold this as long as we can't shift unknown bits
// into the mask. This can only happen with signed shift
// rights, as they sign-extend.
@@ -1134,20 +1135,20 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
// of the bits shifted in could be tested after the mask.
uint32_t TyBits = Ty->getPrimitiveSizeInBits();
int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits);
-
+
uint32_t BitWidth = AndTy->getPrimitiveSizeInBits();
- if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
+ if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
AndCST->getValue()) == 0)
CanFold = true;
}
-
+
if (CanFold) {
Constant *NewCst;
if (Shift->getOpcode() == Instruction::Shl)
NewCst = ConstantExpr::getLShr(RHS, ShAmt);
else
NewCst = ConstantExpr::getShl(RHS, ShAmt);
-
+
// Check to see if we are shifting out any of the bits being
// compared.
if (ConstantExpr::get(Shift->getOpcode(),
@@ -1175,7 +1176,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
}
}
-
+
 // Turn ((X >> Y) & C) == 0 into (X & (C << Y)) == 0. The latter is
// preferable because it allows the C<<Y expression to be hoisted out
// of a loop if Y is invariant and X is not.
@@ -1185,21 +1186,21 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
// Compute C << Y.
Value *NS;
if (Shift->getOpcode() == Instruction::LShr) {
- NS = Builder->CreateShl(AndCST, Shift->getOperand(1), "tmp");
+ NS = Builder->CreateShl(AndCST, Shift->getOperand(1));
} else {
// Insert a logical shift.
- NS = Builder->CreateLShr(AndCST, Shift->getOperand(1), "tmp");
+ NS = Builder->CreateLShr(AndCST, Shift->getOperand(1));
}
-
+
// Compute X & (C << Y).
- Value *NewAnd =
+ Value *NewAnd =
Builder->CreateAnd(Shift->getOperand(0), NS, LHSI->getName());
-
+
ICI.setOperand(0, NewAnd);
return &ICI;
}
}
-
+
// Try to optimize things like "A[i]&42 == 0" to index computations.
if (LoadInst *LI = dyn_cast<LoadInst>(LHSI->getOperand(0))) {
if (GetElementPtrInst *GEP =
@@ -1234,19 +1235,19 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
break;
}
-
+
case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI)
ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
if (!ShAmt) break;
-
+
uint32_t TypeBits = RHSV.getBitWidth();
-
+
// Check that the shift amount is in range. If not, don't perform
// undefined shifts. When the shift is visited it will be
// simplified.
if (ShAmt->uge(TypeBits))
break;
-
+
if (ICI.isEquality()) {
// If we are comparing against bits always shifted out, the
// comparison cannot succeed.
@@ -1259,34 +1260,34 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE);
return ReplaceInstUsesWith(ICI, Cst);
}
-
+
// If the shift is NUW, then it is just shifting out zeros, no need for an
// AND.
if (cast<BinaryOperator>(LHSI)->hasNoUnsignedWrap())
return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
ConstantExpr::getLShr(RHS, ShAmt));
-
+
if (LHSI->hasOneUse()) {
// Otherwise strength reduce the shift into an and.
uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
Constant *Mask =
- ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits,
+ ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits,
TypeBits-ShAmtVal));
-
+
Value *And =
Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask");
return new ICmpInst(ICI.getPredicate(), And,
ConstantExpr::getLShr(RHS, ShAmt));
}
}
-
+
// Otherwise, if this is a comparison of the sign bit, simplify to and/test.
bool TrueIfSigned = false;
if (LHSI->hasOneUse() &&
isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) {
// (X << 31) <s 0 --> (X&1) != 0
Constant *Mask = ConstantInt::get(LHSI->getOperand(0)->getType(),
- APInt::getOneBitSet(TypeBits,
+ APInt::getOneBitSet(TypeBits,
TypeBits-ShAmt->getZExtValue()-1));
Value *And =
Builder->CreateAnd(LHSI->getOperand(0), Mask, LHSI->getName()+".mask");
@@ -1295,7 +1296,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
break;
}
-
+
case Instruction::LShr: // (icmp pred (shr X, ShAmt), CI)
case Instruction::AShr: {
// Handle equality comparisons of shift-by-constant.
@@ -1312,13 +1313,13 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
break;
}
-
+
case Instruction::SDiv:
case Instruction::UDiv:
// Fold: icmp pred ([us]div X, C1), C2 -> range test
- // Fold this div into the comparison, producing a range check.
- // Determine, based on the divide type, what the range is being
- // checked. If there is an overflow on the low or high side, remember
+ // Fold this div into the comparison, producing a range check.
+ // Determine, based on the divide type, what the range is being
+ // checked. If there is an overflow on the low or high side, remember
// it, otherwise compute the range [low, hi) bounding the new value.
// See: InsertRangeTest above for the kinds of replacements possible.
if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
@@ -1357,12 +1358,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
}
break;
}
-
+
// Simplify icmp_eq and icmp_ne instructions with integer constant RHS.
if (ICI.isEquality()) {
bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
-
- // If the first operand is (add|sub|and|or|xor|rem) with a constant, and
+
+ // If the first operand is (add|sub|and|or|xor|rem) with a constant, and
// the second operand is a constant, simplify a bit.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) {
switch (BO->getOpcode()) {
@@ -1389,7 +1390,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
// Replace ((add A, B) != 0) with (A != -B) if A or B is
// efficiently invertible, or if the add has just this one use.
Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
-
+
if (Value *NegVal = dyn_castNegVal(BOp1))
return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
if (Value *NegVal = dyn_castNegVal(BOp0))
@@ -1432,11 +1433,11 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
Constant *NotCI = ConstantExpr::getNot(RHS);
if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
return ReplaceInstUsesWith(ICI,
- ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
+ ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
isICMP_NE));
}
break;
-
+
case Instruction::And:
if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
// If bits are being compared against that are and'd out, then the
@@ -1445,7 +1446,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
return ReplaceInstUsesWith(ICI,
ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
isICMP_NE));
-
+
// If we have ((X & C) == C), turn it into ((X & C) != 0).
if (RHS == BOC && RHSV.isPowerOf2())
return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ :
@@ -1460,16 +1461,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
if (BOC->getValue().isSignBit()) {
Value *X = BO->getOperand(0);
Constant *Zero = Constant::getNullValue(X->getType());
- ICmpInst::Predicate pred = isICMP_NE ?
+ ICmpInst::Predicate pred = isICMP_NE ?
ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
return new ICmpInst(pred, X, Zero);
}
-
+
// ((X & ~7) == 0) --> X < 8
if (RHSV == 0 && isHighOnes(BOC)) {
Value *X = BO->getOperand(0);
Constant *NegX = ConstantExpr::getNeg(BOC);
- ICmpInst::Predicate pred = isICMP_NE ?
+ ICmpInst::Predicate pred = isICMP_NE ?
ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
return new ICmpInst(pred, X, NegX);
}
@@ -1517,11 +1518,11 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0));
Value *LHSCIOp = LHSCI->getOperand(0);
- const Type *SrcTy = LHSCIOp->getType();
- const Type *DestTy = LHSCI->getType();
+ Type *SrcTy = LHSCIOp->getType();
+ Type *DestTy = LHSCI->getType();
Value *RHSCIOp;
- // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
+ // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
// integer type is the same size as the pointer type.
if (TD && LHSCI->getOpcode() == Instruction::PtrToInt &&
TD->getPointerSizeInBits() ==
@@ -1539,7 +1540,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
if (RHSOp)
return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp);
}
-
+
// The code below only handles extension cast instructions, so far.
// Enforce this.
if (LHSCI->getOpcode() != Instruction::ZExt &&
@@ -1552,9 +1553,9 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) {
// Not an extension from the same type?
RHSCIOp = CI->getOperand(0);
- if (RHSCIOp->getType() != LHSCIOp->getType())
+ if (RHSCIOp->getType() != LHSCIOp->getType())
return 0;
-
+
// If the signedness of the two casts doesn't agree (i.e. one is a sext
// and the other is a zext), then we can't handle this.
if (CI->getOpcode() != LHSCI->getOpcode())
@@ -1599,7 +1600,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
return new ICmpInst(ICI.getUnsignedPredicate(), LHSCIOp, Res1);
}
- // The re-extended constant changed so the constant cannot be represented
+ // The re-extended constant changed so the constant cannot be represented
// in the shorter type. Consequently, we cannot emit a simple comparison.
// All the cases that fold to true or false will have already been handled
// by SimplifyICmpInst, so only deal with the tricky case.
@@ -1637,26 +1638,26 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
// llvm.sadd.with.overflow. To do this, we have to replace the original add
// with a narrower add, and discard the add-with-constant that is part of the
// range check (if we can't eliminate it, this isn't profitable).
-
+
// In order to eliminate the add-with-constant, the compare can be its only
// use.
Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
if (!AddWithCst->hasOneUse()) return 0;
-
+
// If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
if (!CI2->getValue().isPowerOf2()) return 0;
unsigned NewWidth = CI2->getValue().countTrailingZeros();
if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31) return 0;
-
+
// The width of the new add formed is 1 more than the bias.
++NewWidth;
-
+
// Check to see that CI1 is an all-ones value with NewWidth bits.
if (CI1->getBitWidth() == NewWidth ||
CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
return 0;
-
- // In order to replace the original add with a narrower
+
+ // In order to replace the original add with a narrower
// llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
// and truncates that discard the high bits of the add. Verify that this is
// the case.
@@ -1664,7 +1665,7 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
for (Value::use_iterator UI = OrigAdd->use_begin(), E = OrigAdd->use_end();
UI != E; ++UI) {
if (*UI == AddWithCst) continue;
-
+
// Only accept truncates for now. We would really like a nice recursive
// predicate like SimplifyDemandedBits, but which goes downwards the use-def
// chain to see which bits of a value are actually demanded. If the
@@ -1674,32 +1675,32 @@ static Instruction *ProcessUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
if (TI == 0 ||
TI->getType()->getPrimitiveSizeInBits() > NewWidth) return 0;
}
-
+
// If the pattern matches, truncate the inputs to the narrower type and
// use the sadd_with_overflow intrinsic to efficiently compute both the
// result and the overflow bit.
Module *M = I.getParent()->getParent()->getParent();
-
+
Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
Value *F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow,
NewType);
InstCombiner::BuilderTy *Builder = IC.Builder;
-
+
// Put the new code above the original add, in case there are any uses of the
// add between the add and the compare.
Builder->SetInsertPoint(OrigAdd);
-
+
Value *TruncA = Builder->CreateTrunc(A, NewType, A->getName()+".trunc");
Value *TruncB = Builder->CreateTrunc(B, NewType, B->getName()+".trunc");
CallInst *Call = Builder->CreateCall2(F, TruncA, TruncB, "sadd");
Value *Add = Builder->CreateExtractValue(Call, 0, "sadd.result");
Value *ZExt = Builder->CreateZExt(Add, OrigAdd->getType());
-
+
// The inner add was the result of the narrow add, zero extended to the
// wider type. Replace it with the result computed by the intrinsic.
IC.ReplaceInstUsesWith(*OrigAdd, ZExt);
-
+
// The original icmp gets replaced with the overflow value.
return ExtractValueInst::Create(Call, 1, "sadd.overflow");
}
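
The recognizer above keys off exact magic constants: for a narrow width w, CI2 must be the bias 2^(w-1) and CI1 the range bound 2^w - 1. A standalone check that the "sum+128 >u 255" form really is 8-bit signed-overflow detection; plain C++, illustrative only:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (int a = -128; a <= 127; ++a)
        for (int b = -128; b <= 127; ++b) {
          int32_t sum = a + b;                      // add done in the wider type
          bool idiom = uint32_t(sum + 128) > 255u;  // the compared-with-constant form
          bool overflow = sum < -128 || sum > 127;  // what sadd.with.overflow reports
          assert(idiom == overflow);
        }
    }
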
@@ -1709,13 +1710,13 @@ static Instruction *ProcessUAddIdiom(Instruction &I, Value *OrigAddV,
// Don't bother doing this transformation for pointers, don't do it for
// vectors.
if (!isa<IntegerType>(OrigAddV->getType())) return 0;
-
+
// If the add is a constant expr, then we don't bother transforming it.
Instruction *OrigAdd = dyn_cast<Instruction>(OrigAddV);
if (OrigAdd == 0) return 0;
-
+
Value *LHS = OrigAdd->getOperand(0), *RHS = OrigAdd->getOperand(1);
-
+
// Put the new code above the original add, in case there are any uses of the
// add between the add and the compare.
InstCombiner::BuilderTy *Builder = IC.Builder;
@@ -1740,13 +1741,13 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,
unsigned BitWidth, bool isSignCheck) {
if (isSignCheck)
return APInt::getSignBit(BitWidth);
-
+
ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(1));
if (!CI) return APInt::getAllOnesValue(BitWidth);
const APInt &RHS = CI->getValue();
-
+
switch (I.getPredicate()) {
- // For a UGT comparison, we don't care about any bits that
+ // For a UGT comparison, we don't care about any bits that
// correspond to the trailing ones of the comparand. The value of these
// bits doesn't impact the outcome of the comparison, because any value
// greater than the RHS must differ in a bit higher than these due to carry.
@@ -1755,7 +1756,7 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,
APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingOnes);
return ~lowBitsSet;
}
-
+
// Similarly, for a ULT comparison, we don't care about the trailing zeros.
// Any value less than the RHS must differ in a higher bit because of carries.
case ICmpInst::ICMP_ULT: {
@@ -1763,17 +1764,17 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,
APInt lowBitsSet = APInt::getLowBitsSet(BitWidth, trailingZeros);
return ~lowBitsSet;
}
-
+
default:
return APInt::getAllOnesValue(BitWidth);
}
-
+
}
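
DemandedBitsLHSMask's ICMP_UGT case above relies on x >u r being decided entirely above the trailing-ones region of r (the ULT case is the mirror image with trailing zeros). A quick exhaustive check of that claim on 8-bit values; standalone sketch, not LLVM code:

    #include <cassert>

    int main() {
      for (unsigned r = 0; r < 256; ++r) {
        unsigned t = 0;                      // number of trailing one bits in r
        while (t < 8 && ((r >> t) & 1)) ++t;
        for (unsigned x = 0; x < 256; ++x) {
          bool full = x > r;
          bool high = (x >> t) > (r >> t);   // only the demanded high bits
          assert(full == high);
        }
      }
    }
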

Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
bool Changed = false;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
+
/// Orders the operands of the compare so that they are listed from most
/// complex to least complex. This puts constants before unary operators,
/// before binary operators.
@@ -1782,11 +1783,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
std::swap(Op0, Op1);
Changed = true;
}
-
+
if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, TD))
return ReplaceInstUsesWith(I, V);
-
- const Type *Ty = Op0->getType();
+
+ Type *Ty = Op0->getType();
// icmp's with boolean values can always be turned into bitwise operations
if (Ty->isIntegerTy(1)) {
@@ -1835,13 +1836,13 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
BitWidth = Ty->getScalarSizeInBits();
else if (TD) // Pointers require TD info to get their size.
BitWidth = TD->getTypeSizeInBits(Ty->getScalarType());
-
+
bool isSignBit = false;
// See if we are doing a comparison with a constant.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
Value *A = 0, *B = 0;
-
+
// Match the following pattern, which is a common idiom when writing
// overflow-safe integer arithmetic function. The source performs an
// addition in wider type, and explicitly checks for overflow using
@@ -1849,9 +1850,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// sadd_with_overflow intrinsic.
//
// TODO: This could probably be generalized to handle other overflow-safe
- // operations if we worked out the formulas to compute the appropriate
+ // operations if we worked out the formulas to compute the appropriate
// magic constants.
- //
+ //
// sum = a + b
// if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
{
@@ -1861,14 +1862,14 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = ProcessUGT_ADDCST_ADD(I, A, B, CI2, CI, *this))
return Res;
}
-
+
// (icmp ne/eq (sub A B) 0) -> (icmp ne/eq A, B)
if (I.isEquality() && CI->isZero() &&
match(Op0, m_Sub(m_Value(A), m_Value(B)))) {
// (icmp cond A B) if cond is equality
return new ICmpInst(I.getPredicate(), A, B);
}
-
+
// If we have an icmp le or icmp ge instruction, turn it into the
// appropriate icmp lt or icmp gt instruction. This allows us to rely on
// them being folded in the code below. The SimplifyICmpInst code has
@@ -1892,7 +1893,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
ConstantInt::get(CI->getContext(), CI->getValue()-1));
}
-
+
// If this comparison is a normal comparison, it demands all
// bits, if it is a sign bit comparison, it only demands the sign bit.
bool UnusedBit;
@@ -1948,7 +1949,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
case ICmpInst::ICMP_EQ: {
if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
-
+
// If all bits are known zero except for one, then we know at most one
// bit is set. If the comparison is against zero, then this is a check
// to see if *that* bit is set.
@@ -1960,7 +1961,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
LHSC->getValue() != Op0KnownZeroInverted)
LHS = Op0;
-
+
// If the LHS is 1 << x, and we know the result is a power of 2 like 8,
// then turn "((1 << x)&8) == 0" into "x != 3".
Value *X = 0;
@@ -1969,7 +1970,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return new ICmpInst(ICmpInst::ICMP_NE, X,
ConstantInt::get(X->getType(), CmpVal));
}
-
+
// If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
// then turn "((8 >>u x)&1) == 0" into "x != 3".
const APInt *CI;
@@ -1979,13 +1980,13 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
ConstantInt::get(X->getType(),
CI->countTrailingZeros()));
}
-
+
break;
}
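
The "(1 << x) & 8" fold in the ICMP_EQ case above reduces a power-of-two mask test to an index compare. A tiny standalone check (illustrative):

    #include <cassert>

    int main() {
      for (unsigned x = 0; x < 8; ++x) {
        bool orig = ((1u << x) & 8u) == 0;   // ((1 << x) & 8) == 0
        bool folded = x != 3;                // 8 == 1 << 3
        assert(orig == folded);
      }
    }
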
case ICmpInst::ICMP_NE: {
if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
-
+
// If all bits are known zero except for one, then we know at most one
// bit is set. If the comparison is against zero, then this is a check
// to see if *that* bit is set.
@@ -1997,7 +1998,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (!match(Op0, m_And(m_Value(LHS), m_ConstantInt(LHSC))) ||
LHSC->getValue() != Op0KnownZeroInverted)
LHS = Op0;
-
+
// If the LHS is 1 << x, and we know the result is a power of 2 like 8,
// then turn "((1 << x)&8) != 0" into "x == 3".
Value *X = 0;
@@ -2006,7 +2007,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return new ICmpInst(ICmpInst::ICMP_EQ, X,
ConstantInt::get(X->getType(), CmpVal));
}
-
+
// If the LHS is 8 >>u x, and we know the result is a power of 2 like 1,
// then turn "((8 >>u x)&1) != 0" into "x == 3".
const APInt *CI;
@@ -2016,7 +2017,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
ConstantInt::get(X->getType(),
CI->countTrailingZeros()));
}
-
+
break;
}
case ICmpInst::ICMP_ULT:
@@ -2137,9 +2138,9 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// See if we are doing a comparison between a constant and an instruction that
// can be folded into the comparison.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
- // Since the RHS is a ConstantInt (CI), if the left hand side is an
- // instruction, see if that instruction also has constants so that the
- // instruction can be folded into the icmp
+ // Since the RHS is a ConstantInt (CI), if the left hand side is an
+ // instruction, see if that instruction also has constants so that the
+ // instruction can be folded into the icmp
if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
return Res;
@@ -2194,7 +2195,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
case Instruction::IntToPtr:
// icmp pred inttoptr(X), null -> icmp pred X, 0
if (RHSC->isNullValue() && TD &&
- TD->getIntPtrType(RHSC->getContext()) ==
+ TD->getIntPtrType(RHSC->getContext()) ==
LHSI->getOperand(0)->getType())
return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
Constant::getNullValue(LHSI->getOperand(0)->getType()));
@@ -2227,8 +2228,8 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// values. If the ptr->ptr cast can be stripped off both arguments, we do so
// now.
if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) {
- if (Op0->getType()->isPointerTy() &&
- (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
+ if (Op0->getType()->isPointerTy() &&
+ (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
// We keep moving the cast from the left operand over to the right
// operand, where it can often be eliminated completely.
Op0 = CI->getOperand(0);
@@ -2250,7 +2251,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return new ICmpInst(I.getPredicate(), Op0, Op1);
}
}
-
+
if (isa<CastInst>(Op0)) {
// Handle the special case of: icmp (cast bool to X), <cst>
// This comes up when you have code like
@@ -2384,7 +2385,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
return new ICmpInst(Pred, BO0->getOperand(0),
BO1->getOperand(0));
}
-
+
if (CI->isMaxValue(true)) {
ICmpInst::Predicate Pred = I.isSigned()
? I.getUnsignedPredicate()
@@ -2404,7 +2405,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// Mask = -1 >> count-trailing-zeros(Cst).
if (!CI->isZero() && !CI->isOne()) {
const APInt &AP = CI->getValue();
- ConstantInt *Mask = ConstantInt::get(I.getContext(),
+ ConstantInt *Mask = ConstantInt::get(I.getContext(),
APInt::getLowBitsSet(AP.getBitWidth(),
AP.getBitWidth() -
AP.countTrailingZeros()));
@@ -2438,7 +2439,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}
}
}
-
+
{ Value *A, *B;
// ~x < ~y --> y < x
// ~x < cst --> ~cst < x
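
The not-compare rewrites above use that bitwise-not reverses order; on 8-bit unsigned values ~v is 255 - v. A standalone spot check (illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned y = 0; y < 256; ++y) {
          bool lhs = uint8_t(~x) < uint8_t(~y);  // ~x < ~y
          assert(lhs == (y < x));                // --> y < x
        }
    }
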
@@ -2452,11 +2453,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// (a+b) <u a --> llvm.uadd.with.overflow.
// (a+b) <u b --> llvm.uadd.with.overflow.
if (I.getPredicate() == ICmpInst::ICMP_ULT &&
- match(Op0, m_Add(m_Value(A), m_Value(B))) &&
+ match(Op0, m_Add(m_Value(A), m_Value(B))) &&
(Op1 == A || Op1 == B))
if (Instruction *R = ProcessUAddIdiom(I, Op0, *this))
return R;
-
+
// a >u (a+b) --> llvm.uadd.with.overflow.
// b >u (a+b) --> llvm.uadd.with.overflow.
if (I.getPredicate() == ICmpInst::ICMP_UGT &&
@@ -2465,7 +2466,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (Instruction *R = ProcessUAddIdiom(I, Op1, *this))
return R;
}
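
Both idioms funnel into ProcessUAddIdiom because an unsigned add wraps exactly when the truncated sum compares below either operand. Checked exhaustively on 8-bit values; standalone sketch, not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b) {
          uint8_t sum = uint8_t(a + b);
          bool idiom = sum < a;          // (a+b) <u a
          bool overflow = a + b > 255;   // true carry-out of the 8-bit add
          assert(idiom == overflow);
        }
    }
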
-
+
if (I.isEquality()) {
Value *A, *B, *C, *D;
@@ -2483,10 +2484,10 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
Constant *NC = ConstantInt::get(I.getContext(),
C1->getValue() ^ C2->getValue());
- Value *Xor = Builder->CreateXor(C, NC, "tmp");
+ Value *Xor = Builder->CreateXor(C, NC);
return new ICmpInst(I.getPredicate(), A, Xor);
}
-
+
// A^B == A^D -> B == D
if (A == C) return new ICmpInst(I.getPredicate(), B, D);
if (A == D) return new ICmpInst(I.getPredicate(), B, C);
@@ -2494,7 +2495,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (B == D) return new ICmpInst(I.getPredicate(), A, C);
}
}
-
+
if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
(A == Op0 || B == Op0)) {
// A == (A^B) -> B == 0
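
The xor equality folds here ("A^B == A^D -> B == D" above, "A == (A^B) -> B == 0" below) are cancellation laws of xor. The second one, spot-checked in plain C++ (illustrative):

    #include <cassert>

    int main() {
      for (unsigned a = 0; a < 256; ++a)
        for (unsigned b = 0; b < 256; ++b)
          assert(((a ^ b) == a) == (b == 0));
    }
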
@@ -2504,10 +2505,10 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
}

  // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
- if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
+ if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
Value *X = 0, *Y = 0, *Z = 0;
-
+
if (A == C) {
X = B; Y = D; Z = A;
} else if (A == D) {
@@ -2517,16 +2518,16 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
} else if (B == D) {
X = A; Y = C; Z = B;
}
-
+
if (X) { // Build (X^Y) & Z
- Op1 = Builder->CreateXor(X, Y, "tmp");
- Op1 = Builder->CreateAnd(Op1, Z, "tmp");
+ Op1 = Builder->CreateXor(X, Y);
+ Op1 = Builder->CreateAnd(Op1, Z);
I.setOperand(0, Op1);
I.setOperand(1, Constant::getNullValue(Op1->getType()));
return &I;
}
}
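
The rewrite just built, comparing (X^Y) & Z against zero instead of comparing the two masked values, holds because the masked values agree exactly when X and Y do not differ inside Z. A standalone check (illustrative; the mask stride just samples a few Z values):

    #include <cassert>

    int main() {
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned y = 0; y < 256; ++y)
          for (unsigned z = 0; z < 256; z += 37)
            assert(((x & z) == (y & z)) == (((x ^ y) & z) == 0));
    }
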
-
+
// Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
// "icmp (and X, mask), cst"
uint64_t ShAmt = 0;
@@ -2539,21 +2540,21 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// when it exposes other optimizations.
!A->hasOneUse()) {
unsigned ASize =cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
-
+
if (ShAmt < ASize) {
APInt MaskV =
APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
MaskV <<= ShAmt;
-
+
APInt CmpV = Cst1->getValue().zext(ASize);
CmpV <<= ShAmt;
-
+
Value *Mask = Builder->CreateAnd(A, Builder->getInt(MaskV));
return new ICmpInst(I.getPredicate(), Mask, Builder->getInt(CmpV));
}
}
}
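
The trunc-of-lshr fold above widens the compare instead of narrowing the value: MaskV isolates the truncated window of X in place, and the constant is lifted by the same shift. Checked on a 16-bit source truncated to 8 bits; standalone sketch, widths chosen only for the test:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned s = 0; s + 8 <= 16; ++s) {
        uint32_t mask = 0xFFu << s;              // MaskV: low 8 bits shifted up
        for (uint32_t x = 0; x < 0x10000; ++x)
          for (uint32_t c = 0; c < 256; c += 17) {   // sample constants
            bool orig = uint8_t(x >> s) == c;        // trunc(lshr(X, s)) == c
            bool folded = (x & mask) == (c << s);    // (X & MaskV) == CmpV
            assert(orig == folded);
          }
      }
    }
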
-
+
{
Value *X; ConstantInt *Cst;
// icmp X+Cst, X
@@ -2579,31 +2580,31 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
Constant *RHSC) {
if (!isa<ConstantFP>(RHSC)) return 0;
const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
-
+
// Get the width of the mantissa. We don't want to hack on conversions that
// might lose information from the integer, e.g. "i64 -> float"
int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
if (MantissaWidth == -1) return 0; // Unknown.
-
+
// Check to see that the input is converted from an integer type that is small
// enough that preserves all bits. TODO: check here for "known" sign bits.
// This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
unsigned InputSize = LHSI->getOperand(0)->getType()->getScalarSizeInBits();
-
+
// If this is a uitofp instruction, we need an extra bit to hold the sign.
bool LHSUnsigned = isa<UIToFPInst>(LHSI);
if (LHSUnsigned)
++InputSize;
-
+
// If the conversion would lose info, don't hack on this.
if ((int)InputSize > MantissaWidth)
return 0;
-
+
// Otherwise, we can potentially simplify the comparison. We know that it
// will always come through as an integer value and we know the constant is
// not a NAN (it would have been previously simplified).
assert(!RHS.isNaN() && "NaN comparison not already folded!");
-
+
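
The InputSize/MantissaWidth guard above is the usual exactness condition for int-to-FP conversion. A standalone illustration with float's 24-bit significand; not part of the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t v = 0; v < 0x10000; ++v)   // every i16 fits 24 bits: exact
        assert(uint32_t(float(v)) == v);
      uint32_t big = 0x01000001;               // 2^24 + 1 needs 25 bits
      assert(uint32_t(float(big)) != big);     // rounds to 2^24: info lost
    }
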
ICmpInst::Predicate Pred;
switch (I.getPredicate()) {
default: llvm_unreachable("Unexpected predicate!");
@@ -2636,15 +2637,15 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
case FCmpInst::FCMP_UNO:
return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
}
-
- const IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
-
+
+ IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+
// Now we know that the APFloat is a normal number, zero or inf.
-
+
// See if the FP constant is too large for the integer. For example,
// comparing an i8 to 300.0.
unsigned IntWidth = IntTy->getScalarSizeInBits();
-
+
if (!LHSUnsigned) {
// If the RHS value is > SignedMax, fold the comparison. This handles +INF
// and large values.
@@ -2670,7 +2671,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
}
}
-
+
if (!LHSUnsigned) {
// See if the RHS value is < SignedMin.
APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
@@ -2766,7 +2767,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
bool Changed = false;
-
+
/// Orders the operands of the compare so that they are listed from most
/// complex to least complex. This puts constants before unary operators,
/// before binary operators.
@@ -2776,7 +2777,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
}

  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
+
if (Value *V = SimplifyFCmpInst(I.getPredicate(), Op0, Op1, TD))
return ReplaceInstUsesWith(I, V);
@@ -2792,7 +2793,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
I.setPredicate(FCmpInst::FCMP_UNO);
I.setOperand(1, Constant::getNullValue(Op0->getType()));
return &I;
-
+
case FCmpInst::FCMP_ORD: // True if ordered (no nans)
case FCmpInst::FCMP_OEQ: // True if ordered and equal
case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
@@ -2803,7 +2804,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
return &I;
}
}
-
+
// Handle fcmp with constant RHS
if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
@@ -2836,10 +2837,14 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
APFloat F = RHSF->getValueAPF();
F.convert(*Sem, APFloat::rmNearestTiesToEven, &Lossy);
- // Avoid lossy conversions and denormals.
+ // Avoid lossy conversions and denormals. Zero is a special case
+ // that's OK to convert.
+ APFloat Fabs = F;
+ Fabs.clearSign();
if (!Lossy &&
- F.compare(APFloat::getSmallestNormalized(*Sem)) !=
- APFloat::cmpLessThan)
+ ((Fabs.compare(APFloat::getSmallestNormalized(*Sem)) !=
+ APFloat::cmpLessThan) || Fabs.isZero()))
+
return new FCmpInst(I.getPredicate(), LHSExt->getOperand(0),
ConstantFP::get(RHSC->getContext(), F));
break;
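
The change above additionally lets an exact zero through the denormal filter. The legality of the whole fpext fold, comparing in the narrow type once the constant converts losslessly, can be sketched in plain C++ (illustrative values only):

    #include <cassert>

    int main() {
      float xs[] = {-2.5f, -0.0f, 0.0f, 1.0f, 3.5f};
      double cs[] = {0.0, 1.0, -2.5, 3.5};   // all exactly representable as float
      for (float x : xs)
        for (double c : cs)
          // fcmp (fpext x), c  ==  fcmp x, (float)c when the cast is lossless
          assert((double(x) < c) == (x < float(c)));
    }
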
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index f499290..7446a51 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -26,7 +26,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Ensure that the alloca array size argument has type intptr_t, so that
// any casting is exposed early.
if (TD) {
- const Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
+ Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
if (AI.getArraySize()->getType() != IntPtrTy) {
Value *V = Builder->CreateIntCast(AI.getArraySize(),
IntPtrTy, false);
@@ -38,7 +38,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
if (AI.isArrayAllocation()) { // Check C != 1
if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
- const Type *NewTy =
+ Type *NewTy =
ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName());
@@ -58,8 +58,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
Idx[0] = NullIdx;
Idx[1] = NullIdx;
Instruction *GEP =
- GetElementPtrInst::CreateInBounds(New, Idx, Idx + 2,
- New->getName()+".sub");
+ GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub");
InsertNewInstBefore(GEP, *It);
// Now make everything use the getelementptr instead of the original
@@ -92,28 +91,28 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
User *CI = cast<User>(LI.getOperand(0));
Value *CastOp = CI->getOperand(0);
- const PointerType *DestTy = cast<PointerType>(CI->getType());
- const Type *DestPTy = DestTy->getElementType();
- if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
+ PointerType *DestTy = cast<PointerType>(CI->getType());
+ Type *DestPTy = DestTy->getElementType();
+ if (PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
// If the address spaces don't match, don't eliminate the cast.
if (DestTy->getAddressSpace() != SrcTy->getAddressSpace())
return 0;
- const Type *SrcPTy = SrcTy->getElementType();
+ Type *SrcPTy = SrcTy->getElementType();
if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() ||
DestPTy->isVectorTy()) {
// If the source is an array, the code below will not succeed. Check to
// see if a trivial 'gep P, 0, 0' will help matters. Only do this for
// constants.
- if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
+ if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
if (Constant *CSrc = dyn_cast<Constant>(CastOp))
if (ASrcTy->getNumElements() != 0) {
Value *Idxs[2];
Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext()));
Idxs[1] = Idxs[0];
- CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
+ CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
SrcTy = cast<PointerType>(CastOp->getType());
SrcPTy = SrcTy->getElementType();
}
@@ -133,6 +132,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
LoadInst *NewLoad =
IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName());
NewLoad->setAlignment(LI.getAlignment());
+ NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope());
// Now cast the result of the load.
return new BitCastInst(NewLoad, LI.getType());
}
@@ -163,8 +163,9 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
if (Instruction *Res = InstCombineLoadCast(*this, LI, TD))
return Res;
- // None of the following transforms are legal for volatile loads.
- if (LI.isVolatile()) return 0;
+ // None of the following transforms are legal for volatile/atomic loads.
+ // FIXME: Some of it is okay for atomic loads; needs refactoring.
+ if (!LI.isSimple()) return 0;
// Do really simple store-to-load forwarding and load CSE, to catch cases
// where there are several consecutive memory accesses to the same location,
@@ -256,11 +257,11 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
User *CI = cast<User>(SI.getOperand(1));
Value *CastOp = CI->getOperand(0);
- const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
- const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
+ Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
+ PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType());
if (SrcTy == 0) return 0;
- const Type *SrcPTy = SrcTy->getElementType();
+ Type *SrcPTy = SrcTy->getElementType();
if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy())
return 0;
@@ -280,12 +281,12 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
NewGEPIndices.push_back(Zero);
while (1) {
- if (const StructType *STy = dyn_cast<StructType>(SrcPTy)) {
+ if (StructType *STy = dyn_cast<StructType>(SrcPTy)) {
if (!STy->getNumElements()) /* Struct can be empty {} */
break;
NewGEPIndices.push_back(Zero);
SrcPTy = STy->getElementType(0);
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcPTy)) {
NewGEPIndices.push_back(Zero);
SrcPTy = ATy->getElementType();
} else {
@@ -314,8 +315,8 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
Value *NewCast;
Value *SIOp0 = SI.getOperand(0);
Instruction::CastOps opcode = Instruction::BitCast;
- const Type* CastSrcTy = SIOp0->getType();
- const Type* CastDstTy = SrcPTy;
+ Type* CastSrcTy = SIOp0->getType();
+ Type* CastDstTy = SrcPTy;
if (CastDstTy->isPointerTy()) {
if (CastSrcTy->isIntegerTy())
opcode = Instruction::IntToPtr;
@@ -327,8 +328,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
// SIOp0 is a pointer to aggregate and this is a store to the first field,
// emit a GEP to index into its first field.
if (!NewGEPIndices.empty())
- CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices.begin(),
- NewGEPIndices.end());
+ CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices);
NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy,
SIOp0->getName()+".c");
@@ -370,21 +370,6 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
Value *Val = SI.getOperand(0);
Value *Ptr = SI.getOperand(1);
- // If the RHS is an alloca with a single use, zapify the store, making the
- // alloca dead.
- if (!SI.isVolatile()) {
- if (Ptr->hasOneUse()) {
- if (isa<AllocaInst>(Ptr))
- return EraseInstFromFunction(SI);
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
- if (isa<AllocaInst>(GEP->getOperand(0))) {
- if (GEP->getOperand(0)->hasOneUse())
- return EraseInstFromFunction(SI);
- }
- }
- }
- }
-
// Attempt to improve the alignment.
if (TD) {
unsigned KnownAlign =
@@ -400,6 +385,23 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
SI.setAlignment(EffectiveStoreAlign);
}
+ // Don't hack volatile/atomic stores.
+ // FIXME: Some bits are legal for atomic stores; needs refactoring.
+ if (!SI.isSimple()) return 0;
+
+ // If the RHS is an alloca with a single use, zapify the store, making the
+ // alloca dead.
+ if (Ptr->hasOneUse()) {
+ if (isa<AllocaInst>(Ptr))
+ return EraseInstFromFunction(SI);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (isa<AllocaInst>(GEP->getOperand(0))) {
+ if (GEP->getOperand(0)->hasOneUse())
+ return EraseInstFromFunction(SI);
+ }
+ }
+ }
+
// Do really simple DSE, to catch cases where there are several consecutive
// stores to the same location, separated by a few arithmetic operations. This
// situation often occurs with bitfield accesses.
@@ -417,8 +419,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
// Prev store isn't volatile, and stores to the same location?
- if (!PrevSI->isVolatile() &&equivalentAddressValues(PrevSI->getOperand(1),
- SI.getOperand(1))) {
+ if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1),
+ SI.getOperand(1))) {
++NumDeadStore;
++BBI;
EraseInstFromFunction(*PrevSI);
@@ -432,7 +434,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// then *this* store is dead (X = load P; store X -> P).
if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) &&
- !SI.isVolatile())
+ LI->isSimple())
return EraseInstFromFunction(SI);
// Otherwise, this is a load from some other location. Stores before it
@@ -444,9 +446,6 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory())
break;
}
-
-
- if (SI.isVolatile()) return 0; // Don't hack volatile stores.
// store X, null -> turns into 'unreachable' in SimplifyCFG
if (isa<ConstantPointerNull>(Ptr) && SI.getPointerAddressSpace() == 0) {
@@ -549,11 +548,11 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
return false;
--BBI;
}
- // If this isn't a store, isn't a store to the same location, or if the
- // alignments differ, bail out.
+ // If this isn't a store, isn't a store to the same location, or is not the
+ // right kind of store, bail out.
OtherStore = dyn_cast<StoreInst>(BBI);
if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
- OtherStore->getAlignment() != SI.getAlignment())
+ !SI.isSameOperationAs(OtherStore))
return false;
} else {
// Otherwise, the other block ended with a conditional branch. If one of the
@@ -569,7 +568,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
// Check to see if we find the matching store.
if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
if (OtherStore->getOperand(1) != SI.getOperand(1) ||
- OtherStore->getAlignment() != SI.getAlignment())
+ !SI.isSameOperationAs(OtherStore))
return false;
break;
}
@@ -601,10 +600,12 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
// Advance to a place where it is safe to insert the new store and
// insert it.
- BBI = DestBB->getFirstNonPHI();
+ BBI = DestBB->getFirstInsertionPt();
StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1),
- OtherStore->isVolatile(),
- SI.getAlignment());
+ SI.isVolatile(),
+ SI.getAlignment(),
+ SI.getOrdering(),
+ SI.getSynchScope());
InsertNewInstBefore(NewSI, *BBI);
NewSI->setDebugLoc(OtherStore->getDebugLoc());
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 630a6fe..7f48125 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -38,7 +38,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
m_Value(B))) &&
// The "1" can be any value known to be a power of 2.
isPowerOfTwo(PowerOf2, IC.getTargetData())) {
- A = IC.Builder->CreateSub(A, B, "tmp");
+ A = IC.Builder->CreateSub(A, B);
return IC.Builder->CreateShl(PowerOf2, A);
}
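
The rewrite above replaces a shift-down of a power of two with a shift-up by the difference; it is only sound because the caller knows the value is non-zero, i.e. nothing was shifted out. A standalone check (illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned k = 0; k < 4; ++k)
        for (unsigned a = 0; a < 8; ++a)
          for (unsigned b = 0; b <= a; ++b) {
            uint32_t pow2 = 1u << k;
            if (((pow2 << a) >> b) == 0)
              continue; // mirror the known-non-zero precondition
            assert(((pow2 << a) >> b) == (pow2 << (a - b)));
          }
    }
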
@@ -131,7 +131,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
{ Value *X; ConstantInt *C1;
if (Op0->hasOneUse() &&
match(Op0, m_Add(m_Value(X), m_ConstantInt(C1)))) {
- Value *Add = Builder->CreateMul(X, CI, "tmp");
+ Value *Add = Builder->CreateMul(X, CI);
return BinaryOperator::CreateAdd(Add, Builder->CreateMul(C1, CI));
}
}
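
The (X + C1) * CI split above is distributivity in wrapping arithmetic; both constant products then fold away. Spot-checked mod 256 with illustrative constants:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < 256; ++x)
        assert(uint8_t((x + 37) * 5) == uint8_t(x * 5 + 37 * 5));
    }
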
@@ -244,7 +244,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
if (BoolCast) {
Value *V = Builder->CreateSub(Constant::getNullValue(I.getType()),
- BoolCast, "tmp");
+ BoolCast);
return BinaryOperator::CreateAnd(V, OtherOp);
}
}
@@ -421,7 +421,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
/// dyn_castZExtVal - Checks if V is a zext or constant that can
/// be truncated to Ty without losing bits.
-static Value *dyn_castZExtVal(Value *V, const Type *Ty) {
+static Value *dyn_castZExtVal(Value *V, Type *Ty) {
if (ZExtInst *Z = dyn_cast<ZExtInst>(V)) {
if (Z->getSrcTy() == Ty)
return Z->getOperand(0);
@@ -466,8 +466,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
{ const APInt *CI; Value *N;
if (match(Op1, m_Shl(m_Power2(CI), m_Value(N)))) {
if (*CI != 1)
- N = Builder->CreateAdd(N, ConstantInt::get(I.getType(), CI->logBase2()),
- "tmp");
+ N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2()));
if (I.isExact())
return BinaryOperator::CreateExactLShr(Op0, N);
return BinaryOperator::CreateLShr(Op0, N);
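
The udiv fold above turns division by a shifted power of two into a right shift by log2(C) + N. A standalone check with C = 8 (illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x = 0; x < 4096; ++x)
        for (unsigned n = 0; n < 4; ++n) {
          uint32_t c = 8;                      // power of two, log2(c) == 3
          assert(x / (c << n) == x >> (3 + n));
        }
    }
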
@@ -630,7 +629,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
// Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)
if (match(Op1, m_Shl(m_Power2(), m_Value()))) {
Constant *N1 = Constant::getAllOnesValue(I.getType());
- Value *Add = Builder->CreateAdd(Op1, N1, "tmp");
+ Value *Add = Builder->CreateAdd(Op1, N1);
return BinaryOperator::CreateAnd(Op0, Add);
}
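
Likewise the urem fold uses that remainder by a power of two is a low-bit mask, and (C << N) - 1 is exactly that mask. Checked standalone (illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t a = 0; a < 4096; ++a)
        for (unsigned n = 0; n < 5; ++n) {
          uint32_t m = 4u << n;                // power-of-two modulus
          assert(a % m == (a & (m - 1)));
        }
    }
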
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 3777340..664546c 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -28,8 +28,8 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
Value *LHSVal = FirstInst->getOperand(0);
Value *RHSVal = FirstInst->getOperand(1);
- const Type *LHSType = LHSVal->getType();
- const Type *RHSType = RHSVal->getType();
+ Type *LHSType = LHSVal->getType();
+ Type *RHSType = RHSVal->getType();
bool isNUW = false, isNSW = false, isExact = false;
if (OverflowingBinaryOperator *BO =
@@ -229,8 +229,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) {
Value *Base = FixedOperands[0];
GetElementPtrInst *NewGEP =
- GetElementPtrInst::Create(Base, FixedOperands.begin()+1,
- FixedOperands.end());
+ GetElementPtrInst::Create(Base, makeArrayRef(FixedOperands).slice(1));
if (AllInBounds) NewGEP->setIsInBounds();
NewGEP->setDebugLoc(FirstInst->getDebugLoc());
return NewGEP;
@@ -287,7 +286,12 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
-
+
+ // FIXME: This is overconservative; this transform is allowed in some cases
+ // for atomic operations.
+ if (FirstLI->isAtomic())
+ return 0;
+
// When processing loads, we need to propagate two bits of information to the
// sunk load: whether it is volatile, and what its alignment is. We currently
// don't sink loads when some have their alignment specified and some don't.
@@ -397,7 +401,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
// the same type or "+42") we can pull the operation through the PHI, reducing
// code size and simplifying code.
Constant *ConstantOp = 0;
- const Type *CastSrcTy = 0;
+ Type *CastSrcTy = 0;
bool isNUW = false, isNSW = false, isExact = false;
if (isa<CastInst>(FirstInst)) {
@@ -572,7 +576,7 @@ struct LoweredPHIRecord {
unsigned Shift; // The amount shifted.
unsigned Width; // The width extracted.
- LoweredPHIRecord(PHINode *pn, unsigned Sh, const Type *Ty)
+ LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
: PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
// Ctor form used by DenseMap.
@@ -701,7 +705,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
unsigned PHIId = PHIUsers[UserI].PHIId;
PHINode *PN = PHIsToSlice[PHIId];
unsigned Offset = PHIUsers[UserI].Shift;
- const Type *Ty = PHIUsers[UserI].Inst->getType();
+ Type *Ty = PHIUsers[UserI].Inst->getType();
PHINode *EltPHI;
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 5733c20..91e60a4 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -13,6 +13,7 @@
#include "InstCombine.h"
#include "llvm/Support/PatternMatch.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
using namespace llvm;
using namespace PatternMatch;
@@ -323,9 +324,14 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
}
// All operands were constants, fold it.
- if (ConstOps.size() == I->getNumOperands())
+ if (ConstOps.size() == I->getNumOperands()) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ if (!LI->isVolatile())
+ return ConstantFoldLoadFromConstPtr(ConstOps[0], TD);
+
return ConstantFoldInstOperands(I->getOpcode(), I->getType(),
- ConstOps.data(), ConstOps.size(), TD);
+ ConstOps, TD);
+ }
}
return 0;
@@ -363,7 +369,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
case ICmpInst::ICMP_UGT:
case ICmpInst::ICMP_SGT: {
// These transformations only work for selects over integers.
- const IntegerType *SelectTy = dyn_cast<IntegerType>(SI.getType());
+ IntegerType *SelectTy = dyn_cast<IntegerType>(SI.getType());
if (!SelectTy)
break;
@@ -443,7 +449,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
// FIXME: Type and constness constraints could be lifted, but we have to
// watch code size carefully. We should consider xor instead of
// sub/add when we decide to do that.
- if (const IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) {
+ if (IntegerType *Ty = dyn_cast<IntegerType>(CmpLHS->getType())) {
if (TrueVal->getType() == Ty) {
if (ConstantInt *Cmp = dyn_cast<ConstantInt>(CmpRHS)) {
ConstantInt *C1 = NULL, *C2 = NULL;
@@ -476,10 +482,16 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI,
if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal ||
SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal)
return ReplaceInstUsesWith(SI, FalseVal);
+ if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal ||
+ SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal)
+ return ReplaceInstUsesWith(SI, FalseVal);
} else if (Pred == ICmpInst::ICMP_NE) {
if (SimplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, TD) == FalseVal ||
SimplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, TD) == FalseVal)
return ReplaceInstUsesWith(SI, TrueVal);
+ if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, TD) == TrueVal ||
+ SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, TD) == TrueVal)
+ return ReplaceInstUsesWith(SI, TrueVal);
}
// NOTE: if we wanted to, this is where to detect integer MIN/MAX
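
The four new SimplifyWithOpReplaced calls all exploit the same observation: in the arm guarded by x == y (or reached only when x != y), x may be replaced by y, and if that makes one arm equal the other, the select collapses. A plain C++ rendering of the eq case, where f is an arbitrary illustrative function:

    #include <cassert>

    int f(int v) { return v * 2 + 1; }

    int main() {
      for (int x = -4; x <= 4; ++x)
        for (int y = -4; y <= 4; ++y) {
          // substituting x := y in the true arm yields the false arm,
          // so the whole select is f(y) regardless of the compare
          int sel = (x == y) ? f(x) : f(y);
          assert(sel == f(y));
        }
    }
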
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 811f949..6d85add 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -13,6 +13,7 @@
#include "InstCombine.h"
#include "llvm/IntrinsicInst.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
@@ -207,11 +208,12 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
return I;
case Instruction::Shl: {
- unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+ unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
// We only accept shifts-by-a-constant in CanEvaluateShifted.
- ConstantInt *CI = cast<ConstantInt>(I->getOperand(1));
-
+ ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
+
// We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
if (isLeftShift) {
// If this is oversized composite shift, then unsigned shifts get 0.
@@ -219,7 +221,9 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
if (NewShAmt >= TypeWidth)
return Constant::getNullValue(I->getType());
- I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt));
+ BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
+ BO->setHasNoUnsignedWrap(false);
+ BO->setHasNoSignedWrap(false);
return I;
}
@@ -227,11 +231,11 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
// zeros.
if (CI->getValue() == NumBits) {
APInt Mask(APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits));
- V = IC.Builder->CreateAnd(I->getOperand(0),
- ConstantInt::get(I->getContext(), Mask));
+ V = IC.Builder->CreateAnd(BO->getOperand(0),
+ ConstantInt::get(BO->getContext(), Mask));
if (Instruction *VI = dyn_cast<Instruction>(V)) {
- VI->moveBefore(I);
- VI->takeName(I);
+ VI->moveBefore(BO);
+ VI->takeName(BO);
}
return V;
}
@@ -239,23 +243,27 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
// We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
// the and won't be needed.
assert(CI->getZExtValue() > NumBits);
- I->setOperand(1, ConstantInt::get(I->getType(),
- CI->getZExtValue() - NumBits));
- return I;
+ BO->setOperand(1, ConstantInt::get(BO->getType(),
+ CI->getZExtValue() - NumBits));
+ BO->setHasNoUnsignedWrap(false);
+ BO->setHasNoSignedWrap(false);
+ return BO;
}
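
The Shl/LShr cases above compose same-direction shifts, clamp oversized composites to zero, and now also drop the nuw/nsw/exact flags, since those were stated for the old shift amounts. The arithmetic part, checked standalone on 8-bit values (illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned c1 = 0; c1 < 8; ++c1)
          for (unsigned c2 = 0; c2 < 8; ++c2) {
            unsigned sum = c1 + c2;
            // shl(c1)+shl(c2) -> shl(c1+c2), zero once the width is exceeded
            uint8_t shl2 = uint8_t(uint8_t(x << c1) << c2);
            assert(shl2 == uint8_t(sum >= 8 ? 0 : x << sum));
            // lshr(c1)+lshr(c2) -> lshr(c1+c2), likewise
            uint8_t shr2 = uint8_t(uint8_t(x >> c1) >> c2);
            assert(shr2 == uint8_t(sum >= 8 ? 0 : uint8_t(x) >> sum));
          }
    }
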
case Instruction::LShr: {
- unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+ unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
// We only accept shifts-by-a-constant in CanEvaluateShifted.
- ConstantInt *CI = cast<ConstantInt>(I->getOperand(1));
+ ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
// We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
if (!isLeftShift) {
// If this is oversized composite shift, then unsigned shifts get 0.
unsigned NewShAmt = NumBits+CI->getZExtValue();
if (NewShAmt >= TypeWidth)
- return Constant::getNullValue(I->getType());
+ return Constant::getNullValue(BO->getType());
- I->setOperand(1, ConstantInt::get(I->getType(), NewShAmt));
+ BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
+ BO->setIsExact(false);
return I;
}
@@ -264,7 +272,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
if (CI->getValue() == NumBits) {
APInt Mask(APInt::getHighBitsSet(TypeWidth, TypeWidth - NumBits));
V = IC.Builder->CreateAnd(I->getOperand(0),
- ConstantInt::get(I->getContext(), Mask));
+ ConstantInt::get(BO->getContext(), Mask));
if (Instruction *VI = dyn_cast<Instruction>(V)) {
VI->moveBefore(I);
VI->takeName(I);
@@ -275,9 +283,10 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
// We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that
// the and won't be needed.
assert(CI->getZExtValue() > NumBits);
- I->setOperand(1, ConstantInt::get(I->getType(),
- CI->getZExtValue() - NumBits));
- return I;
+ BO->setOperand(1, ConstantInt::get(BO->getType(),
+ CI->getZExtValue() - NumBits));
+ BO->setIsExact(false);
+ return BO;
}
case Instruction::Select:
@@ -528,7 +537,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
uint32_t AmtSum = ShiftAmt1+ShiftAmt2; // Fold into one big shift.
- const IntegerType *Ty = cast<IntegerType>(I.getType());
+ IntegerType *Ty = cast<IntegerType>(I.getType());
// Check for (X << c1) << c2 and (X >> c1) >> c2
if (I.getOpcode() == ShiftOp->getOpcode()) {
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8fea8eb..5cd9a4b 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -103,7 +103,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
assert(V != 0 && "Null pointer of Value???");
assert(Depth <= 6 && "Limit Search Depth");
uint32_t BitWidth = DemandedMask.getBitWidth();
- const Type *VTy = V->getType();
+ Type *VTy = V->getType();
assert((TD || !VTy->isPointerTy()) &&
"SimplifyDemandedBits needs to know bit widths!");
assert((!TD || TD->getTypeSizeInBits(VTy->getScalarType()) == BitWidth) &&
@@ -325,8 +325,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
Constant *AndC = Constant::getIntegerValue(VTy,
~RHSKnownOne & DemandedMask);
- Instruction *And =
- BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
+ Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
return InsertNewInstWith(And, *I);
}
}
@@ -351,14 +350,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Constant *AndC =
ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
- Instruction *NewAnd =
- BinaryOperator::CreateAnd(I->getOperand(0), AndC, "tmp");
+ Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
InsertNewInstWith(NewAnd, *I);
Constant *XorC =
ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
- Instruction *NewXor =
- BinaryOperator::CreateXor(NewAnd, XorC, "tmp");
+ Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
return InsertNewInstWith(NewXor, *I);
}
@@ -404,8 +401,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
return 0; // vector->int or fp->int?
- if (const VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
- if (const VectorType *SrcVTy =
+ if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
+ if (VectorType *SrcVTy =
dyn_cast<VectorType>(I->getOperand(0)->getType())) {
if (DstVTy->getNumElements() != SrcVTy->getNumElements())
// Don't touch a bitcast between vectors of different element counts.
@@ -826,7 +823,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
UndefElts = 0;
if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
- const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+ Type *EltTy = cast<VectorType>(V->getType())->getElementType();
Constant *Undef = UndefValue::get(EltTy);
std::vector<Constant*> Elts;
@@ -855,7 +852,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
if (DemandedElts.isAllOnesValue())
return 0;
- const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+ Type *EltTy = cast<VectorType>(V->getType())->getElementType();
Constant *Zero = Constant::getNullValue(EltTy);
Constant *Undef = UndefValue::get(EltTy);
std::vector<Constant*> Elts;
@@ -962,6 +959,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
unsigned MaskVal = Shuffle->getMaskValue(i);
if (MaskVal == -1u) {
UndefElts.setBit(i);
+ } else if (!DemandedElts[i]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
} else if (MaskVal < LHSVWidth) {
if (UndefElts4[MaskVal]) {
NewUndefElts = true;
@@ -992,7 +992,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
}
case Instruction::BitCast: {
// Vector->vector casts only.
- const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+ VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
if (!VTy) break;
unsigned InVWidth = VTy->getNumElements();
APInt InputDemandedElts(InVWidth, 0);
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index ad6a8d0..154267c 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -77,7 +77,7 @@ static std::vector<int> getShuffleMask(const ShuffleVectorInst *SVI) {
/// extracted from the vector.
static Value *FindScalarElement(Value *V, unsigned EltNo) {
assert(V->getType()->isVectorTy() && "Not looking at a vector?");
- const VectorType *PTy = cast<VectorType>(V->getType());
+ VectorType *PTy = cast<VectorType>(V->getType());
unsigned Width = PTy->getNumElements();
if (EltNo >= Width) // Out of range access.
return UndefValue::get(PTy->getElementType());
@@ -175,7 +175,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
// the same number of elements, see if we can find the source element from
// it. In this case, we will end up needing to bitcast the scalars.
if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
- if (const VectorType *VT =
+ if (VectorType *VT =
dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
if (VT->getNumElements() == VectorWidth)
if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
@@ -225,7 +225,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
SrcIdx -= LHSWidth;
Src = SVI->getOperand(1);
}
- const Type *Int32Ty = Type::getInt32Ty(EI.getContext());
+ Type *Int32Ty = Type::getInt32Ty(EI.getContext());
return ExtractElementInst::Create(Src,
ConstantInt::get(Int32Ty,
SrcIdx, false));
@@ -555,7 +555,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// shuffle mask, do the replacement.
if (isSplat || NewMask == LHSMask || NewMask == Mask) {
std::vector<Constant*> Elts;
- const Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
+ Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
for (unsigned i = 0, e = NewMask.size(); i != e; ++i) {
if (NewMask[i] < 0) {
Elts.push_back(UndefValue::get(Int32Ty));
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index ab98ef9..92874b9 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -46,8 +46,10 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/ValueHandle.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm-c/Initialization.h"
#include <algorithm>
#include <climits>
@@ -83,7 +85,7 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
/// ShouldChangeType - Return true if it is desirable to convert a computation
/// from 'From' to 'To'. We don't want to convert from a legal to an illegal
/// type for example, or from a smaller to a larger illegal type.
-bool InstCombiner::ShouldChangeType(const Type *From, const Type *To) const {
+bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
assert(From->isIntegerTy() && To->isIntegerTy());
// If we don't have TD, we don't know if the source/dest are legal.
@@ -107,6 +109,43 @@ bool InstCombiner::ShouldChangeType(const Type *From, const Type *To) const {
return true;
}
+// Return true, if No Signed Wrap should be maintained for I.
+// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
+// where both B and C should be ConstantInts, results in a constant that does
+// not overflow. This function only handles the Add and Sub opcodes. For
+// all other opcodes, the function conservatively returns false.
+static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
+ OverflowingBinaryOperator *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ if (!OBO || !OBO->hasNoSignedWrap()) {
+ return false;
+ }
+
+ // We reason about Add and Sub Only.
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ if (Opcode != Instruction::Add &&
+ Opcode != Instruction::Sub) {
+ return false;
+ }
+
+ ConstantInt *CB = dyn_cast<ConstantInt>(B);
+ ConstantInt *CC = dyn_cast<ConstantInt>(C);
+
+ if (!CB || !CC) {
+ return false;
+ }
+
+ const APInt &BVal = CB->getValue();
+ const APInt &CVal = CC->getValue();
+ bool Overflow = false;
+
+ if (Opcode == Instruction::Add) {
+ BVal.sadd_ov(CVal, Overflow);
+ } else {
+ BVal.ssub_ov(CVal, Overflow);
+ }
+
+ return !Overflow;
+}
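
MaintainNoSignedWrap's core test is "does folding the two constants itself overflow signed arithmetic". The same check written against plain int32_t; illustrative, mirroring APInt::sadd_ov only in spirit:

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // Reassociation may keep nsw only if folding constants b and c
    // cannot itself overflow in the signed type.
    bool constantFoldKeepsNSW(int32_t b, int32_t c) {
      int64_t wide = int64_t(b) + int64_t(c);   // add in a wider type
      return wide >= std::numeric_limits<int32_t>::min() &&
             wide <= std::numeric_limits<int32_t>::max();
    }

    int main() {
      assert(constantFoldKeepsNSW(1, 2));
      assert(!constantFoldKeepsNSW(INT32_MAX, 1));  // folded constant overflows
    }
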
/// SimplifyAssociativeOrCommutative - This performs a few simplifications for
/// operators which are associative or commutative:
@@ -158,7 +197,16 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
I.setOperand(1, V);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
- I.clearSubclassOptionalData();
+ if (MaintainNoSignedWrap(I, B, C) &&
+ (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) {
+ // Note: this is only valid because SimplifyBinOp doesn't look at
+ // the operands to Op0.
+ I.clearSubclassOptionalData();
+ I.setHasNoSignedWrap(true);
+ } else {
+ I.clearSubclassOptionalData();
+ }
+
Changed = true;
++NumReassoc;
continue;
@@ -240,7 +288,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
Constant *C2 = cast<Constant>(Op1->getOperand(1));
Constant *Folded = ConstantExpr::get(Opcode, C1, C2);
- Instruction *New = BinaryOperator::Create(Opcode, A, B);
+ BinaryOperator *New = BinaryOperator::Create(Opcode, A, B);
InsertNewInstWith(New, I);
New->takeName(Op1);
I.setOperand(0, New);
@@ -248,6 +296,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
I.clearSubclassOptionalData();
+
Changed = true;
continue;
}
@@ -516,8 +565,8 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
// If it's a bitcast involving vectors, make sure it has the same number of
// elements on both sides.
if (BitCastInst *BC = dyn_cast<BitCastInst>(&Op)) {
- const VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
- const VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+ VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
+ VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
// Verify that either both or neither are vectors.
if ((SrcTy == NULL) != (DestTy == NULL)) return 0;
@@ -654,7 +703,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
}
} else {
CastInst *CI = cast<CastInst>(&I);
- const Type *RetTy = CI->getType();
+ Type *RetTy = CI->getType();
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV;
if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
@@ -680,7 +729,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
/// or not there is a sequence of GEP indices into the type that will land us at
/// the specified offset. If so, fill them into NewIndices and return the
/// resultant element type, otherwise return null.
-const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset,
+Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
SmallVectorImpl<Value*> &NewIndices) {
if (!TD) return 0;
if (!Ty->isSized()) return 0;
@@ -688,7 +737,7 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset,
// Start with the index over the outer type. Note that the type size
// might be zero (even if the offset isn't zero) if the indexed type
// is something like [0 x {int, int}]
- const Type *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+ Type *IntPtrTy = TD->getIntPtrType(Ty->getContext());
int64_t FirstIdx = 0;
if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
FirstIdx = Offset/TySize;
@@ -711,7 +760,7 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset,
if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty))
return 0;
- if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
const StructLayout *SL = TD->getStructLayout(STy);
assert(Offset < (int64_t)SL->getSizeInBytes() &&
"Offset must stay within the indexed type");
@@ -722,7 +771,7 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset,
Offset -= SL->getElementOffset(Elt);
Ty = STy->getElementType(Elt);
- } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
uint64_t EltSize = TD->getTypeAllocSize(AT->getElementType());
assert(EltSize && "Cannot index into a zero-sized array");
NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
@@ -737,12 +786,20 @@ const Type *InstCombiner::FindElementAtOffset(const Type *Ty, int64_t Offset,
return Ty;
}
-
+static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
+ // If this GEP has only 0 indices, it is the same pointer as
+ // Src. If Src is not a trivial GEP too, don't combine
+ // the indices.
+ if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
+ !Src.hasOneUse())
+ return false;
+ return true;
+}
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
- if (Value *V = SimplifyGEPInst(&Ops[0], Ops.size(), TD))
+ if (Value *V = SimplifyGEPInst(Ops, TD))
return ReplaceInstUsesWith(GEP, V);
Value *PtrOp = GEP.getOperand(0);
@@ -751,13 +808,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// by multiples of a zero size type with zero.
if (TD) {
bool MadeChange = false;
- const Type *IntPtrTy = TD->getIntPtrType(GEP.getContext());
+ Type *IntPtrTy = TD->getIntPtrType(GEP.getContext());
gep_type_iterator GTI = gep_type_begin(GEP);
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end();
I != E; ++I, ++GTI) {
// Skip indices into struct types.
- const SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI);
+ SequentialType *SeqTy = dyn_cast<SequentialType>(*GTI);
if (!SeqTy) continue;
// If the element type has zero size then any index over it is equivalent
@@ -785,21 +842,15 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// getelementptr instructions into a single instruction.
//
if (GEPOperator *Src = dyn_cast<GEPOperator>(PtrOp)) {
-
- // If this GEP has only 0 indices, it is the same pointer as
- // Src. If Src is not a trivial GEP too, don't combine
- // the indices.
- if (GEP.hasAllZeroIndices() && !Src->hasAllZeroIndices() &&
- !Src->hasOneUse())
+ if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
return 0;
// Note that if our source is a gep chain itself then we wait for that
// chain to be resolved before we perform this transformation. This
// avoids us creating a TON of code in some cases.
- //
- if (GetElementPtrInst *SrcGEP =
- dyn_cast<GetElementPtrInst>(Src->getOperand(0)))
- if (SrcGEP->getNumOperands() == 2)
+ if (GEPOperator *SrcGEP =
+ dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
return 0; // Wait until our source is folded to completion.
SmallVector<Value*, 8> Indices;
@@ -851,15 +902,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (!Indices.empty())
return (GEP.isInBounds() && Src->isInBounds()) ?
- GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices.begin(),
- Indices.end(), GEP.getName()) :
- GetElementPtrInst::Create(Src->getOperand(0), Indices.begin(),
- Indices.end(), GEP.getName());
+ GetElementPtrInst::CreateInBounds(Src->getOperand(0), Indices,
+ GEP.getName()) :
+ GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName());
}
// Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
Value *StrippedPtr = PtrOp->stripPointerCasts();
- const PointerType *StrippedPtrTy =cast<PointerType>(StrippedPtr->getType());
+ PointerType *StrippedPtrTy =cast<PointerType>(StrippedPtr->getType());
if (StrippedPtr != PtrOp &&
StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
@@ -875,21 +925,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
//
// This occurs when the program declares an array extern like "int X[];"
if (HasZeroPointerIndex) {
- const PointerType *CPTy = cast<PointerType>(PtrOp->getType());
- if (const ArrayType *CATy =
+ PointerType *CPTy = cast<PointerType>(PtrOp->getType());
+ if (ArrayType *CATy =
dyn_cast<ArrayType>(CPTy->getElementType())) {
// GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == StrippedPtrTy->getElementType()) {
// -> GEP i8* X, ...
SmallVector<Value*, 8> Idx(GEP.idx_begin()+1, GEP.idx_end());
GetElementPtrInst *Res =
- GetElementPtrInst::Create(StrippedPtr, Idx.begin(),
- Idx.end(), GEP.getName());
+ GetElementPtrInst::Create(StrippedPtr, Idx, GEP.getName());
Res->setIsInBounds(GEP.isInBounds());
return Res;
}
- if (const ArrayType *XATy =
+ if (ArrayType *XATy =
dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){
// GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
if (CATy->getElementType() == XATy->getElementType()) {
@@ -907,8 +956,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Transform things like:
// %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
// into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
- const Type *SrcElTy = StrippedPtrTy->getElementType();
- const Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+ Type *SrcElTy = StrippedPtrTy->getElementType();
+ Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
if (TD && SrcElTy->isArrayTy() &&
TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
TD->getTypeAllocSize(ResElTy)) {
@@ -916,8 +965,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
Idx[1] = GEP.getOperand(1);
Value *NewGEP = GEP.isInBounds() ?
- Builder->CreateInBoundsGEP(StrippedPtr, Idx, Idx + 2, GEP.getName()) :
- Builder->CreateGEP(StrippedPtr, Idx, Idx + 2, GEP.getName());
+ Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) :
+ Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
// V and GEP are both pointer types --> BitCast
return new BitCastInst(NewGEP, GEP.getType());
}
@@ -975,8 +1024,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
Idx[1] = NewIdx;
Value *NewGEP = GEP.isInBounds() ?
- Builder->CreateInBoundsGEP(StrippedPtr, Idx, Idx + 2,GEP.getName()):
- Builder->CreateGEP(StrippedPtr, Idx, Idx + 2, GEP.getName());
+ Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()):
+ Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
// The NewGEP must be pointer typed, so must the old one -> BitCast
return new BitCastInst(NewGEP, GEP.getType());
}
@@ -1023,14 +1072,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// field at Offset in 'A's type. If so, we can pull the cast through the
// GEP.
SmallVector<Value*, 8> NewIndices;
- const Type *InTy =
+ Type *InTy =
cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
if (FindElementAtOffset(InTy, Offset, NewIndices)) {
Value *NGEP = GEP.isInBounds() ?
- Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices.begin(),
- NewIndices.end()) :
- Builder->CreateGEP(BCI->getOperand(0), NewIndices.begin(),
- NewIndices.end());
+ Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) :
+ Builder->CreateGEP(BCI->getOperand(0), NewIndices);
if (NGEP->getType() == GEP.getType())
return ReplaceInstUsesWith(GEP, NGEP);
@@ -1045,15 +1092,43 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
-static bool IsOnlyNullComparedAndFreed(const Value &V) {
- for (Value::const_use_iterator UI = V.use_begin(), UE = V.use_end();
+static bool IsOnlyNullComparedAndFreed(Value *V, SmallVectorImpl<WeakVH> &Users,
+ int Depth = 0) {
+ if (Depth == 8)
+ return false;
+
+ for (Value::use_iterator UI = V->use_begin(), UE = V->use_end();
UI != UE; ++UI) {
- const User *U = *UI;
- if (isFreeCall(U))
+ User *U = *UI;
+ if (isFreeCall(U)) {
+ Users.push_back(U);
continue;
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(U))
- if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1)))
+ }
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) {
+ if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) {
+ Users.push_back(ICI);
+ continue;
+ }
+ }
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ if (IsOnlyNullComparedAndFreed(BCI, Users, Depth+1)) {
+ Users.push_back(BCI);
+ continue;
+ }
+ }
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (IsOnlyNullComparedAndFreed(GEPI, Users, Depth+1)) {
+ Users.push_back(GEPI);
+ continue;
+ }
+ }
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ Users.push_back(II);
continue;
+ }
+ }
return false;
}
return true;
@@ -1063,25 +1138,20 @@ Instruction *InstCombiner::visitMalloc(Instruction &MI) {
// If we have a malloc call which is only used in any amount of comparisons
// to null and free calls, delete the calls and replace the comparisons with
// true or false as appropriate.
- if (IsOnlyNullComparedAndFreed(MI)) {
- for (Value::use_iterator UI = MI.use_begin(), UE = MI.use_end();
- UI != UE;) {
- // We can assume that every remaining use is a free call or an icmp eq/ne
- // to null, so the cast is safe.
- Instruction *I = cast<Instruction>(*UI);
-
- // Early increment here, as we're about to get rid of the user.
- ++UI;
-
- if (isFreeCall(I)) {
- EraseInstFromFunction(*cast<CallInst>(I));
- continue;
+ SmallVector<WeakVH, 64> Users;
+ if (IsOnlyNullComparedAndFreed(&MI, Users)) {
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ Instruction *I = cast_or_null<Instruction>(&*Users[i]);
+ if (!I) continue;
+
+ if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
+ ReplaceInstUsesWith(*C,
+ ConstantInt::get(Type::getInt1Ty(C->getContext()),
+ C->isFalseWhenEqual()));
+ } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+ ReplaceInstUsesWith(*I, UndefValue::get(I->getType()));
}
- // Again, the cast is safe.
- ICmpInst *C = cast<ICmpInst>(I);
- ReplaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()),
- C->isFalseWhenEqual()));
- EraseInstFromFunction(*C);
+ EraseInstFromFunction(*I);
}
return EraseInstFromFunction(MI);
}
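At the source level, the pattern removed here looks roughly like the following (a hedged C++ sketch, not drawn from the LLVM test suite):

    #include <cstdlib>

    // The allocation's only uses are a null compare and free(), possibly
    // through bitcasts/GEPs or lifetime markers: instcombine may delete the
    // malloc, fold the compare to its constant answer, and erase the free.
    void demo() {
      void *P = std::malloc(64);
      if (P != 0)     // folds to 'true' once the allocation is gone
        std::free(P); // deleted together with the malloc
    }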
@@ -1120,8 +1190,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
!isa<Constant>(X)) {
// Swap Destinations and condition...
BI.setCondition(X);
- BI.setSuccessor(0, FalseDest);
- BI.setSuccessor(1, TrueDest);
+ BI.swapSuccessors();
return &BI;
}
@@ -1136,8 +1205,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
Cond->setPredicate(FCmpInst::getInversePredicate(FPred));
// Swap Destinations and condition.
- BI.setSuccessor(0, FalseDest);
- BI.setSuccessor(1, TrueDest);
+ BI.swapSuccessors();
Worklist.Add(Cond);
return &BI;
}
@@ -1153,8 +1221,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
ICmpInst *Cond = cast<ICmpInst>(BI.getCondition());
Cond->setPredicate(ICmpInst::getInversePredicate(IPred));
// Swap Destinations and condition.
- BI.setSuccessor(0, FalseDest);
- BI.setSuccessor(1, TrueDest);
+ BI.swapSuccessors();
Worklist.Add(Cond);
return &BI;
}
@@ -1168,11 +1235,17 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
if (I->getOpcode() == Instruction::Add)
if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
// change 'switch (X+4) case 1:' into 'switch (X) case -3'
- for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2)
- SI.setOperand(i,
- ConstantExpr::getSub(cast<Constant>(SI.getOperand(i)),
- AddRHS));
- SI.setOperand(0, I->getOperand(0));
+ unsigned NumCases = SI.getNumCases();
+ // Skip the first item since that's the default case.
+ for (unsigned i = 1; i < NumCases; ++i) {
+ ConstantInt* CaseVal = SI.getCaseValue(i);
+ Constant* NewCaseVal = ConstantExpr::getSub(cast<Constant>(CaseVal),
+ AddRHS);
+ assert(isa<ConstantInt>(NewCaseVal) &&
+ "Result of expression should be constant");
+ SI.setSuccessorValue(i, cast<ConstantInt>(NewCaseVal));
+ }
+ SI.setCondition(I->getOperand(0));
Worklist.Add(I);
return &SI;
}
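The case rewrite is sound because switch comparisons happen modulo 2^N; a quick check of the "switch (X+4) case 1 -> switch (X) case -3" example from the comment (hedged sketch; 32-bit unsigned wraparound assumed):

    #include <cassert>
    #include <cstdint>

    static int before(uint32_t X) {
      switch (X + 4) { case 1: return 1; default: return 0; }
    }
    static int after(uint32_t X) {
      // 1u - 4u wraps to 0xFFFFFFFD, i.e. -3 as a 32-bit case value.
      switch (X) { case 1u - 4u: return 1; default: return 0; }
    }

    int main() {
      const uint32_t Tests[] = { 0u, 7u, 0xFFFFFFFDu };
      for (unsigned i = 0; i != 3; ++i)
        assert(before(Tests[i]) == after(Tests[i]));
      return 0;
    }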
@@ -1242,7 +1315,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
Value *NewEV = Builder->CreateExtractValue(IV->getAggregateOperand(),
EV.getIndices());
return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
- ArrayRef<unsigned>(insi, inse));
+ makeArrayRef(insi, inse));
}
if (insi == inse)
// The insert list is a prefix of the extract list
@@ -1254,7 +1327,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
// with
// %E extractvalue { i32 } { i32 42 }, 0
return ExtractValueInst::Create(IV->getInsertedValueOperand(),
- ArrayRef<unsigned>(exti, exte));
+ makeArrayRef(exti, exte));
}
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) {
// We're extracting from an intrinsic, see if we're the only user, which
@@ -1310,7 +1383,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
// load from a GEP. This reduces the size of the load.
// FIXME: If a load is used only by extractvalue instructions then this
// could be done regardless of having multiple uses.
- if (!L->isVolatile() && L->hasOneUse()) {
+ if (L->isSimple() && L->hasOneUse()) {
// extractvalue has integer indices, getelementptr has Value*s. Convert.
SmallVector<Value*, 4> Indices;
// Prefix an i32 0 since we need the first element.
@@ -1322,8 +1395,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
// We need to insert these at the location of the old load, not at that of
// the extractvalue.
Builder->SetInsertPoint(L->getParent(), L);
- Value *GEP = Builder->CreateInBoundsGEP(L->getPointerOperand(),
- Indices.begin(), Indices.end());
+ Value *GEP = Builder->CreateInBoundsGEP(L->getPointerOperand(), Indices);
// Returning the load directly will cause the main loop to insert it in
// the wrong spot, so use ReplaceInstUsesWith().
return ReplaceInstUsesWith(EV, Builder->CreateLoad(GEP));
@@ -1339,6 +1411,342 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) {
return 0;
}
+enum Personality_Type {
+ Unknown_Personality,
+ GNU_Ada_Personality,
+ GNU_CXX_Personality
+};
+
+/// RecognizePersonality - See if the given exception handling personality
+/// function is one that we understand. If so, return a description of it;
+/// otherwise return Unknown_Personality.
+static Personality_Type RecognizePersonality(Value *Pers) {
+ Function *F = dyn_cast<Function>(Pers->stripPointerCasts());
+ if (!F)
+ return Unknown_Personality;
+ return StringSwitch<Personality_Type>(F->getName())
+ .Case("__gnat_eh_personality", GNU_Ada_Personality)
+ .Case("__gxx_personality_v0", GNU_CXX_Personality)
+ .Default(Unknown_Personality);
+}
+
+/// isCatchAll - Return 'true' if the given typeinfo will match anything.
+static bool isCatchAll(Personality_Type Personality, Constant *TypeInfo) {
+ switch (Personality) {
+ case Unknown_Personality:
+ return false;
+ case GNU_Ada_Personality:
+ // While __gnat_all_others_value will match any Ada exception, it doesn't
+ // match foreign exceptions (or didn't, before gcc-4.7).
+ return false;
+ case GNU_CXX_Personality:
+ return TypeInfo->isNullValue();
+ }
+ llvm_unreachable("Unknown personality!");
+}
+
+static bool shorter_filter(const Value *LHS, const Value *RHS) {
+ return
+ cast<ArrayType>(LHS->getType())->getNumElements()
+ <
+ cast<ArrayType>(RHS->getType())->getNumElements();
+}
+
+Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
+ // The logic here should be correct for any real-world personality function.
+ // However if that turns out not to be true, the offending logic can always
+ // be conditioned on the personality function, like the catch-all logic is.
+ Personality_Type Personality = RecognizePersonality(LI.getPersonalityFn());
+
+ // Simplify the list of clauses, eg by removing repeated catch clauses
+ // (these are often created by inlining).
+ bool MakeNewInstruction = false; // If true, recreate using the following:
+ SmallVector<Value *, 16> NewClauses; // - Clauses for the new instruction;
+ bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
+
+ SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
+ for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
+ bool isLastClause = i + 1 == e;
+ if (LI.isCatch(i)) {
+ // A catch clause.
+ Value *CatchClause = LI.getClause(i);
+ Constant *TypeInfo = cast<Constant>(CatchClause->stripPointerCasts());
+
+ // If we already saw this clause, there is no point in having a second
+ // copy of it.
+ if (AlreadyCaught.insert(TypeInfo)) {
+ // This catch clause was not already seen.
+ NewClauses.push_back(CatchClause);
+ } else {
+ // Repeated catch clause - drop the redundant copy.
+ MakeNewInstruction = true;
+ }
+
+ // If this is a catch-all then there is no point in keeping any following
+ // clauses or marking the landingpad as having a cleanup.
+ if (isCatchAll(Personality, TypeInfo)) {
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+ } else {
+ // A filter clause. If any of the filter elements were already caught
+ // then they can be dropped from the filter. It is tempting to try to
+ // exploit the filter further by saying that any typeinfo that does not
+ // occur in the filter can't be caught later (and thus can be dropped).
+ // However this would be wrong, since typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some
+ // class derived from it).
+ assert(LI.isFilter(i) && "Unsupported landingpad clause!");
+ Value *FilterClause = LI.getClause(i);
+ ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
+ unsigned NumTypeInfos = FilterType->getNumElements();
+
+ // An empty filter catches everything, so there is no point in keeping any
+ // following clauses or marking the landingpad as having a cleanup. By
+ // dealing with this case here the following code is made a bit simpler.
+ if (!NumTypeInfos) {
+ NewClauses.push_back(FilterClause);
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+
+ bool MakeNewFilter = false; // If true, make a new filter.
+ SmallVector<Constant *, 16> NewFilterElts; // New elements.
+ if (isa<ConstantAggregateZero>(FilterClause)) {
+ // Not an empty filter - it contains at least one null typeinfo.
+ assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
+ Constant *TypeInfo =
+ Constant::getNullValue(FilterType->getElementType());
+ // If this typeinfo is a catch-all then the filter can never match.
+ if (isCatchAll(Personality, TypeInfo)) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // There is no point in having multiple copies of this typeinfo, so
+ // discard all but the first copy if there is more than one.
+ NewFilterElts.push_back(TypeInfo);
+ if (NumTypeInfos > 1)
+ MakeNewFilter = true;
+ } else {
+ ConstantArray *Filter = cast<ConstantArray>(FilterClause);
+ SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
+ NewFilterElts.reserve(NumTypeInfos);
+
+ // Remove any filter elements that were already caught or that already
+ // occurred in the filter. While there, see if any of the elements are
+ // catch-alls. If so, the filter can be discarded.
+ bool SawCatchAll = false;
+ for (unsigned j = 0; j != NumTypeInfos; ++j) {
+ Value *Elt = Filter->getOperand(j);
+ Constant *TypeInfo = cast<Constant>(Elt->stripPointerCasts());
+ if (isCatchAll(Personality, TypeInfo)) {
+ // This element is a catch-all. Bail out, noting this fact.
+ SawCatchAll = true;
+ break;
+ }
+ if (AlreadyCaught.count(TypeInfo))
+ // Already caught by an earlier clause, so having it in the filter
+ // is pointless.
+ continue;
+ // There is no point in having multiple copies of the same typeinfo in
+ // a filter, so only add it if we didn't already.
+ if (SeenInFilter.insert(TypeInfo))
+ NewFilterElts.push_back(cast<Constant>(Elt));
+ }
+ // A filter containing a catch-all cannot match anything by definition.
+ if (SawCatchAll) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // If we dropped something from the filter, make a new one.
+ if (NewFilterElts.size() < NumTypeInfos)
+ MakeNewFilter = true;
+ }
+ if (MakeNewFilter) {
+ FilterType = ArrayType::get(FilterType->getElementType(),
+ NewFilterElts.size());
+ FilterClause = ConstantArray::get(FilterType, NewFilterElts);
+ MakeNewInstruction = true;
+ }
+
+ NewClauses.push_back(FilterClause);
+
+ // If the new filter is empty then it will catch everything so there is
+ // no point in keeping any following clauses or marking the landingpad
+ // as having a cleanup. The case of the original filter being empty was
+ // already handled above.
+ if (MakeNewFilter && !NewFilterElts.size()) {
+ assert(MakeNewInstruction && "New filter but not a new instruction!");
+ CleanupFlag = false;
+ break;
+ }
+ }
+ }
+
+ // If several filters occur in a row then reorder them so that the shortest
+ // filters come first (those with the smallest number of elements). This is
+ // advantageous because shorter filters are more likely to match, speeding up
+ // unwinding, but mostly because it increases the effectiveness of the other
+ // filter optimizations below.
+ for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
+ unsigned j;
+ // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
+ for (j = i; j != e; ++j)
+ if (!isa<ArrayType>(NewClauses[j]->getType()))
+ break;
+
+ // Check whether the filters are already sorted by length. We need to know
+ // if sorting them is actually going to do anything so that we only make a
+ // new landingpad instruction if it does.
+ for (unsigned k = i; k + 1 < j; ++k)
+ if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
+ // Not sorted, so sort the filters now. Doing an unstable sort would be
+ // correct too but reordering filters pointlessly might confuse users.
+ std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
+ shorter_filter);
+ MakeNewInstruction = true;
+ break;
+ }
+
+ // Look for the next batch of filters.
+ i = j + 1;
+ }
+
+ // If typeinfos matched if and only if equal, then the elements of a filter L
+ // that occurs later than a filter F could be replaced by the intersection of
+ // the elements of F and L. In reality two typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some class
+ // derived from it) so it would be wrong to perform this transform in general.
+ // However the transform is correct and useful if F is a subset of L. In that
+ // case L can be replaced by F, and thus removed altogether since repeating a
+ // filter is pointless. So here we look at all pairs of filters F and L where
+ // L follows F in the list of clauses, and remove L if every element of F is
+ // an element of L. This can occur when inlining C++ functions with exception
+ // specifications.
+ for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
+ // Examine each filter in turn.
+ Value *Filter = NewClauses[i];
+ ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
+ if (!FTy)
+ // Not a filter - skip it.
+ continue;
+ unsigned FElts = FTy->getNumElements();
+ // Examine each filter following this one. Doing this backwards means that
+ // we don't have to worry about filters disappearing under us when removed.
+ for (unsigned j = NewClauses.size() - 1; j != i; --j) {
+ Value *LFilter = NewClauses[j];
+ ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
+ if (!LTy)
+ // Not a filter - skip it.
+ continue;
+ // If Filter is a subset of LFilter, i.e. every element of Filter is also
+ // an element of LFilter, then discard LFilter.
+ SmallVector<Value *, 16>::iterator J = NewClauses.begin() + j;
+ // If Filter is empty then it is a subset of LFilter.
+ if (!FElts) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ // Move on to the next filter.
+ continue;
+ }
+ unsigned LElts = LTy->getNumElements();
+ // If Filter is longer than LFilter then it cannot be a subset of it.
+ if (FElts > LElts)
+ // Move on to the next filter.
+ continue;
+ // At this point we know that LFilter has at least one element.
+ if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
+ // Filter is a subset of LFilter iff Filter contains only zeros (as we
+ // already know that Filter is not longer than LFilter).
+ if (isa<ConstantAggregateZero>(Filter)) {
+ assert(FElts <= LElts && "Should have handled this case earlier!");
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ ConstantArray *LArray = cast<ConstantArray>(LFilter);
+ if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
+ // Since Filter is non-empty and contains only zeros, it is a subset of
+ // LFilter iff LFilter contains a zero.
+ assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
+ for (unsigned l = 0; l != LElts; ++l)
+ if (LArray->getOperand(l)->isNullValue()) {
+ // LFilter contains a zero - discard it.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ break;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ // At this point we know that both filters are ConstantArrays. Loop over
+ // operands to see whether every element of Filter is also an element of
+ // LFilter. Since filters tend to be short this is probably faster than
+ // using a method that scales nicely.
+ ConstantArray *FArray = cast<ConstantArray>(Filter);
+ bool AllFound = true;
+ for (unsigned f = 0; f != FElts; ++f) {
+ Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
+ AllFound = false;
+ for (unsigned l = 0; l != LElts; ++l) {
+ Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
+ if (LTypeInfo == FTypeInfo) {
+ AllFound = true;
+ break;
+ }
+ }
+ if (!AllFound)
+ break;
+ }
+ if (AllFound) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ }
+ }
+
+ // If we changed any of the clauses, replace the old landingpad instruction
+ // with a new one.
+ if (MakeNewInstruction) {
+ LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
+ LI.getPersonalityFn(),
+ NewClauses.size());
+ for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
+ NLI->addClause(NewClauses[i]);
+ // A landing pad with no clauses must have the cleanup flag set. It is
+ // theoretically possible, though highly unlikely, that we eliminated all
+ // clauses. If so, force the cleanup flag to true.
+ if (NewClauses.empty())
+ CleanupFlag = true;
+ NLI->setCleanup(CleanupFlag);
+ return NLI;
+ }
+
+ // Even if none of the clauses changed, we may nonetheless have understood
+ // that the cleanup flag is pointless. Clear it if so.
+ if (LI.isCleanup() != CleanupFlag) {
+ assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
+ LI.setCleanup(CleanupFlag);
+ return &LI;
+ }
+
+ return 0;
+}
+
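The subset rule behind that last pass over the clauses can be stated without LLVM types; a minimal C++ sketch (names are illustrative, and the pointers stand in for the stripped typeinfo operands):

    #include <cstddef>
    #include <vector>

    // An earlier filter F makes a later filter L redundant when every
    // element of F also occurs in L: at that point L behaves as F would,
    // and repeating a filter is pointless, so L can be erased.
    static bool filterSubsumes(const std::vector<const void *> &F,
                               const std::vector<const void *> &L) {
      for (size_t i = 0, e = F.size(); i != e; ++i) {
        bool Found = false;
        for (size_t j = 0, f = L.size(); j != f; ++j)
          if (L[j] == F[i]) { Found = true; break; }
        if (!Found)
          return false;
      }
      return true;
    }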
@@ -1350,7 +1758,8 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
assert(I->hasOneUse() && "Invariants didn't hold!");
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
- if (isa<PHINode>(I) || I->mayHaveSideEffects() || isa<TerminatorInst>(I))
+ if (isa<PHINode>(I) || isa<LandingPadInst>(I) || I->mayHaveSideEffects() ||
+ isa<TerminatorInst>(I))
return false;
// Do not sink alloca instructions out of the entry block.
@@ -1367,8 +1776,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
return false;
}
- BasicBlock::iterator InsertPos = DestBlock->getFirstNonPHI();
-
+ BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
I->moveBefore(InsertPos);
++NumSunkInst;
return true;
@@ -1503,27 +1911,29 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
// Do a quick scan over the function. If we find any blocks that are
// unreachable, remove any instructions inside of them. This prevents
// the instcombine code from having to deal with some bad special cases.
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
- if (!Visited.count(BB)) {
- Instruction *Term = BB->getTerminator();
- while (Term != BB->begin()) { // Remove instrs bottom-up
- BasicBlock::iterator I = Term; --I;
-
- DEBUG(errs() << "IC: DCE: " << *I << '\n');
- // A debug intrinsic shouldn't force another iteration if we weren't
- // going to do one without it.
- if (!isa<DbgInfoIntrinsic>(I)) {
- ++NumDeadInst;
- MadeIRChange = true;
- }
-
- // If I is not void type then replaceAllUsesWith undef.
- // This allows ValueHandlers and custom metadata to adjust itself.
- if (!I->getType()->isVoidTy())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- I->eraseFromParent();
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (Visited.count(BB)) continue;
+
+ // Delete the instructions backwards; this reduces how many def-use and
+ // use-def chains have to be updated along the way.
+ Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+ while (EndInst != BB->begin()) {
+ // Delete the next to last instruction.
+ BasicBlock::iterator I = EndInst;
+ Instruction *Inst = --I;
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ if (isa<LandingPadInst>(Inst)) {
+ EndInst = Inst;
+ continue;
}
+ if (!isa<DbgInfoIntrinsic>(Inst)) {
+ ++NumDeadInst;
+ MadeIRChange = true;
+ }
+ Inst->eraseFromParent();
}
+ }
}
while (!Worklist.isEmpty()) {
@@ -1604,13 +2014,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
// Everything uses the new instruction now.
I->replaceAllUsesWith(Result);
+ // Move the name to the new instruction first.
+ Result->takeName(I);
+
// Push the new instruction and any users onto the worklist.
Worklist.Add(Result);
Worklist.AddUsersToWorkList(*Result);
- // Move the name to the new instruction first.
- Result->takeName(I);
-
// Insert the new instruction into the basic block...
BasicBlock *InstParent = I->getParent();
BasicBlock::iterator InsertPos = I;
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 5700ac8..7b3a927a 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -6,3 +6,10 @@ add_llvm_library(LLVMInstrumentation
PathProfiling.cpp
ProfilingUtils.cpp
)
+
+add_llvm_library_dependencies(LLVMInstrumentation
+ LLVMAnalysis
+ LLVMCore
+ LLVMSupport
+ LLVMTransformUtils
+ )
diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
index 1d31fcc..e8ef265 100644
--- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp
+++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
@@ -74,7 +74,7 @@ bool EdgeProfiler::runOnModule(Module &M) {
}
}
- const Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges);
+ Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges);
GlobalVariable *Counters =
new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
Constant::getNullValue(ATy), "EdgeProfCounters");
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 3f2c412..ccf7e11 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -60,11 +60,11 @@ namespace {
bool runOnModule(Module &M);
// Create the GCNO files for the Module based on DebugInfo.
- void emitGCNO(DebugInfoFinder &DIF);
+ void emitGCNO();
// Modify the program to track transitions along edges and call into the
// profiling runtime to emit .gcda files when run.
- bool emitProfileArcs(DebugInfoFinder &DIF);
+ bool emitProfileArcs();
// Get pointers to the functions in the runtime library.
Constant *getStartFileFunc();
@@ -86,8 +86,7 @@ namespace {
// Add the function to write out all our counters to the global destructor
// list.
- void insertCounterWriteout(DebugInfoFinder &,
- SmallVector<std::pair<GlobalVariable *,
+ void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *,
MDNode *>, 8> &);
std::string mangleName(DICompileUnit CU, std::string NewStem);
@@ -110,15 +109,6 @@ ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData,
return new GCOVProfiler(EmitNotes, EmitData, Use402Format);
}
-static DISubprogram findSubprogram(DIScope Scope) {
- while (!Scope.isSubprogram()) {
- assert(Scope.isLexicalBlock() &&
- "Debug location not lexical block or subprogram");
- Scope = DILexicalBlock(Scope).getContext();
- }
- return DISubprogram(Scope);
-}
-
namespace {
class GCOVRecord {
protected:
@@ -177,18 +167,24 @@ namespace {
}
uint32_t length() {
+ // Here 2 = 1 for string length + 1 for '0' id#.
return lengthOfGCOVString(Filename) + 2 + Lines.size();
}
- private:
- friend class GCOVBlock;
+ void writeOut() {
+ write(0);
+ writeGCOVString(Filename);
+ for (int i = 0, e = Lines.size(); i != e; ++i)
+ write(Lines[i]);
+ }
- GCOVLines(std::string Filename, raw_ostream *os)
- : Filename(Filename) {
+ GCOVLines(StringRef F, raw_ostream *os)
+ : Filename(F) {
this->os = os;
}
- std::string Filename;
+ private:
+ StringRef Filename;
SmallVector<uint32_t, 32> Lines;
};
@@ -197,7 +193,7 @@ namespace {
// other blocks.
class GCOVBlock : public GCOVRecord {
public:
- GCOVLines &getFile(std::string Filename) {
+ GCOVLines &getFile(StringRef Filename) {
GCOVLines *&Lines = LinesByFile[Filename];
if (!Lines) {
Lines = new GCOVLines(Filename, os);
@@ -220,13 +216,8 @@ namespace {
write(Len);
write(Number);
for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
- E = LinesByFile.end(); I != E; ++I) {
- write(0);
- writeGCOVString(I->second->Filename);
- for (int i = 0, e = I->second->Lines.size(); i != e; ++i) {
- write(I->second->Lines[i]);
- }
- }
+ E = LinesByFile.end(); I != E; ++I)
+ I->second->writeOut();
write(0);
write(0);
}
@@ -353,66 +344,66 @@ bool GCOVProfiler::runOnModule(Module &M) {
this->M = &M;
Ctx = &M.getContext();
- DebugInfoFinder DIF;
- DIF.processModule(M);
-
- if (EmitNotes) emitGCNO(DIF);
- if (EmitData) return emitProfileArcs(DIF);
+ if (EmitNotes) emitGCNO();
+ if (EmitData) return emitProfileArcs();
return false;
}
-void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) {
+void GCOVProfiler::emitGCNO() {
DenseMap<const MDNode *, raw_fd_ostream *> GcnoFiles;
- for (DebugInfoFinder::iterator I = DIF.compile_unit_begin(),
- E = DIF.compile_unit_end(); I != E; ++I) {
- // Each compile unit gets its own .gcno file. This means that whether we run
- // this pass over the original .o's as they're produced, or run it after
- // LTO, we'll generate the same .gcno files.
-
- DICompileUnit CU(*I);
- raw_fd_ostream *&out = GcnoFiles[CU];
- std::string ErrorInfo;
- out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo,
- raw_fd_ostream::F_Binary);
- if (!Use402Format)
- out->write("oncg*404MVLL", 12);
- else
- out->write("oncg*402MVLL", 12);
- }
-
- for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(),
- SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) {
- DISubprogram SP(*SPI);
- raw_fd_ostream *&os = GcnoFiles[SP.getCompileUnit()];
-
- Function *F = SP.getFunction();
- if (!F) continue;
- GCOVFunction Func(SP, os, Use402Format);
-
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- GCOVBlock &Block = Func.getBlock(BB);
- TerminatorInst *TI = BB->getTerminator();
- if (int successors = TI->getNumSuccessors()) {
- for (int i = 0; i != successors; ++i) {
- Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
+ NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (CU_Nodes) {
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ // Each compile unit gets its own .gcno file. This means that whether we run
+ // this pass over the original .o's as they're produced, or run it after
+ // LTO, we'll generate the same .gcno files.
+
+ DICompileUnit CU(CU_Nodes->getOperand(i));
+ raw_fd_ostream *&out = GcnoFiles[CU];
+ std::string ErrorInfo;
+ out = new raw_fd_ostream(mangleName(CU, "gcno").c_str(), ErrorInfo,
+ raw_fd_ostream::F_Binary);
+ if (!Use402Format)
+ out->write("oncg*404MVLL", 12);
+ else
+ out->write("oncg*204MVLL", 12);
+
+ DIArray SPs = CU.getSubprograms();
+ for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+ DISubprogram SP(SPs.getElement(i));
+ if (!SP.Verify()) continue;
+ raw_fd_ostream *&os = GcnoFiles[CU];
+
+ Function *F = SP.getFunction();
+ if (!F) continue;
+ GCOVFunction Func(SP, os, Use402Format);
+
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ GCOVBlock &Block = Func.getBlock(BB);
+ TerminatorInst *TI = BB->getTerminator();
+ if (int successors = TI->getNumSuccessors()) {
+ for (int i = 0; i != successors; ++i) {
+ Block.addEdge(Func.getBlock(TI->getSuccessor(i)));
+ }
+ } else if (isa<ReturnInst>(TI)) {
+ Block.addEdge(Func.getReturnBlock());
+ }
+
+ uint32_t Line = 0;
+ for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+ const DebugLoc &Loc = I->getDebugLoc();
+ if (Loc.isUnknown()) continue;
+ if (Line == Loc.getLine()) continue;
+ Line = Loc.getLine();
+ if (SP != getDISubprogram(Loc.getScope(*Ctx))) continue;
+
+ GCOVLines &Lines = Block.getFile(SP.getFilename());
+ Lines.addLine(Loc.getLine());
+ }
}
- } else if (isa<ReturnInst>(TI)) {
- Block.addEdge(Func.getReturnBlock());
- }
-
- uint32_t Line = 0;
- for (BasicBlock::iterator I = BB->begin(), IE = BB->end(); I != IE; ++I) {
- const DebugLoc &Loc = I->getDebugLoc();
- if (Loc.isUnknown()) continue;
- if (Line == Loc.getLine()) continue;
- Line = Loc.getLine();
- if (SP != findSubprogram(DIScope(Loc.getScope(*Ctx)))) continue;
-
- GCOVLines &Lines = Block.getFile(SP.getFilename());
- Lines.addLine(Loc.getLine());
+ Func.writeOut();
}
}
- Func.writeOut();
}
for (DenseMap<const MDNode *, raw_fd_ostream *>::iterator
@@ -424,103 +415,107 @@ void GCOVProfiler::emitGCNO(DebugInfoFinder &DIF) {
}
}
-bool GCOVProfiler::emitProfileArcs(DebugInfoFinder &DIF) {
- if (DIF.subprogram_begin() == DIF.subprogram_end())
- return false;
-
- SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
- for (DebugInfoFinder::iterator SPI = DIF.subprogram_begin(),
- SPE = DIF.subprogram_end(); SPI != SPE; ++SPI) {
- DISubprogram SP(*SPI);
- Function *F = SP.getFunction();
- if (!F) continue;
-
- unsigned Edges = 0;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
- if (isa<ReturnInst>(TI))
- ++Edges;
- else
- Edges += TI->getNumSuccessors();
- }
-
- const ArrayType *CounterTy =
+bool GCOVProfiler::emitProfileArcs() {
+ NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (!CU_Nodes) return false;
+
+ bool Result = false;
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ DICompileUnit CU(CU_Nodes->getOperand(i));
+ DIArray SPs = CU.getSubprograms();
+ SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
+ for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+ DISubprogram SP(SPs.getElement(i));
+ if (!SP.Verify()) continue;
+ Function *F = SP.getFunction();
+ if (!F) continue;
+ if (!Result) Result = true;
+ unsigned Edges = 0;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ if (isa<ReturnInst>(TI))
+ ++Edges;
+ else
+ Edges += TI->getNumSuccessors();
+ }
+
+ ArrayType *CounterTy =
ArrayType::get(Type::getInt64Ty(*Ctx), Edges);
- GlobalVariable *Counters =
+ GlobalVariable *Counters =
new GlobalVariable(*M, CounterTy, false,
GlobalValue::InternalLinkage,
Constant::getNullValue(CounterTy),
"__llvm_gcov_ctr", 0, false, 0);
- CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP));
-
- UniqueVector<BasicBlock *> ComplexEdgePreds;
- UniqueVector<BasicBlock *> ComplexEdgeSuccs;
-
- unsigned Edge = 0;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- TerminatorInst *TI = BB->getTerminator();
- int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
- if (Successors) {
- IRBuilder<> Builder(TI);
-
- if (Successors == 1) {
- Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
- Edge);
- Value *Count = Builder.CreateLoad(Counter);
- Count = Builder.CreateAdd(Count,
- ConstantInt::get(Type::getInt64Ty(*Ctx),1));
- Builder.CreateStore(Count, Counter);
- } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- Value *Sel = Builder.CreateSelect(
+ CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP));
+
+ UniqueVector<BasicBlock *> ComplexEdgePreds;
+ UniqueVector<BasicBlock *> ComplexEdgeSuccs;
+
+ unsigned Edge = 0;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
+ if (Successors) {
+ IRBuilder<> Builder(TI);
+
+ if (Successors == 1) {
+ Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
+ Edge);
+ Value *Count = Builder.CreateLoad(Counter);
+ Count = Builder.CreateAdd(Count,
+ ConstantInt::get(Type::getInt64Ty(*Ctx),1));
+ Builder.CreateStore(Count, Counter);
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ Value *Sel = Builder.CreateSelect(
BI->getCondition(),
ConstantInt::get(Type::getInt64Ty(*Ctx), Edge),
ConstantInt::get(Type::getInt64Ty(*Ctx), Edge + 1));
- SmallVector<Value *, 2> Idx;
- Idx.push_back(Constant::getNullValue(Type::getInt64Ty(*Ctx)));
- Idx.push_back(Sel);
- Value *Counter = Builder.CreateInBoundsGEP(Counters,
- Idx.begin(), Idx.end());
- Value *Count = Builder.CreateLoad(Counter);
- Count = Builder.CreateAdd(Count,
- ConstantInt::get(Type::getInt64Ty(*Ctx),1));
- Builder.CreateStore(Count, Counter);
- } else {
- ComplexEdgePreds.insert(BB);
- for (int i = 0; i != Successors; ++i)
- ComplexEdgeSuccs.insert(TI->getSuccessor(i));
+ SmallVector<Value *, 2> Idx;
+ Idx.push_back(Constant::getNullValue(Type::getInt64Ty(*Ctx)));
+ Idx.push_back(Sel);
+ Value *Counter = Builder.CreateInBoundsGEP(Counters, Idx);
+ Value *Count = Builder.CreateLoad(Counter);
+ Count = Builder.CreateAdd(Count,
+ ConstantInt::get(Type::getInt64Ty(*Ctx),1));
+ Builder.CreateStore(Count, Counter);
+ } else {
+ ComplexEdgePreds.insert(BB);
+ for (int i = 0; i != Successors; ++i)
+ ComplexEdgeSuccs.insert(TI->getSuccessor(i));
+ }
+ Edge += Successors;
}
- Edge += Successors;
}
- }
-
- if (!ComplexEdgePreds.empty()) {
- GlobalVariable *EdgeTable =
+
+ if (!ComplexEdgePreds.empty()) {
+ GlobalVariable *EdgeTable =
buildEdgeLookupTable(F, Counters,
ComplexEdgePreds, ComplexEdgeSuccs);
- GlobalVariable *EdgeState = getEdgeStateValue();
-
- const Type *Int32Ty = Type::getInt32Ty(*Ctx);
- for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
- IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
- Builder.CreateStore(ConstantInt::get(Int32Ty, i), EdgeState);
- }
- for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
- // call runtime to perform increment
- IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstNonPHI());
- Value *CounterPtrArray =
+ GlobalVariable *EdgeState = getEdgeStateValue();
+
+ Type *Int32Ty = Type::getInt32Ty(*Ctx);
+ for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
+ IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
+ Builder.CreateStore(ConstantInt::get(Int32Ty, i), EdgeState);
+ }
+ for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
+ // call runtime to perform increment
+ BasicBlock::iterator InsertPt =
+ ComplexEdgeSuccs[i+1]->getFirstInsertionPt();
+ IRBuilder<> Builder(InsertPt);
+ Value *CounterPtrArray =
Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
i * ComplexEdgePreds.size());
- Builder.CreateCall2(getIncrementIndirectCounterFunc(),
- EdgeState, CounterPtrArray);
- // clear the predecessor number
- Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState);
+ Builder.CreateCall2(getIncrementIndirectCounterFunc(),
+ EdgeState, CounterPtrArray);
+ // clear the predecessor number
+ Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState);
+ }
}
}
+ insertCounterWriteout(CountersBySP);
}
-
- insertCounterWriteout(DIF, CountersBySP);
-
- return true;
+ return Result;
}
// All edges with successors that aren't branches are "complex", because it
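In C terms, the simple-edge and two-way-branch instrumentation built above amounts to this (a hedged sketch; kNumEdges and the helper names are illustrative, and the array stands in for __llvm_gcov_ctr):

    #include <cstdint>

    static const unsigned kNumEdges = 16; // illustrative counter count
    static uint64_t Counters[kNumEdges];  // plays the role of __llvm_gcov_ctr

    // Exactly one successor: bump the edge's counter directly.
    static void countFallThrough(unsigned Edge) {
      Counters[Edge] += 1; // the load / add 1 / store sequence above
    }

    // Two-way branch: the CreateSelect picks Edge or Edge + 1 first.
    static void countBranch(bool Cond, unsigned Edge) {
      unsigned Idx = Cond ? Edge : Edge + 1;
      Counters[Idx] += 1;
    }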
@@ -535,8 +530,8 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
// read it. Threads and invoke make this untrue.
// emit [(succs * preds) x i64*], logically [succ x [pred x i64*]].
- const Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
- const ArrayType *EdgeTableTy = ArrayType::get(
+ Type *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
+ ArrayType *EdgeTableTy = ArrayType::get(
Int64PtrTy, Succs.size() * Preds.size());
Constant **EdgeTable = new Constant*[Succs.size() * Preds.size()];
@@ -572,7 +567,7 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable(
}
Constant *GCOVProfiler::getStartFileFunc() {
- const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
Type::getInt8PtrTy(*Ctx), false);
return M->getOrInsertFunction("llvm_gcda_start_file", FTy);
}
@@ -582,7 +577,7 @@ Constant *GCOVProfiler::getIncrementIndirectCounterFunc() {
Type::getInt32PtrTy(*Ctx), // uint32_t *predecessor
Type::getInt64PtrTy(*Ctx)->getPointerTo(), // uint64_t **state_table_row
};
- const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
Args, false);
return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy);
}
@@ -592,7 +587,7 @@ Constant *GCOVProfiler::getEmitFunctionFunc() {
Type::getInt32Ty(*Ctx), // uint32_t ident
Type::getInt8PtrTy(*Ctx), // const char *function_name
};
- const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
Args, false);
return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
}
@@ -602,13 +597,13 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
Type::getInt32Ty(*Ctx), // uint32_t num_counters
Type::getInt64PtrTy(*Ctx), // uint64_t *counters
};
- const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx),
Args, false);
return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
}
Constant *GCOVProfiler::getEndFileFunc() {
- const FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
}
@@ -626,9 +621,8 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() {
}
void GCOVProfiler::insertCounterWriteout(
- DebugInfoFinder &DIF,
SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) {
- const FunctionType *WriteoutFTy =
+ FunctionType *WriteoutFTy =
FunctionType::get(Type::getVoidTy(*Ctx), false);
Function *WriteoutF = Function::Create(WriteoutFTy,
GlobalValue::InternalLinkage,
@@ -642,29 +636,31 @@ void GCOVProfiler::insertCounterWriteout(
Constant *EmitArcs = getEmitArcsFunc();
Constant *EndFile = getEndFileFunc();
- for (DebugInfoFinder::iterator CUI = DIF.compile_unit_begin(),
- CUE = DIF.compile_unit_end(); CUI != CUE; ++CUI) {
- DICompileUnit compile_unit(*CUI);
- std::string FilenameGcda = mangleName(compile_unit, "gcda");
- Builder.CreateCall(StartFile,
- Builder.CreateGlobalStringPtr(FilenameGcda));
- for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator
+ NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (CU_Nodes) {
+ for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
+ DICompileUnit compile_unit(CU_Nodes->getOperand(i));
+ std::string FilenameGcda = mangleName(compile_unit, "gcda");
+ Builder.CreateCall(StartFile,
+ Builder.CreateGlobalStringPtr(FilenameGcda));
+ for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator
I = CountersBySP.begin(), E = CountersBySP.end();
- I != E; ++I) {
- DISubprogram SP(I->second);
- intptr_t ident = reinterpret_cast<intptr_t>(I->second);
- Builder.CreateCall2(EmitFunction,
- ConstantInt::get(Type::getInt32Ty(*Ctx), ident),
- Builder.CreateGlobalStringPtr(SP.getName()));
-
- GlobalVariable *GV = I->first;
- unsigned Arcs =
+ I != E; ++I) {
+ DISubprogram SP(I->second);
+ intptr_t ident = reinterpret_cast<intptr_t>(I->second);
+ Builder.CreateCall2(EmitFunction,
+ ConstantInt::get(Type::getInt32Ty(*Ctx), ident),
+ Builder.CreateGlobalStringPtr(SP.getName()));
+
+ GlobalVariable *GV = I->first;
+ unsigned Arcs =
cast<ArrayType>(GV->getType()->getElementType())->getNumElements();
- Builder.CreateCall2(EmitArcs,
- ConstantInt::get(Type::getInt32Ty(*Ctx), Arcs),
- Builder.CreateConstGEP2_64(GV, 0, 0));
+ Builder.CreateCall2(EmitArcs,
+ ConstantInt::get(Type::getInt32Ty(*Ctx), Arcs),
+ Builder.CreateConstGEP2_64(GV, 0, 0));
+ }
+ Builder.CreateCall(EndFile);
}
- Builder.CreateCall(EndFile);
}
Builder.CreateRetVoid();
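Per compile unit, the writeout function assembled here reduces to the call sequence below (a hedged sketch: the extern signatures mirror the FunctionTypes resolved earlier in this file, while the literals are made up):

    #include <cstdint>

    extern "C" {
    void llvm_gcda_start_file(const char *Filename);
    void llvm_gcda_emit_function(uint32_t Ident, const char *Name);
    void llvm_gcda_emit_arcs(uint32_t NumCounters, uint64_t *Counters);
    void llvm_gcda_end_file(void);
    }

    static uint64_t Ctrs[4]; // illustrative counter array for one function

    static void writeoutSketch() {
      llvm_gcda_start_file("example.gcda"); // one .gcda per compile unit
      llvm_gcda_emit_function(1, "demo");   // ident plus function name
      llvm_gcda_emit_arcs(4, Ctrs);         // counter count plus pointer
      llvm_gcda_end_file();
    }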
diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
index e09f882..62c21b8 100644
--- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
+++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
@@ -112,8 +112,8 @@ bool OptimalEdgeProfiler::runOnModule(Module &M) {
// be calculated from other edge counters on reading the profile info back
// in.
- const Type *Int32 = Type::getInt32Ty(M.getContext());
- const ArrayType *ATy = ArrayType::get(Int32, NumEdges);
+ Type *Int32 = Type::getInt32Ty(M.getContext());
+ ArrayType *ATy = ArrayType::get(Int32, NumEdges);
GlobalVariable *Counters =
new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
Constant::getNullValue(ATy), "OptEdgeProfCounters");
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
index 7541663..23915d3 100644
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ b/lib/Transforms/Instrumentation/PathProfiling.cpp
@@ -374,7 +374,7 @@ namespace llvm {
template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable,
xcompile> {
public:
- static const StructType *get(LLVMContext& C) {
+ static StructType *get(LLVMContext& C) {
return( StructType::get(
TypeBuilder<types::i<32>, xcompile>::get(C), // type
TypeBuilder<types::i<32>, xcompile>::get(C), // array size
@@ -909,7 +909,7 @@ BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value*
pathNumber) {
if(pathNumber == NULL || isa<ConstantInt>(pathNumber)
|| (((Instruction*)(pathNumber))->getParent()) != block) {
- return(block->getFirstNonPHI());
+ return(block->getFirstInsertionPt());
} else {
Instruction* pathNumberInst = (Instruction*) (pathNumber);
BasicBlock::iterator insertPoint;
@@ -930,7 +930,7 @@ BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value*
// A PHINode is created in the node, and its values initialized to -1U.
void PathProfiler::preparePHI(BLInstrumentationNode* node) {
BasicBlock* block = node->getBlock();
- BasicBlock::iterator insertPoint = block->getFirstNonPHI();
+ BasicBlock::iterator insertPoint = block->getFirstInsertionPt();
pred_iterator PB = pred_begin(node->getBlock()),
PE = pred_end(node->getBlock());
PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context),
@@ -999,7 +999,7 @@ void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
BasicBlock::iterator insertPoint;
if( atBeginning )
- insertPoint = block->getFirstNonPHI();
+ insertPoint = block->getFirstInsertionPt();
else
insertPoint = block->getTerminator();
@@ -1029,8 +1029,7 @@ void PathProfiler::insertCounterIncrement(Value* incValue,
gepIndices[1] = incValue;
GetElementPtrInst* pcPointer =
- GetElementPtrInst::Create(dag->getCounterArray(),
- gepIndices.begin(), gepIndices.end(),
+ GetElementPtrInst::Create(dag->getCounterArray(), gepIndices,
"counterInc", insertPoint);
// Load from the array - call it oldPC
@@ -1140,7 +1139,7 @@ void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge,
}
BasicBlock::iterator insertPoint = atBeginning ?
- instrumentNode->getBlock()->getFirstNonPHI() :
+ instrumentNode->getBlock()->getFirstInsertionPt() :
instrumentNode->getBlock()->getTerminator();
// add information from the bottom edge, if it exists
@@ -1172,7 +1171,7 @@ void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge,
// Insert instrumentation if this is a normal edge
else {
BasicBlock::iterator insertPoint = atBeginning ?
- instrumentNode->getBlock()->getFirstNonPHI() :
+ instrumentNode->getBlock()->getFirstInsertionPt() :
instrumentNode->getBlock()->getTerminator();
if( edge->isInitialization() ) { // initialize path number
@@ -1233,7 +1232,7 @@ void PathProfiler::insertInstrumentation(
end = callEdges.end(); edge != end; edge++ ) {
BLInstrumentationNode* node =
(BLInstrumentationNode*)(*edge)->getSource();
- BasicBlock::iterator insertPoint = node->getBlock()->getFirstNonPHI();
+ BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt();
// Find the first function call
while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call )
@@ -1289,7 +1288,7 @@ void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
// Should we store the information in an array or hash
if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) {
- const Type* t = ArrayType::get(Type::getInt32Ty(*Context),
+ Type* t = ArrayType::get(Type::getInt32Ty(*Context),
dag.getNumberOfPaths());
dag.setCounterArray(new GlobalVariable(M, t, false,
@@ -1301,7 +1300,7 @@ void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
// Add to global function reference table
unsigned type;
- const Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context);
+ Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context);
if( dag.getNumberOfPaths() <= HASH_THRESHHOLD )
type = ProfilingArray;
@@ -1315,7 +1314,7 @@ void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) :
Constant::getNullValue(voidPtr);
- const StructType* at = ftEntryTypeBuilder::get(*Context);
+ StructType* at = ftEntryTypeBuilder::get(*Context);
ConstantStruct* functionEntry =
(ConstantStruct*)ConstantStruct::get(at, entryArray);
ftInit.push_back(functionEntry);
@@ -1379,8 +1378,8 @@ bool PathProfiler::runOnModule(Module &M) {
runOnFunction(ftInit, *F, M);
}
- const Type *t = ftEntryTypeBuilder::get(*Context);
- const ArrayType* ftArrayType = ArrayType::get(t, ftInit.size());
+ Type *t = ftEntryTypeBuilder::get(*Context);
+ ArrayType* ftArrayType = ArrayType::get(t, ftInit.size());
Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit);
DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n");
@@ -1388,7 +1387,7 @@ bool PathProfiler::runOnModule(Module &M) {
GlobalVariable* functionTable =
new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage,
ftInitConstant, "functionPathTable");
- const Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0);
+ Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0);
InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable,
PointerType::getUnqual(eltType));
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
index 445a5b6..de57cd1 100644
--- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp
+++ b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
@@ -25,9 +25,9 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
GlobalValue *Array,
PointerType *arrayType) {
LLVMContext &Context = MainFn->getContext();
- const Type *ArgVTy =
+ Type *ArgVTy =
PointerType::getUnqual(Type::getInt8PtrTy(Context));
- const PointerType *UIntPtr = arrayType ? arrayType :
+ PointerType *UIntPtr = arrayType ? arrayType :
Type::getInt32PtrTy(Context);
Module &M = *MainFn->getParent();
Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context),
@@ -51,8 +51,7 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
Constant::getNullValue(Type::getInt32Ty(Context)));
unsigned NumElements = 0;
if (Array) {
- Args[2] = ConstantExpr::getGetElementPtr(Array, &GEPIndices[0],
- GEPIndices.size());
+ Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices);
NumElements =
cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
} else {
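// A minimal standalone sketch (not part of the patch) of the API shape
// change visible above: getGetElementPtr now takes one ArrayRef-style view
// of the indices instead of a raw pointer plus a count.  Span below is a
// hypothetical stand-in for llvm::ArrayRef, only to show the call shape.
#include <cstddef>
#include <vector>

template <typename T> struct Span {
  const T *Data;
  std::size_t Len;
  // Implicit conversion lets callers hand over a std::vector directly.
  Span(const std::vector<T> &V) : Data(V.data()), Len(V.size()) {}
};

// Old shape: every call site spells out &Indices[0] and Indices.size().
long sumOld(const long *Idx, std::size_t N) {
  long S = 0;
  for (std::size_t I = 0; I != N; ++I)
    S += Idx[I];
  return S;
}

// New shape: one argument carrying the same data.
long sumNew(Span<long> Idx) {
  long S = 0;
  for (std::size_t I = 0; I != Idx.Len; ++I)
    S += Idx.Data[I];
  return S;
}

int main() {
  std::vector<long> GEPIndices;
  GEPIndices.push_back(0);
  GEPIndices.push_back(7);
  return sumOld(&GEPIndices[0], GEPIndices.size()) == sumNew(GEPIndices) ? 0 : 1;
}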
@@ -108,7 +107,7 @@ void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
GlobalValue *CounterArray, bool beginning) {
// Insert the increment after any alloca or PHI instructions...
- BasicBlock::iterator InsertPos = beginning ? BB->getFirstNonPHI() :
+ BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() :
BB->getTerminator();
while (isa<AllocaInst>(InsertPos))
++InsertPos;
@@ -120,7 +119,7 @@ void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context));
Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum);
Constant *ElementPtr =
- ConstantExpr::getGetElementPtr(CounterArray, &Indices[0], Indices.size());
+ ConstantExpr::getGetElementPtr(CounterArray, Indices);
// Load, increment and store the value back.
Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
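// A minimal standalone sketch of what the emitted instrumentation does at
// run time, assuming a plain global counter array: index into the array,
// load the old value, add one, store it back.
#include <cstdint>

void incrementCounter(uint32_t *CounterArray, unsigned CounterNum) {
  uint32_t OldVal = CounterArray[CounterNum]; // load ("OldFuncCounter")
  CounterArray[CounterNum] = OldVal + 1;      // increment and store back
}

int main() {
  uint32_t Counters[4] = {0, 0, 0, 0};
  incrementCounter(Counters, 2);
  return Counters[2] == 1 ? 0 : 1;
}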
@@ -137,7 +136,7 @@ void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) {
Type::getInt32Ty(Mod->getContext()),
FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo()
};
- const StructType *GlobalDtorElemTy =
+ StructType *GlobalDtorElemTy =
StructType::get(Mod->getContext(), GlobalDtorElems, false);
// Construct the new element we'll be adding.
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index a5adb5e..ba214d1 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -57,6 +57,7 @@ bool ADCE::runOnFunction(Function& F) {
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
if (isa<TerminatorInst>(I.getInstructionIterator()) ||
isa<DbgInfoIntrinsic>(I.getInstructionIterator()) ||
+ isa<LandingPadInst>(I.getInstructionIterator()) ||
I->mayHaveSideEffects()) {
alive.insert(I.getInstructionIterator());
worklist.push_back(I.getInstructionIterator());
@@ -65,7 +66,6 @@ bool ADCE::runOnFunction(Function& F) {
// Propagate liveness backwards to operands.
while (!worklist.empty()) {
Instruction* curr = worklist.pop_back_val();
-
for (Instruction::op_iterator OI = curr->op_begin(), OE = curr->op_end();
OI != OE; ++OI)
if (Instruction* Inst = dyn_cast<Instruction>(OI))
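// A minimal standalone sketch of the liveness computation above, with an
// instruction modelled as just a node with operands: unconditionally-live
// instructions (terminators, debug intrinsics, now landing pads, anything
// with side effects) seed a worklist, and liveness then flows backwards
// through operands.
#include <set>
#include <vector>

struct Inst {
  bool AlwaysLive;               // terminator / landingpad / side effects
  std::vector<Inst *> Operands;
};

std::set<Inst *> computeLive(const std::vector<Inst *> &Func) {
  std::set<Inst *> Alive;
  std::vector<Inst *> Worklist;
  for (std::size_t I = 0; I != Func.size(); ++I)
    if (Func[I]->AlwaysLive) {
      Alive.insert(Func[I]);
      Worklist.push_back(Func[I]);
    }
  while (!Worklist.empty()) {
    Inst *Curr = Worklist.back();
    Worklist.pop_back();
    for (std::size_t I = 0; I != Curr->Operands.size(); ++I)
      if (Alive.insert(Curr->Operands[I]).second) // newly discovered live op
        Worklist.push_back(Curr->Operands[I]);
  }
  return Alive;
}

int main() {
  Inst Def = {false, std::vector<Inst *>()};
  Inst Ret = {true, std::vector<Inst *>(1, &Def)}; // terminator uses Def
  std::vector<Inst *> Func;
  Func.push_back(&Def);
  Func.push_back(&Ret);
  return computeLive(Func).count(&Def) == 1 ? 0 : 1;
}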
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index c223da6..79bcae5 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -29,6 +29,14 @@ add_llvm_library(LLVMScalarOpts
SimplifyCFGPass.cpp
SimplifyLibCalls.cpp
Sink.cpp
- TailDuplication.cpp
TailRecursionElimination.cpp
)
+
+add_llvm_library_dependencies(LLVMScalarOpts
+ LLVMAnalysis
+ LLVMCore
+ LLVMInstCombine
+ LLVMSupport
+ LLVMTarget
+ LLVMTransformUtils
+ )
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 0af14ed..f8f18b2 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -58,6 +58,7 @@ STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
STATISTIC(NumRetsDup, "Number of return instructions duplicated");
+STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
static cl::opt<bool> DisableBranchOpts(
"disable-cgp-branch-opts", cl::Hidden, cl::init(false),
@@ -104,12 +105,13 @@ namespace {
void EliminateMostlyEmptyBlock(BasicBlock *BB);
bool OptimizeBlock(BasicBlock &BB);
bool OptimizeInst(Instruction *I);
- bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy);
+ bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
bool OptimizeInlineAsmInst(CallInst *CS);
bool OptimizeCallInst(CallInst *CI);
bool MoveExtToFormExtLoad(Instruction *I);
bool OptimizeExtUses(Instruction *I);
bool DupRetToEnableTailCallOpts(ReturnInst *RI);
+ bool PlaceDbgValues(Function &F);
};
}
@@ -132,6 +134,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// unconditional branch.
EverMadeChange |= EliminateMostlyEmptyBlocks(F);
+ // If llvm.dbg.value is far away from the value then iSel may not be able
+ // to handle it properly. iSel will drop llvm.dbg.value if it cannot
+ // find a node corresponding to the value.
+ EverMadeChange |= PlaceDbgValues(F);
+
bool MadeChange = true;
while (MadeChange) {
MadeChange = false;
@@ -410,8 +417,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
CastInst *&InsertedCast = InsertedCasts[UserBB];
if (!InsertedCast) {
- BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
-
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
InsertedCast =
CastInst::Create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "",
InsertPt);
@@ -467,8 +473,7 @@ static bool OptimizeCmpExpression(CmpInst *CI) {
CmpInst *&InsertedCmp = InsertedCmps[UserBB];
if (!InsertedCmp) {
- BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
-
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
InsertedCmp =
CmpInst::Create(CI->getOpcode(),
CI->getPredicate(), CI->getOperand(0),
@@ -528,7 +533,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
- const Type *ReturnTy = CI->getType();
+ Type *ReturnTy = CI->getType();
Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
// Substituting this can cause recursive simplifications, which can
@@ -551,22 +556,6 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
// From here on out we're working with named functions.
if (CI->getCalledFunction() == 0) return false;
- // llvm.dbg.value is far away from the value then iSel may not be able
- // handle it properly. iSel will drop llvm.dbg.value if it can not
- // find a node corresponding to the value.
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(CI))
- if (Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue()))
- if (!VI->isTerminator() &&
- (DVI->getParent() != VI->getParent() || DT->dominates(DVI, VI))) {
- DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
- DVI->removeFromParent();
- if (isa<PHINode>(VI))
- DVI->insertBefore(VI->getParent()->getFirstNonPHI());
- else
- DVI->insertAfter(VI);
- return true;
- }
-
// We'll need TargetData from here on out.
const TargetData *TD = TLI ? TLI->getTargetData() : 0;
if (!TD) return false;
@@ -724,7 +713,7 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
/// This method is used to optimize both load/store and inline asms with memory
/// operands.
bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
- const Type *AccessTy) {
+ Type *AccessTy) {
Value *Repl = Addr;
// Try to collapse single-value PHI nodes. This is necessary to undo
@@ -746,13 +735,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
worklist.pop_back();
// Break use-def graph loops.
- if (Visited.count(V)) {
+ if (!Visited.insert(V)) {
Consensus = 0;
break;
}
- Visited.insert(V);
-
// For a PHI node, push all of its incoming values.
if (PHINode *P = dyn_cast<PHINode>(V)) {
for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i)
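// A minimal standalone sketch of the insert-and-test idiom adopted above,
// using std::set in place of SmallPtrSet: insert() reports whether the
// element was new, so the separate count()-then-insert() pair collapses
// into a single probe.
#include <set>

bool breakCycle(std::set<const void *> &Visited, const void *V) {
  // True when V was seen before -- the use-def graph loop case.
  return !Visited.insert(V).second;
}

int main() {
  std::set<const void *> Visited;
  int X = 0;
  if (breakCycle(Visited, &X)) return 1;  // first visit: no cycle
  if (!breakCycle(Visited, &X)) return 1; // second visit: cycle detected
  return 0;
}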
@@ -763,7 +750,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// For non-PHIs, determine the addressing mode being computed.
SmallVector<Instruction*, 16> NewAddrModeInsts;
ExtAddrMode NewAddrMode =
- AddressingModeMatcher::Match(V, AccessTy,MemoryInst,
+ AddressingModeMatcher::Match(V, AccessTy, MemoryInst,
NewAddrModeInsts, *TLI);
// This check is broken into two cases with very similar code to avoid using
@@ -822,7 +809,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// Insert this computation right after this user. Since our caller is
// scanning from the top of the BB to the bottom, reuse of the expr are
// guaranteed to happen later.
- BasicBlock::iterator InsertPt = MemoryInst;
+ IRBuilder<> Builder(MemoryInst);
// Now that we determined the addressing expression we want to use and know
// that we have to sink it into this block. Check to see if we have already
@@ -833,11 +820,11 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst);
if (SunkAddr->getType() != Addr->getType())
- SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt);
+ SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
} else {
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst);
- const Type *IntPtrTy =
+ Type *IntPtrTy =
TLI->getTargetData()->getIntPtrType(AccessTy->getContext());
Value *Result = 0;
@@ -850,10 +837,9 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (AddrMode.BaseReg) {
Value *V = AddrMode.BaseReg;
if (V->getType()->isPointerTy())
- V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
if (V->getType() != IntPtrTy)
- V = CastInst::CreateIntegerCast(V, IntPtrTy, /*isSigned=*/true,
- "sunkaddr", InsertPt);
+ V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
Result = V;
}
@@ -863,29 +849,27 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (V->getType() == IntPtrTy) {
// done.
} else if (V->getType()->isPointerTy()) {
- V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
} else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
cast<IntegerType>(V->getType())->getBitWidth()) {
- V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
} else {
- V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt);
+ V = Builder.CreateSExt(V, IntPtrTy, "sunkaddr");
}
if (AddrMode.Scale != 1)
- V = BinaryOperator::CreateMul(V, ConstantInt::get(IntPtrTy,
- AddrMode.Scale),
- "sunkaddr", InsertPt);
+ V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
+ "sunkaddr");
if (Result)
- Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
Result = V;
}
// Add in the BaseGV if present.
if (AddrMode.BaseGV) {
- Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr",
- InsertPt);
+ Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
if (Result)
- Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
Result = V;
}
@@ -894,7 +878,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (AddrMode.BaseOffs) {
Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
if (Result)
- Result = BinaryOperator::CreateAdd(Result, V, "sunkaddr", InsertPt);
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
else
Result = V;
}
@@ -902,7 +886,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (Result == 0)
SunkAddr = Constant::getNullValue(Addr->getType());
else
- SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt);
+ SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
}
MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
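// A minimal standalone sketch of the refactor above: rather than passing an
// explicit insertion point to every instruction constructor, a builder
// object remembers the point once and every create call lands there.  The
// Builder type here is a toy stand-in for IRBuilder<>.
#include <memory>
#include <string>
#include <vector>

struct Node { std::string Text; };

class Builder {
  std::vector<std::unique_ptr<Node> > &InsertList;
public:
  explicit Builder(std::vector<std::unique_ptr<Node> > &L) : InsertList(L) {}
  Node *create(std::string Text) {
    InsertList.push_back(std::unique_ptr<Node>(new Node{std::move(Text)}));
    return InsertList.back().get();
  }
};

int main() {
  std::vector<std::unique_ptr<Node> > Block;
  Builder B(Block);     // set the insertion point once
  B.create("ptrtoint"); // no per-call InsertPt argument needed
  B.create("mul");
  B.create("add");
  return Block.size() == 3 ? 0 : 1;
}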
@@ -1059,8 +1043,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) {
Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
if (!InsertedTrunc) {
- BasicBlock::iterator InsertPt = UserBB->getFirstNonPHI();
-
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
InsertedTrunc = new TruncInst(I, Src->getType(), "", InsertPt);
}
@@ -1159,3 +1142,34 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
return MadeChange;
}
+
+// If llvm.dbg.value is far away from the value then iSel may not be able
+// to handle it properly. iSel will drop llvm.dbg.value if it cannot
+// find a node corresponding to the value.
+bool CodeGenPrepare::PlaceDbgValues(Function &F) {
+ bool MadeChange = false;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ Instruction *PrevNonDbgInst = NULL;
+ for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+ Instruction *Insn = BI; ++BI;
+ DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
+ if (!DVI) {
+ PrevNonDbgInst = Insn;
+ continue;
+ }
+
+ Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
+ if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
+ DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
+ DVI->removeFromParent();
+ if (isa<PHINode>(VI))
+ DVI->insertBefore(VI->getParent()->getFirstInsertionPt());
+ else
+ DVI->insertAfter(VI);
+ MadeChange = true;
+ ++NumDbgValueMoved;
+ }
+ }
+ }
+ return MadeChange;
+}
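// A minimal standalone sketch of the reordering PlaceDbgValues performs,
// modelling instructions as list nodes: a debug marker that drifted away
// from the value it describes is spliced to sit directly after that value.
#include <iterator>
#include <list>
#include <string>

struct Inst {
  std::string Name;
  const Inst *DescribedValue; // non-null marks a debug marker
  explicit Inst(std::string N, const Inst *DV = 0)
      : Name(N), DescribedValue(DV) {}
};

void placeDbgValues(std::list<const Inst *> &Block) {
  for (std::list<const Inst *>::iterator It = Block.begin();
       It != Block.end();) {
    const Inst *I = *It;
    std::list<const Inst *>::iterator Next = std::next(It);
    if (I->DescribedValue)
      for (std::list<const Inst *>::iterator J = Block.begin();
           J != Block.end(); ++J)
        if (*J == I->DescribedValue) {
          Block.erase(It);               // move the marker...
          Block.insert(std::next(J), I); // ...to just after its value
          break;
        }
    It = Next;
  }
}

int main() {
  Inst A("a"), B("b"), Dbg("dbg.value", &A);
  std::list<const Inst *> Block;
  Block.push_back(&A);
  Block.push_back(&B);
  Block.push_back(&Dbg);
  placeDbgValues(Block); // order becomes a, dbg.value, b
  return (*std::next(Block.begin()))->Name == "dbg.value" ? 0 : 1;
}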
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index cb9b5be..a593d0f 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -52,18 +52,18 @@ namespace {
AA = &getAnalysis<AliasAnalysis>();
MD = &getAnalysis<MemoryDependenceAnalysis>();
DominatorTree &DT = getAnalysis<DominatorTree>();
-
+
bool Changed = false;
for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
// Only check non-dead blocks. Dead blocks may have strange pointer
// cycles that will confuse alias analysis.
if (DT.isReachableFromEntry(I))
Changed |= runOnBasicBlock(*I);
-
+
AA = 0; MD = 0;
return Changed;
}
-
+
bool runOnBasicBlock(BasicBlock &BB);
bool HandleFree(CallInst *F);
bool handleEndBlock(BasicBlock &BB);
@@ -105,34 +105,34 @@ static void DeleteDeadInstruction(Instruction *I,
MemoryDependenceAnalysis &MD,
SmallPtrSet<Value*, 16> *ValueSet = 0) {
SmallVector<Instruction*, 32> NowDeadInsts;
-
+
NowDeadInsts.push_back(I);
--NumFastOther;
-
+
// Before we touch this instruction, remove it from memdep!
do {
Instruction *DeadInst = NowDeadInsts.pop_back_val();
++NumFastOther;
-
+
// This instruction is dead, zap it, in stages. Start by removing it from
// MemDep, which needs to know the operands and needs it to be in the
// function.
MD.removeInstruction(DeadInst);
-
+
for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
Value *Op = DeadInst->getOperand(op);
DeadInst->setOperand(op, 0);
-
+
// If this operand just became dead, add it to the NowDeadInsts list.
if (!Op->use_empty()) continue;
-
+
if (Instruction *OpI = dyn_cast<Instruction>(Op))
if (isInstructionTriviallyDead(OpI))
NowDeadInsts.push_back(OpI);
}
-
+
DeadInst->eraseFromParent();
-
+
if (ValueSet) ValueSet->erase(DeadInst);
} while (!NowDeadInsts.empty());
}
@@ -159,11 +159,13 @@ static bool hasMemoryWrite(Instruction *I) {
}
/// getLocForWrite - Return a Location stored to by the specified instruction.
+/// If isRemovable returns true, this function and getLocForRead completely
+/// describe the memory operations for this instruction.
static AliasAnalysis::Location
getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
return AA.getLocation(SI);
-
+
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) {
// memcpy/memmove/memset.
AliasAnalysis::Location Loc = AA.getLocationForDest(MI);
@@ -174,10 +176,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
return AliasAnalysis::Location();
return Loc;
}
-
+
IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
if (II == 0) return AliasAnalysis::Location();
-
+
switch (II->getIntrinsicID()) {
default: return AliasAnalysis::Location(); // Unhandled intrinsic.
case Intrinsic::init_trampoline:
@@ -185,7 +187,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
// that we should use the size of the pointee type. This isn't valid for
// init.trampoline, which writes more than an i8.
if (AA.getTargetData() == 0) return AliasAnalysis::Location();
-
+
// FIXME: We don't know the size of the trampoline, so we can't really
// handle it here.
return AliasAnalysis::Location(II->getArgOperand(0));
@@ -198,10 +200,10 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
/// getLocForRead - Return the location read by the specified "hasMemoryWrite"
/// instruction if any.
-static AliasAnalysis::Location
+static AliasAnalysis::Location
getLocForRead(Instruction *Inst, AliasAnalysis &AA) {
assert(hasMemoryWrite(Inst) && "Unknown instruction case");
-
+
// The only instructions that both read and write are the mem transfer
// instructions (memcpy/memmove).
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst))
@@ -213,10 +215,10 @@ getLocForRead(Instruction *Inst, AliasAnalysis &AA) {
/// isRemovable - If the value of this instruction and the memory it writes to
/// is unused, may we delete this instruction?
static bool isRemovable(Instruction *I) {
- // Don't remove volatile stores.
+ // Don't remove volatile/atomic stores.
if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return !SI->isVolatile();
-
+ return SI->isUnordered();
+
IntrinsicInst *II = cast<IntrinsicInst>(I);
switch (II->getIntrinsicID()) {
default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate");
@@ -227,7 +229,7 @@ static bool isRemovable(Instruction *I) {
case Intrinsic::init_trampoline:
// Always safe to remove init_trampoline.
return true;
-
+
case Intrinsic::memset:
case Intrinsic::memmove:
case Intrinsic::memcpy:
@@ -255,16 +257,16 @@ static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) {
const TargetData *TD = AA.getTargetData();
if (TD == 0)
return AliasAnalysis::UnknownSize;
-
+
if (AllocaInst *A = dyn_cast<AllocaInst>(V)) {
// Get size information for the alloca
if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize()))
return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType());
return AliasAnalysis::UnknownSize;
}
-
+
assert(isa<Argument>(V) && "Expected AllocaInst or Argument!");
- const PointerType *PT = cast<PointerType>(V->getType());
+ PointerType *PT = cast<PointerType>(V->getType());
return TD->getTypeAllocSize(PT->getElementType());
}
@@ -287,7 +289,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later,
AliasAnalysis &AA) {
const Value *P1 = Earlier.Ptr->stripPointerCasts();
const Value *P2 = Later.Ptr->stripPointerCasts();
-
+
// If the start pointers are the same, we just have to compare sizes to see if
// the later store was larger than the earlier store.
if (P1 == P2) {
@@ -302,33 +304,33 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later,
return Later.Ptr->getType() == Earlier.Ptr->getType();
return false;
}
-
+
// Make sure that the Later size is >= the Earlier size.
if (Later.Size < Earlier.Size)
return false;
return true;
}
-
+
// Otherwise, we have to have size information, and the later store has to be
// larger than the earlier one.
if (Later.Size == AliasAnalysis::UnknownSize ||
Earlier.Size == AliasAnalysis::UnknownSize ||
Later.Size <= Earlier.Size || AA.getTargetData() == 0)
return false;
-
+
// Check to see if the later store is to the entire object (either a global,
// an alloca, or a byval argument). If so, then it clearly overwrites any
// other store to the same object.
const TargetData &TD = *AA.getTargetData();
-
+
const Value *UO1 = GetUnderlyingObject(P1, &TD),
*UO2 = GetUnderlyingObject(P2, &TD);
-
+
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
if (UO1 != UO2)
return false;
-
+
// If the "Later" store is to a recognizable object, get its size.
if (isObjectPointerWithTrustworthySize(UO2)) {
uint64_t ObjectSize =
@@ -336,26 +338,26 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later,
if (ObjectSize == Later.Size)
return true;
}
-
+
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
// pointers are equal, then we can reason about the two stores.
int64_t EarlierOff = 0, LaterOff = 0;
const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, TD);
const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, TD);
-
+
// If the base pointers still differ, we have two completely different stores.
if (BP1 != BP2)
return false;
// The later store completely overlaps the earlier store if:
- //
+ //
// 1. Both start at the same offset and the later one's size is greater than
// or equal to the earlier one's, or
//
// |--earlier--|
// |-- later --|
- //
+ //
// 2. The earlier store has an offset greater than the later offset, but which
// still lies completely within the later store.
//
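// A minimal standalone check of the two overlap cases pictured above, using
// plain byte offsets and sizes: the later store completely covers the
// earlier one exactly when it starts no later and ends no earlier.
#include <cstdint>

bool laterCompletelyOverwrites(int64_t EarlierOff, int64_t EarlierSize,
                               int64_t LaterOff, int64_t LaterSize) {
  return LaterOff <= EarlierOff &&
         LaterOff + LaterSize >= EarlierOff + EarlierSize;
}

int main() {
  // Case 1: same start, later store at least as large.
  if (!laterCompletelyOverwrites(0, 8, 0, 8)) return 1;
  // Case 2: earlier store starts inside and ends inside the later one.
  if (!laterCompletelyOverwrites(4, 4, 0, 12)) return 1;
  // Partial overlap only: not a complete overwrite.
  if (laterCompletelyOverwrites(0, 8, 4, 8)) return 1;
  return 0;
}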
@@ -373,7 +375,7 @@ static bool isCompleteOverwrite(const AliasAnalysis::Location &Later,
/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a
/// memory region into an identical pointer) then it doesn't actually make its
-/// input dead in the traditional sense. Consider this case:
+/// input dead in the traditional sense. Consider this case:
///
/// memcpy(A <- B)
/// memcpy(A <- A)
@@ -391,10 +393,10 @@ static bool isPossibleSelfRead(Instruction *Inst,
// location read.
AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA);
if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction.
-
+
// If the read and written loc obviously don't alias, it isn't a read.
if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
-
+
// Okay, 'Inst' may copy over itself. However, we can still remove the
// DepWrite instruction if we can prove that it reads from the same location
// as Inst. This handles useful cases like:
@@ -404,10 +406,10 @@ static bool isPossibleSelfRead(Instruction *Inst,
// aliases, so removing the first memcpy is safe (assuming it writes <= #
// bytes as the second one).
AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA);
-
+
if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
return false;
-
+
// If DepWrite doesn't read memory or if we can't prove it is a must alias,
// then it can't be considered dead.
return true;
@@ -420,43 +422,43 @@ static bool isPossibleSelfRead(Instruction *Inst,
bool DSE::runOnBasicBlock(BasicBlock &BB) {
bool MadeChange = false;
-
+
// Do a top-down walk on the BB.
for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
Instruction *Inst = BBI++;
-
+
// Handle 'free' calls specially.
if (CallInst *F = isFreeCall(Inst)) {
MadeChange |= HandleFree(F);
continue;
}
-
+
// If we find something that writes memory, get its memory dependence.
if (!hasMemoryWrite(Inst))
continue;
MemDepResult InstDep = MD->getDependency(Inst);
-
+
// Ignore any store where we can't find a local dependence.
// FIXME: cross-block DSE would be fun. :)
- if (InstDep.isNonLocal() || InstDep.isUnknown())
+ if (!InstDep.isDef() && !InstDep.isClobber())
continue;
-
+
// If we're storing the same value back to a pointer that we just
// loaded from, then the store can be removed.
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) {
if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
- SI->getOperand(0) == DepLoad && !SI->isVolatile()) {
+ SI->getOperand(0) == DepLoad && isRemovable(SI)) {
DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n "
<< "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n');
-
+
// DeleteDeadInstruction can delete the current instruction. Save BBI
// in case we need it.
WeakVH NextInst(BBI);
-
+
DeleteDeadInstruction(SI, *MD);
-
+
if (NextInst == 0) // Next instruction deleted.
BBI = BB.begin();
else if (BBI != BB.begin()) // Revisit this instruction if possible.
@@ -467,15 +469,15 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
}
}
}
-
+
// Figure out what location is being stored to.
AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA);
// If we didn't get a useful location, fail.
if (Loc.Ptr == 0)
continue;
-
- while (!InstDep.isNonLocal() && !InstDep.isUnknown()) {
+
+ while (InstDep.isDef() || InstDep.isClobber()) {
// Get the memory clobbered by the instruction we depend on. MemDep will
// skip any instructions that 'Loc' clearly doesn't interact with. If we
// end up depending on a may- or must-aliased load, then we can't optimize
@@ -496,12 +498,12 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
!isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) {
DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: "
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
-
+
// Delete the store and now-dead instructions that feed it.
DeleteDeadInstruction(DepWrite, *MD);
++NumFastStores;
MadeChange = true;
-
+
// DeleteDeadInstruction can delete the current instruction in loop
// cases, reset BBI.
BBI = Inst;
@@ -509,7 +511,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
--BBI;
break;
}
-
+
// If this is a may-aliased store that is clobbering the store value, we
// can keep searching past it for another must-aliased pointer that stores
// to the same location. For example, in:
@@ -519,20 +521,20 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
// we can remove the first store to P even though we don't know if P and Q
// alias.
if (DepWrite == &BB.front()) break;
-
+
// Can't look past this instruction if it might read 'Loc'.
if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref)
break;
-
+
InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB);
}
}
-
+
// If this block ends in a return, unwind, or unreachable, all allocas are
// dead at its end, which means stores to them are also dead.
if (BB.getTerminator()->getNumSuccessors() == 0)
MadeChange |= handleEndBlock(BB);
-
+
return MadeChange;
}
@@ -543,18 +545,18 @@ bool DSE::HandleFree(CallInst *F) {
MemDepResult Dep = MD->getDependency(F);
- while (!Dep.isNonLocal() && !Dep.isUnknown()) {
+ while (Dep.isDef() || Dep.isClobber()) {
Instruction *Dependency = Dep.getInst();
if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency))
return MadeChange;
-
+
Value *DepPointer =
GetUnderlyingObject(getStoredPointerOperand(Dependency));
// Check for aliasing.
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
return MadeChange;
-
+
// DCE instructions only used to calculate that store
DeleteDeadInstruction(Dependency, *MD);
++NumFastStores;
@@ -567,7 +569,7 @@ bool DSE::HandleFree(CallInst *F) {
// free(s);
Dep = MD->getDependency(F);
};
-
+
return MadeChange;
}
@@ -579,28 +581,28 @@ bool DSE::HandleFree(CallInst *F) {
/// ret void
bool DSE::handleEndBlock(BasicBlock &BB) {
bool MadeChange = false;
-
+
// Keep track of all of the stack objects that are dead at the end of the
// function.
SmallPtrSet<Value*, 16> DeadStackObjects;
-
+
// Find all of the alloca'd pointers in the entry block.
BasicBlock *Entry = BB.getParent()->begin();
for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I)
if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
DeadStackObjects.insert(AI);
-
+
// Treat byval arguments the same, stores to them are dead at the end of the
// function.
for (Function::arg_iterator AI = BB.getParent()->arg_begin(),
AE = BB.getParent()->arg_end(); AI != AE; ++AI)
if (AI->hasByValAttr())
DeadStackObjects.insert(AI);
-
+
// Scan the basic block backwards
for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
--BBI;
-
+
// If we find a store, check to see if it points into a dead stack value.
if (hasMemoryWrite(BBI) && isRemovable(BBI)) {
// See through pointer-to-pointer bitcasts
@@ -609,10 +611,10 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Stores to stack values are valid candidates for removal.
if (DeadStackObjects.count(Pointer)) {
Instruction *Dead = BBI++;
-
+
DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
<< *Dead << "\n Object: " << *Pointer << '\n');
-
+
// DCE instructions only used to calculate that store.
DeleteDeadInstruction(Dead, *MD, &DeadStackObjects);
++NumFastStores;
@@ -620,7 +622,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
continue;
}
}
-
+
// Remove any dead non-memory-mutating instructions.
if (isInstructionTriviallyDead(BBI)) {
Instruction *Inst = BBI++;
@@ -629,55 +631,61 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
MadeChange = true;
continue;
}
-
+
if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) {
DeadStackObjects.erase(A);
continue;
}
-
+
if (CallSite CS = cast<Value>(BBI)) {
// If this call does not access memory, it can't be loading any of our
// pointers.
if (AA->doesNotAccessMemory(CS))
continue;
-
+
// If the call might load from any of our allocas, then any store above
// the call is live.
SmallVector<Value*, 8> LiveAllocas;
for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(),
E = DeadStackObjects.end(); I != E; ++I) {
// See if the call site touches it.
- AliasAnalysis::ModRefResult A =
+ AliasAnalysis::ModRefResult A =
AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA));
-
+
if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref)
LiveAllocas.push_back(*I);
}
-
+
for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(),
E = LiveAllocas.end(); I != E; ++I)
DeadStackObjects.erase(*I);
-
+
// If all of the allocas were clobbered by the call then we're not going
// to find anything else to process.
if (DeadStackObjects.empty())
return MadeChange;
-
+
continue;
}
-
+
AliasAnalysis::Location LoadedLoc;
-
+
// If we encounter a use of the pointer, it is no longer considered dead
if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+ if (!L->isUnordered()) // Be conservative with atomic/volatile load
+ break;
LoadedLoc = AA->getLocation(L);
} else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
LoadedLoc = AA->getLocation(V);
} else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) {
LoadedLoc = AA->getLocationForSource(MTI);
- } else {
- // Not a loading instruction.
+ } else if (!BBI->mayReadFromMemory()) {
+ // Instruction doesn't read memory. Note that stores that weren't removed
+ // above will hit this case.
continue;
+ } else {
+ // Unknown inst; assume it clobbers everything.
+ break;
}
// Remove any allocas from the DeadPointer set that are loaded, as this
@@ -689,7 +697,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
if (DeadStackObjects.empty())
break;
}
-
+
return MadeChange;
}
@@ -703,14 +711,14 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
// A constant can't be in the dead pointer set.
if (isa<Constant>(UnderlyingPointer))
return;
-
+
// If the kill pointer can be easily reduced to an alloca, don't bother doing
// extraneous AA queries.
if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer));
return;
}
-
+
SmallVector<Value*, 16> NowLive;
for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(),
E = DeadStackObjects.end(); I != E; ++I) {
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 3d3f17b..c0223d2 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -92,7 +92,7 @@ unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
// Hash in all of the operands as pointers.
unsigned Res = 0;
for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
- Res ^= getHash(Inst->getOperand(i)) << i;
+ Res ^= getHash(Inst->getOperand(i)) << (i & 0xF);
if (CastInst *CI = dyn_cast<CastInst>(Inst))
Res ^= getHash(CI->getType());
@@ -185,7 +185,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) {
assert(!Inst->getOperand(i)->getType()->isMetadataTy() &&
"Cannot value number calls with metadata operands");
- Res ^= getHash(Inst->getOperand(i)) << i;
+ Res ^= getHash(Inst->getOperand(i)) << (i & 0xF);
}
// Mix in the opcode.
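// A minimal standalone sketch of why the shift above is masked: shifting a
// 32-bit value by an amount >= 32 is undefined behaviour in C++, and calls
// or PHIs can easily have more than 32 operands.  Masking with 0xF keeps
// every shift in the range 0..15 while still spreading operand hashes
// across different bits.
#include <cstddef>
#include <cstdint>
#include <vector>

uint32_t mixOperandHashes(const std::vector<uint32_t> &OperandHashes) {
  uint32_t Res = 0;
  for (std::size_t I = 0; I != OperandHashes.size(); ++I)
    Res ^= OperandHashes[I] << (I & 0xF); // shift amount always in range
  return Res;
}

int main() {
  std::vector<uint32_t> Hashes(40, 1u); // 40 operands: unmasked would shift by 39
  return mixOperandHashes(Hashes) != 0 ? 0 : 1;
}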
@@ -357,7 +357,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If this is a non-volatile load, process it.
if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
// Ignore volatile loads.
- if (LI->isVolatile()) {
+ if (!LI->isSimple()) {
LastStore = 0;
continue;
}
@@ -437,7 +437,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration));
// Remember that this was the last store we saw for DSE.
- if (!SI->isVolatile())
+ if (SI->isSimple())
LastStore = SI;
}
}
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 87b7317..cbfdbcd 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -41,12 +41,16 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/PatternMatch.h"
using namespace llvm;
+using namespace PatternMatch;
STATISTIC(NumGVNInstr, "Number of instructions deleted");
STATISTIC(NumGVNLoad, "Number of loads deleted");
STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumGVNSimpl, "Number of instructions simplified");
+STATISTIC(NumGVNEqProp, "Number of equalities propagated");
STATISTIC(NumPRELoad, "Number of loads PRE'd");
static cl::opt<bool> EnablePRE("enable-pre",
@@ -63,7 +67,7 @@ static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
namespace {
struct Expression {
uint32_t opcode;
- const Type *type;
+ Type *type;
SmallVector<uint32_t, 4> varargs;
Expression(uint32_t o = ~2U) : opcode(o) { }
@@ -548,6 +552,9 @@ namespace {
void cleanupGlobalSets();
void verifyRemoved(const Instruction *I) const;
bool splitCriticalEdges();
+ unsigned replaceAllDominatedUsesWith(Value *From, Value *To,
+ BasicBlock *Root);
+ bool propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root);
};
char GVN::ID = 0;
@@ -655,7 +662,7 @@ SpeculationFailure:
/// CanCoerceMustAliasedValueToLoad - Return true if
/// CoerceAvailableValueToLoadType will succeed.
static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
- const Type *LoadTy,
+ Type *LoadTy,
const TargetData &TD) {
// If the loaded or stored value is a first-class array or struct, don't try
// to transform them. We need to be able to bitcast to integer.
@@ -680,17 +687,17 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
///
/// If we can't do it, return null.
static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
- const Type *LoadedTy,
+ Type *LoadedTy,
Instruction *InsertPt,
const TargetData &TD) {
if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD))
return 0;
// If this is already the right type, just return it.
- const Type *StoredValTy = StoredVal->getType();
+ Type *StoredValTy = StoredVal->getType();
- uint64_t StoreSize = TD.getTypeStoreSizeInBits(StoredValTy);
- uint64_t LoadSize = TD.getTypeStoreSizeInBits(LoadedTy);
+ uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy);
+ uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy);
// If the store and reload are the same size, we can always reuse it.
if (StoreSize == LoadSize) {
@@ -704,7 +711,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
}
- const Type *TypeToCastTo = LoadedTy;
+ Type *TypeToCastTo = LoadedTy;
if (TypeToCastTo->isPointerTy())
TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext());
@@ -743,7 +750,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
}
// Truncate the integer to the right size now.
- const Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize);
StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt);
if (LoadedTy == NewIntTy)
@@ -765,7 +772,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
/// Check this case to see if there is anything more we can do before we give
/// up. This returns -1 if we have to give up, or a byte number in the stored
/// value of the piece that feeds the load.
-static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr,
+static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
Value *WritePtr,
uint64_t WriteSizeInBits,
const TargetData &TD) {
@@ -839,7 +846,7 @@ static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr,
/// AnalyzeLoadFromClobberingStore - This function is called when we have a
/// memdep query of a load that ends up being a clobbering store.
-static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr,
+static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
StoreInst *DepSI,
const TargetData &TD) {
// Cannot handle reading from store of first-class aggregate yet.
@@ -856,7 +863,7 @@ static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr,
/// AnalyzeLoadFromClobberingLoad - This function is called when we have a
/// memdep query of a load that ends up being clobbered by another load. See if
/// the other load can feed into the second load.
-static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr,
+static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
LoadInst *DepLI, const TargetData &TD){
// Cannot handle reading from store of first-class aggregate yet.
if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
@@ -883,7 +890,7 @@ static int AnalyzeLoadFromClobberingLoad(const Type *LoadTy, Value *LoadPtr,
-static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr,
+static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
MemIntrinsic *MI,
const TargetData &TD) {
// If the mem operation is a non-constant size, we can't handle it.
@@ -920,7 +927,7 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr,
llvm::Type::getInt8PtrTy(Src->getContext()));
Constant *OffsetCst =
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1);
+ Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
if (ConstantFoldLoadFromConstPtr(Src, &TD))
return Offset;
@@ -934,7 +941,7 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr,
/// mustalias. Check this case to see if there is anything more we can do
/// before we give up.
static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
- const Type *LoadTy,
+ Type *LoadTy,
Instruction *InsertPt, const TargetData &TD){
LLVMContext &Ctx = SrcVal->getType()->getContext();
@@ -946,10 +953,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
// Compute which bits of the stored value are being used by the load. Convert
// to an integer type to start with.
if (SrcVal->getType()->isPointerTy())
- SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx), "tmp");
+ SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx));
if (!SrcVal->getType()->isIntegerTy())
- SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8),
- "tmp");
+ SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
// Shift the bits to the least significant depending on endianness.
unsigned ShiftAmt;
@@ -959,11 +965,10 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
ShiftAmt = (StoreSize-LoadSize-Offset)*8;
if (ShiftAmt)
- SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt, "tmp");
+ SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
if (LoadSize != StoreSize)
- SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8),
- "tmp");
+ SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD);
}
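// A minimal standalone check of the shift computed above, assuming the
// usual convention: on little-endian targets the byte at offset N sits N*8
// bits up from the least significant end, while on big-endian targets it
// sits (StoreSize - LoadSize - Offset) * 8 bits up.
#include <cstdint>

uint64_t shiftAmtBits(uint64_t StoreSize, uint64_t LoadSize, uint64_t Offset,
                      bool LittleEndian) {
  return (LittleEndian ? Offset : StoreSize - LoadSize - Offset) * 8;
}

int main() {
  // 4-byte store feeding a 1-byte load at byte offset 1.
  if (shiftAmtBits(4, 1, 1, /*LittleEndian=*/true) != 8) return 1;
  if (shiftAmtBits(4, 1, 1, /*LittleEndian=*/false) != 16) return 1;
  return 0;
}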
@@ -974,7 +979,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
/// because the pointers don't mustalias. Check this case to see if there is
/// anything more we can do before we give up.
static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
- const Type *LoadTy, Instruction *InsertPt,
+ Type *LoadTy, Instruction *InsertPt,
GVN &gvn) {
const TargetData &TD = *gvn.getTargetData();
// If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
@@ -982,8 +987,8 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType());
unsigned LoadSize = TD.getTypeStoreSize(LoadTy);
if (Offset+LoadSize > SrcValSize) {
- assert(!SrcVal->isVolatile() && "Cannot widen volatile load!");
- assert(isa<IntegerType>(SrcVal->getType())&&"Can't widen non-integer load");
+ assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
// If we have a load/load clobber and DepLI can be widened to cover this
// load, then we should widen it to the next power of 2 size big enough!
unsigned NewLoadSize = Offset+LoadSize;
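// A minimal standalone check of the widening arithmetic above: to also
// cover a load of LoadSize bytes at Offset inside an earlier load, the
// earlier load is grown to the next power of two holding Offset + LoadSize.
#include <cstdint>

uint64_t nextPowerOf2(uint64_t N) {
  uint64_t P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

int main() {
  // A 4-byte load at offset 3 needs 7 bytes, so widen to 8.
  if (nextPowerOf2(3 + 4) != 8) return 1;
  // An exact power of two stays put.
  if (nextPowerOf2(8) != 8) return 1;
  return 0;
}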
@@ -996,7 +1001,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
// memdep queries will find the new load. We can't easily remove the old
// load completely because it is already in the value numbering table.
IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- const Type *DestPTy =
+ Type *DestPTy =
IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
DestPTy = PointerType::get(DestPTy,
cast<PointerType>(PtrVal->getType())->getAddressSpace());
@@ -1034,7 +1039,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
/// GetMemInstValueForLoad - This function is called when we have a
/// memdep query of a load that ends up being a clobbering mem intrinsic.
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- const Type *LoadTy, Instruction *InsertPt,
+ Type *LoadTy, Instruction *InsertPt,
const TargetData &TD){
LLVMContext &Ctx = LoadTy->getContext();
uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;
@@ -1081,7 +1086,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
llvm::Type::getInt8PtrTy(Src->getContext()));
Constant *OffsetCst =
ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Src, &OffsetCst, 1);
+ Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
return ConstantFoldLoadFromConstPtr(Src, &TD);
}
@@ -1154,7 +1159,7 @@ struct AvailableValueInBlock {
/// MaterializeAdjustedValue - Emit code into this block to adjust the value
/// defined here to the specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(const Type *LoadTy, GVN &gvn) const {
+ Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const {
Value *Res;
if (isSimpleValue()) {
Res = getSimpleValue();
@@ -1213,7 +1218,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
SSAUpdater SSAUpdate(&NewPHIs);
SSAUpdate.Initialize(LI->getType(), LI->getName());
- const Type *LoadTy = LI->getType();
+ Type *LoadTy = LI->getType();
for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) {
const AvailableValueInBlock &AV = ValuesPerBlock[i];
@@ -1274,7 +1279,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
// If we had a phi translation failure, we'll have a single entry which is a
// clobber in the current block. Reject this early.
- if (Deps.size() == 1 && Deps[0].getResult().isUnknown()) {
+ if (Deps.size() == 1
+ && !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber())
+ {
DEBUG(
dbgs() << "GVN: non-local load ";
WriteAsOperand(dbgs(), LI);
@@ -1294,7 +1301,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
BasicBlock *DepBB = Deps[i].getBB();
MemDepResult DepInfo = Deps[i].getResult();
- if (DepInfo.isUnknown()) {
+ if (!DepInfo.isDef() && !DepInfo.isClobber()) {
UnavailableBlocks.push_back(DepBB);
continue;
}
@@ -1359,7 +1366,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
continue;
}
- assert(DepInfo.isDef() && "Expecting def here");
+ // DepInfo.isDef() is known to hold at this point.
Instruction *DepInst = DepInfo.getInst();
@@ -1446,8 +1453,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
Blockers.insert(UnavailableBlocks[i]);
- // Lets find first basic block with more than one predecessor. Walk backwards
- // through predecessors if needed.
+ // Let's find the first basic block with more than one predecessor. Walk
+ // backwards through predecessors if needed.
BasicBlock *LoadBB = LI->getParent();
BasicBlock *TmpBB = LoadBB;
@@ -1519,10 +1526,19 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
<< Pred->getName() << "': " << *LI << '\n');
return false;
}
+
+ if (LoadBB->isLandingPad()) {
+ DEBUG(dbgs()
+ << "COULD NOT PRE LOAD BECAUSE OF LANDING PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB);
NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum));
}
}
+
if (!NeedToSplit.empty()) {
toSplit.append(NeedToSplit.begin(), NeedToSplit.end());
return false;
@@ -1660,7 +1676,7 @@ bool GVN::processLoad(LoadInst *L) {
if (!MD)
return false;
- if (L->isVolatile())
+ if (!L->isSimple())
return false;
if (L->use_empty()) {
@@ -1747,7 +1763,11 @@ bool GVN::processLoad(LoadInst *L) {
return false;
}
- if (Dep.isUnknown()) {
+ // If it is defined in another block, try harder.
+ if (Dep.isNonLocal())
+ return processNonLocalLoad(L);
+
+ if (!Dep.isDef()) {
DEBUG(
// fast print dep, using operator<< on instruction is too slow.
dbgs() << "GVN: load ";
@@ -1757,12 +1777,6 @@ bool GVN::processLoad(LoadInst *L) {
return false;
}
- // If it is defined in another block, try harder.
- if (Dep.isNonLocal())
- return processNonLocalLoad(L);
-
- assert(Dep.isDef() && "Expecting def here");
-
Instruction *DepInst = Dep.getInst();
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
Value *StoredVal = DepSI->getValueOperand();
@@ -1874,6 +1888,133 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) {
return Val;
}
+/// replaceAllDominatedUsesWith - Replace all uses of 'From' with 'To' if the
+/// use is dominated by the given basic block. Returns the number of uses that
+/// were replaced.
+unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To,
+ BasicBlock *Root) {
+ unsigned Count = 0;
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE; ) {
+ Instruction *User = cast<Instruction>(*UI);
+ unsigned OpNum = UI.getOperandNo();
+ ++UI;
+
+ if (DT->dominates(Root, User->getParent())) {
+ User->setOperand(OpNum, To);
+ ++Count;
+ }
+ }
+ return Count;
+}
+
+/// propagateEquality - The given values are known to be equal in every block
+/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
+/// 'RHS' everywhere in the scope. Returns whether a change was made.
+bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) {
+ if (LHS == RHS) return false;
+ assert(LHS->getType() == RHS->getType() && "Equal but types differ!");
+
+ // Don't try to propagate equalities between constants.
+ if (isa<Constant>(LHS) && isa<Constant>(RHS))
+ return false;
+
+ // Make sure that any constants are on the right-hand side. In general the
+ // best results are obtained by placing the longest lived value on the RHS.
+ if (isa<Constant>(LHS))
+ std::swap(LHS, RHS);
+
+ // If neither term is constant then bail out. This is not for correctness,
+ // it's just that the non-constant case is much less useful: it occurs just
+ // as often as the constant case but handling it hardly ever results in an
+ // improvement.
+ if (!isa<Constant>(RHS))
+ return false;
+
+ // If value numbering later deduces that an instruction in the scope is equal
+ // to 'LHS' then ensure it will be turned into 'RHS'.
+ addToLeaderTable(VN.lookup_or_add(LHS), RHS, Root);
+
+ // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope.
+ unsigned NumReplacements = replaceAllDominatedUsesWith(LHS, RHS, Root);
+ bool Changed = NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+
+ // Now try to deduce additional equalities from this one. For example, if the
+ // known equality was "(A != B)" == "false" then it follows that A and B are
+ // equal in the scope. Only boolean equalities with an explicit true or false
+ // RHS are currently supported.
+ if (!RHS->getType()->isIntegerTy(1))
+ // Not a boolean equality - bail out.
+ return Changed;
+ ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ // RHS neither 'true' nor 'false' - bail out.
+ return Changed;
+ // Whether RHS equals 'true'. Otherwise it equals 'false'.
+ bool isKnownTrue = CI->isAllOnesValue();
+ bool isKnownFalse = !isKnownTrue;
+
+ // If "A && B" is known true then both A and B are known true. If "A || B"
+ // is known false then both A and B are known false.
+ Value *A, *B;
+ if ((isKnownTrue && match(LHS, m_And(m_Value(A), m_Value(B)))) ||
+ (isKnownFalse && match(LHS, m_Or(m_Value(A), m_Value(B))))) {
+ Changed |= propagateEquality(A, RHS, Root);
+ Changed |= propagateEquality(B, RHS, Root);
+ return Changed;
+ }
+
+ // If we are propagating an equality like "(A == B)" == "true" then also
+ // propagate the equality A == B.
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(LHS)) {
+ // Only equality comparisons are supported.
+ if ((isKnownTrue && Cmp->getPredicate() == CmpInst::ICMP_EQ) ||
+ (isKnownFalse && Cmp->getPredicate() == CmpInst::ICMP_NE)) {
+ Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+ Changed |= propagateEquality(Op0, Op1, Root);
+ }
+ return Changed;
+ }
+
+ return Changed;
+}
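// A minimal standalone sketch of the recursive deduction above: once an
// expression is known true or false, the fact is pushed into AND/OR
// operands ("A && B" known true forces both true; "A || B" known false
// forces both false), and an equality known true lets one side stand in
// for the other.
#include <map>
#include <string>

enum Kind { Leaf, And, Or };

struct Expr {
  Kind K;
  Expr *L, *R;
  std::string Name; // used for leaves only
};

void propagate(const Expr *E, bool Known, std::map<std::string, bool> &Facts) {
  if (E->K == Leaf) {
    Facts[E->Name] = Known;
    return;
  }
  if ((E->K == And && Known) || (E->K == Or && !Known)) {
    propagate(E->L, Known, Facts);
    propagate(E->R, Known, Facts);
  }
  // A false AND / true OR pins neither operand, so nothing is recorded.
}

int main() {
  Expr A = {Leaf, 0, 0, "A"};
  Expr B = {Leaf, 0, 0, "B"};
  Expr Cond = {And, &A, &B, ""};
  std::map<std::string, bool> Facts;
  propagate(&Cond, true, Facts); // on the edge where Cond is known true
  return (Facts["A"] && Facts["B"]) ? 0 : 1;
}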
+
+/// isOnlyReachableViaThisEdge - There is an edge from 'Src' to 'Dst'. Return
+/// true if every path from the entry block to 'Dst' passes via this edge. In
+/// particular 'Dst' must not be reachable via another edge from 'Src'.
+static bool isOnlyReachableViaThisEdge(BasicBlock *Src, BasicBlock *Dst,
+ DominatorTree *DT) {
+ // First off, there must not be more than one edge from Src to Dst; there
+ // should be exactly one. So keep track of the number of times Src occurs
+ // as a predecessor of Dst and fail if it's more than once. Secondly, any
+ // other predecessors of Dst should be dominated by Dst (see logic below).
+ bool SawEdgeFromSrc = false;
+ for (pred_iterator PI = pred_begin(Dst), PE = pred_end(Dst); PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (Pred == Src) {
+ // An edge from Src to Dst.
+ if (SawEdgeFromSrc)
+ // There are multiple edges from Src to Dst - fail.
+ return false;
+ SawEdgeFromSrc = true;
+ continue;
+ }
+ // If the predecessor is not dominated by Dst, then it must be possible to
+ // reach it either without passing through Src (and thus not via the edge)
+ // or by passing through Src but taking a different edge out of Src. Either
+ // way it is possible to reach Dst without passing via the edge, so fail.
+ if (!DT->dominates(Dst, *PI))
+ return false;
+ }
+ assert(SawEdgeFromSrc && "No edge between these basic blocks!");
+
+ // Every path from the entry block to Dst must at some point pass to Dst from
+ // a predecessor that is not dominated by Dst. This predecessor can only be
+ // Src, since all others are dominated by Dst. As there is only one edge from
+ // Src to Dst, the path passes by this edge.
+ return true;
+}
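// A minimal standalone sketch of the edge test above on a tiny graph, with
// preds(Dst) and dominates(Dst, Pred) flattened into plain data: Dst is
// only reachable via Src->Dst when Src appears exactly once among Dst's
// predecessors and every other predecessor is dominated by Dst (i.e. lies
// on a cycle back through Dst, such as a loop backedge).
#include <cstddef>
#include <vector>

bool onlyReachableViaEdge(int Src, const std::vector<int> &PredsOfDst,
                          const std::vector<bool> &DstDominatesPred) {
  bool SawEdgeFromSrc = false;
  for (std::size_t I = 0; I != PredsOfDst.size(); ++I) {
    if (PredsOfDst[I] == Src) {
      if (SawEdgeFromSrc)
        return false; // two edges from Src to Dst
      SawEdgeFromSrc = true;
      continue;
    }
    if (!DstDominatesPred[I])
      return false; // Dst reachable some other way
  }
  return SawEdgeFromSrc;
}

int main() {
  std::vector<int> Preds(2);
  std::vector<bool> Dom(2);
  // Dst's preds: {Src=1, Latch=2}; Dst dominates the latch (a backedge).
  Preds[0] = 1; Preds[1] = 2; Dom[0] = false; Dom[1] = true;
  if (!onlyReachableViaEdge(1, Preds, Dom)) return 1;
  // Dst's preds: {Src=1, Other=3}; Other is not dominated by Dst.
  Preds[1] = 3; Dom[1] = false;
  if (onlyReachableViaEdge(1, Preds, Dom)) return 1;
  return 0;
}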
/// processInstruction - When calculating availability, handle an instruction
/// by inserting it into the appropriate sets
@@ -1891,6 +2032,7 @@ bool GVN::processInstruction(Instruction *I) {
if (MD && V->getType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(I);
+ ++NumGVNSimpl;
return true;
}
@@ -1903,30 +2045,45 @@ bool GVN::processInstruction(Instruction *I) {
return false;
}
- // For conditions branches, we can perform simple conditional propagation on
+ // For conditional branches, we can perform simple conditional propagation on
// the condition value itself.
if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
return false;
-
+
Value *BranchCond = BI->getCondition();
- uint32_t CondVN = VN.lookup_or_add(BranchCond);
-
+
BasicBlock *TrueSucc = BI->getSuccessor(0);
BasicBlock *FalseSucc = BI->getSuccessor(1);
-
- if (TrueSucc->getSinglePredecessor())
- addToLeaderTable(CondVN,
- ConstantInt::getTrue(TrueSucc->getContext()),
- TrueSucc);
- if (FalseSucc->getSinglePredecessor())
- addToLeaderTable(CondVN,
- ConstantInt::getFalse(TrueSucc->getContext()),
- FalseSucc);
-
- return false;
+ BasicBlock *Parent = BI->getParent();
+ bool Changed = false;
+
+ if (isOnlyReachableViaThisEdge(Parent, TrueSucc, DT))
+ Changed |= propagateEquality(BranchCond,
+ ConstantInt::getTrue(TrueSucc->getContext()),
+ TrueSucc);
+
+ if (isOnlyReachableViaThisEdge(Parent, FalseSucc, DT))
+ Changed |= propagateEquality(BranchCond,
+ ConstantInt::getFalse(FalseSucc->getContext()),
+ FalseSucc);
+
+ return Changed;
}
-
+
+ // For switches, propagate the case values into the case destinations.
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Value *SwitchCond = SI->getCondition();
+ BasicBlock *Parent = SI->getParent();
+ bool Changed = false;
+ for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) {
+ BasicBlock *Dst = SI->getSuccessor(i);
+ if (isOnlyReachableViaThisEdge(Parent, Dst, DT))
+ Changed |= propagateEquality(SwitchCond, SI->getCaseValue(i), Dst);
+ }
+ return Changed;
+ }
+
// Instructions with void type don't return a value, so there's
// no point in trying to find redudancies in them.
if (I->getType()->isVoidTy()) return false;
@@ -2071,6 +2228,9 @@ bool GVN::performPRE(Function &F) {
// Nothing to PRE in the entry block.
if (CurrentBlock == &F.getEntryBlock()) continue;
+ // Don't perform PRE on a landing pad.
+ if (CurrentBlock->isLandingPad()) continue;
+
for (BasicBlock::iterator BI = CurrentBlock->begin(),
BE = CurrentBlock->end(); BI != BE; ) {
Instruction *CurInst = BI++;
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index dee3d38..75fa011 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -11,17 +11,6 @@
// computations derived from them) into simpler forms suitable for subsequent
// analysis and transformation.
//
-// This transformation makes the following changes to each loop with an
-// identifiable induction variable:
-// 1. All loops are transformed to have a SINGLE canonical induction variable
-// which starts at zero and steps by one.
-// 2. The canonical induction variable is guaranteed to be the first PHI node
-// in the loop header block.
-// 3. The canonical induction variable is guaranteed to be in a wide enough
-// type so that IV expressions need not be (directly) zero-extended or
-// sign-extended.
-// 4. Any pointer arithmetic recurrences are raised to use array subscripts.
-//
// If the trip count of a loop is computable, this pass also makes the following
// changes:
// 1. The exit condition for the loop is canonicalized to compare the
@@ -33,9 +22,6 @@
// purpose of the loop is to compute the exit value of some derived
// expression, this transformation will make the loop dead.
//
-// This transformation should be followed by strength reduction after all of the
-// desired loop transformations have been performed.
-//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "indvars"
@@ -57,11 +43,11 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Target/TargetData.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
using namespace llvm;
STATISTIC(NumRemoved , "Number of aux indvars removed");
@@ -69,15 +55,21 @@ STATISTIC(NumWidened , "Number of indvars widened");
STATISTIC(NumInserted , "Number of canonical indvars added");
STATISTIC(NumReplaced , "Number of exit values replaced");
STATISTIC(NumLFTR , "Number of loop exit tests replaced");
-STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
-STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
-STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
-static cl::opt<bool> DisableIVRewrite(
- "disable-iv-rewrite", cl::Hidden,
- cl::desc("Disable canonical induction variable rewriting"));
+namespace llvm {
+ cl::opt<bool> EnableIVRewrite(
+ "enable-iv-rewrite", cl::Hidden,
+ cl::desc("Enable canonical induction variable rewriting"));
+
+ // Trip count verification can be enabled by default under NDEBUG if we
+ // implement a strong expression equivalence checker in SCEV. Until then, we
+ // use the verify-indvars flag, which may assert in some cases.
+ cl::opt<bool> VerifyIndvars(
+ "verify-indvars", cl::Hidden,
+ cl::desc("Verify the ScalarEvolution result after running indvars"));
+}
namespace {
class IndVarSimplify : public LoopPass {
@@ -105,12 +97,12 @@ namespace {
AU.addRequired<ScalarEvolution>();
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
- if (!DisableIVRewrite)
+ if (EnableIVRewrite)
AU.addRequired<IVUsers>();
AU.addPreserved<ScalarEvolution>();
AU.addPreservedID(LoopSimplifyID);
AU.addPreservedID(LCSSAID);
- if (!DisableIVRewrite)
+ if (EnableIVRewrite)
AU.addPreserved<IVUsers>();
AU.setPreservesCFG();
}
@@ -125,24 +117,14 @@ namespace {
void HandleFloatingPointIV(Loop *L, PHINode *PH);
void RewriteNonIntegerIVs(Loop *L);
- void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
-
- void SimplifyIVUsers(SCEVExpander &Rewriter);
- void SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter);
+ void SimplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LPPassManager &LPM);
- bool EliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
- void EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
- void EliminateIVRemainder(BinaryOperator *Rem,
- Value *IVOperand,
- bool IsSigned);
-
- void SimplifyCongruentIVs(Loop *L);
+ void RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
void RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter);
- ICmpInst *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
- PHINode *IndVar,
- SCEVExpander &Rewriter);
+ Value *LinearFunctionTestReplace(Loop *L, const SCEV *BackedgeTakenCount,
+ PHINode *IndVar, SCEVExpander &Rewriter);
void SinkUnusedInvariants(Loop *L);
};
@@ -211,6 +193,36 @@ bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
return true;
}
+/// Determine the insertion point for this user. By default, insert immediately
+/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
+/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
+/// common dominator for the incoming blocks.
+static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
+ DominatorTree *DT) {
+ PHINode *PHI = dyn_cast<PHINode>(User);
+ if (!PHI)
+ return User;
+
+ Instruction *InsertPt = 0;
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) {
+ if (PHI->getIncomingValue(i) != Def)
+ continue;
+
+ BasicBlock *InsertBB = PHI->getIncomingBlock(i);
+ if (!InsertPt) {
+ InsertPt = InsertBB->getTerminator();
+ continue;
+ }
+ InsertBB = DT->findNearestCommonDominator(InsertPt->getParent(), InsertBB);
+ InsertPt = InsertBB->getTerminator();
+ }
+ assert(InsertPt && "Missing phi operand");
+ assert((!isa<Instruction>(Def) ||
+ DT->dominates(cast<Instruction>(Def), InsertPt)) &&
+ "def does not dominate all uses");
+ return InsertPt;
+}
+
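// For example (illustrative IR), if %def reaches the same phi along two edges:
//
//   %phi = phi i32 [ %def, %bb1 ], [ %def, %bb2 ]
//
// getInsertPointForUses() returns the terminator of the nearest common
// dominator of %bb1 and %bb2 -- the latest point that still dominates every
// incoming edge through which %def is used.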
//===----------------------------------------------------------------------===//
// RewriteNonIntegerIVs and helpers. Prefer integer IVs.
//===----------------------------------------------------------------------===//
@@ -337,14 +349,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
// Positive and negative strides have different safety conditions.
if (IncValue > 0) {
// If we have a positive stride, we require the init to be less than the
- // exit value and an equality or less than comparison.
- if (InitValue >= ExitValue ||
- NewPred == CmpInst::ICMP_SGT || NewPred == CmpInst::ICMP_SGE)
+ // exit value.
+ if (InitValue >= ExitValue)
return;
uint32_t Range = uint32_t(ExitValue-InitValue);
- if (NewPred == CmpInst::ICMP_SLE) {
- // Normalize SLE -> SLT, check for infinite loop.
+ // Check for infinite loop, either:
+ // while (i <= Exit) or until (i > Exit)
+ if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
if (++Range == 0) return; // Range overflows.
}
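      // Worked example (hypothetical values): InitValue == INT32_MIN and
      // ExitValue == INT32_MAX with "while (i <= Exit)" yields
      // Range == 0xffffffff; ++Range wraps to 0, so no 32-bit trip count
      // exists and the FP induction variable is left untouched.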
@@ -364,14 +376,14 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
} else {
// If we have a negative stride, we require the init to be greater than the
- // exit value and an equality or greater than comparison.
- if (InitValue >= ExitValue ||
- NewPred == CmpInst::ICMP_SLT || NewPred == CmpInst::ICMP_SLE)
+ // exit value.
+ if (InitValue <= ExitValue)
return;
uint32_t Range = uint32_t(InitValue-ExitValue);
- if (NewPred == CmpInst::ICMP_SGE) {
- // Normalize SGE -> SGT, check for infinite loop.
+ // Check for infinite loop, either:
+ // while (i >= Exit) or until (i < Exit)
+ if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
if (++Range == 0) return; // Range overflows.
}
@@ -390,7 +402,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
return;
}
- const IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
// Insert new integer induction variable.
PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
@@ -429,7 +441,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
// platforms.
if (WeakPH) {
Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
- PN->getParent()->getFirstNonPHI());
+ PN->getParent()->getFirstInsertionPt());
PN->replaceAllUsesWith(Conv);
RecursivelyDeleteTriviallyDeadInstructions(PN);
}
@@ -437,6 +449,8 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
// Add a new IVUsers entry for the newly-created integer PHI.
if (IU)
IU->AddUsersIfInteresting(NewPHI);
+
+ Changed = true;
}
void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
@@ -582,45 +596,15 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
//===----------------------------------------------------------------------===//
// Rewrite IV users based on a canonical IV.
-// To be replaced by -disable-iv-rewrite.
+// Only for use with -enable-iv-rewrite.
//===----------------------------------------------------------------------===//
-/// SimplifyIVUsers - Iteratively perform simplification on IVUsers within this
-/// loop. IVUsers is treated as a worklist. Each successive simplification may
-/// push more users which may themselves be candidates for simplification.
-///
-/// This is the old approach to IV simplification to be replaced by
-/// SimplifyIVUsersNoRewrite.
-///
-void IndVarSimplify::SimplifyIVUsers(SCEVExpander &Rewriter) {
- // Each round of simplification involves a round of eliminating operations
- // followed by a round of widening IVs. A single IVUsers worklist is used
- // across all rounds. The inner loop advances the user. If widening exposes
- // more uses, then another pass through the outer loop is triggered.
- for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) {
- Instruction *UseInst = I->getUser();
- Value *IVOperand = I->getOperandValToReplace();
-
- if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
- EliminateIVComparison(ICmp, IVOperand);
- continue;
- }
- if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
- bool IsSigned = Rem->getOpcode() == Instruction::SRem;
- if (IsSigned || Rem->getOpcode() == Instruction::URem) {
- EliminateIVRemainder(Rem, IVOperand, IsSigned);
- continue;
- }
- }
- }
-}
-
-// FIXME: It is an extremely bad idea to indvar substitute anything more
-// complex than affine induction variables. Doing so will put expensive
-// polynomial evaluations inside of the loop, and the str reduction pass
-// currently can only reduce affine polynomials. For now just disable
-// indvar subst on anything more complex than an affine addrec, unless
-// it can be expanded to a trivial value.
+/// FIXME: It is an extremely bad idea to indvar substitute anything more
+/// complex than affine induction variables. Doing so will put expensive
+/// polynomial evaluations inside of the loop, and the strength reduction pass
+/// currently can only reduce affine polynomials. For now just disable
+/// indvar subst on anything more complex than an affine addrec, unless
+/// it can be expanded to a trivial value.
static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) {
// Loop-invariant values are safe.
if (SE->isLoopInvariant(S, L)) return true;
@@ -631,7 +615,8 @@ static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) {
return AR->isAffine();
  // An add is safe if all its operands are safe.
- if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) {
+ if (const SCEVCommutativeExpr *Commutative
+ = dyn_cast<SCEVCommutativeExpr>(S)) {
for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(),
E = Commutative->op_end(); I != E; ++I)
if (!isSafe(*I, L, SE)) return false;
@@ -665,7 +650,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) {
// of different sizes.
for (IVUsers::iterator UI = IU->begin(), E = IU->end(); UI != E; ++UI) {
Value *Op = UI->getOperandValToReplace();
- const Type *UseTy = Op->getType();
+ Type *UseTy = Op->getType();
Instruction *User = UI->getUser();
// Compute the final addrec to expand into code.
@@ -692,18 +677,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) {
// hoist loop invariants out of the loop. For PHI nodes, there may be
// multiple uses, so compute the nearest common dominator for the
// incoming blocks.
- Instruction *InsertPt = User;
- if (PHINode *PHI = dyn_cast<PHINode>(InsertPt))
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
- if (PHI->getIncomingValue(i) == Op) {
- if (InsertPt == User)
- InsertPt = PHI->getIncomingBlock(i)->getTerminator();
- else
- InsertPt =
- DT->findNearestCommonDominator(InsertPt->getParent(),
- PHI->getIncomingBlock(i))
- ->getTerminator();
- }
+ Instruction *InsertPt = getInsertPointForUses(User, Op, DT);
// Now expand it into actual Instructions and patch it into place.
Value *NewVal = Rewriter.expandCodeFor(AR, UseTy, InsertPt);
@@ -747,19 +721,38 @@ namespace {
 // extend operations. This information is recorded by WideIVVisitor and
// provides the input to WidenIV.
struct WideIVInfo {
- const Type *WidestNativeType; // Widest integer type created [sz]ext
- bool IsSigned; // Was an sext user seen before a zext?
+ PHINode *NarrowIV;
+ Type *WidestNativeType; // Widest integer type created [sz]ext
+ bool IsSigned; // Was an sext user seen before a zext?
- WideIVInfo() : WidestNativeType(0), IsSigned(false) {}
+ WideIVInfo() : NarrowIV(0), WidestNativeType(0), IsSigned(false) {}
+ };
+
+ class WideIVVisitor : public IVVisitor {
+ ScalarEvolution *SE;
+ const TargetData *TD;
+
+ public:
+ WideIVInfo WI;
+
+ WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV,
+ const TargetData *TData) :
+ SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; }
+
+ // Implement the interface used by simplifyUsersOfIV.
+ virtual void visitCast(CastInst *Cast);
};
}
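// Illustrative wiring (this is how SimplifyAndExtend below uses the visitor):
//
//   WideIVVisitor WIV(CurrIV, SE, TD);
//   simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV);
//
// simplifyUsersOfIV() calls back into visitCast() for each sign/zero extend
// user it encounters, so WideIVInfo accumulates the widest extend type
// without this pass walking the def-use chains a second time.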
-/// CollectExtend - Update information about the induction variable that is
+/// visitCast - Update information about the induction variable that is
/// extended by this sign or zero extend operation. This is used to determine
/// the final width of the IV before actually widening it.
-static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI,
- ScalarEvolution *SE, const TargetData *TD) {
- const Type *Ty = Cast->getType();
+void WideIVVisitor::visitCast(CastInst *Cast) {
+ bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+ if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
+ return;
+
+ Type *Ty = Cast->getType();
uint64_t Width = SE->getTypeSizeInBits(Ty);
if (TD && !TD->isLegalInteger(Width))
return;
@@ -779,6 +772,21 @@ static void CollectExtend(CastInst *Cast, bool IsSigned, WideIVInfo &WI,
}
namespace {
+
+/// NarrowIVDefUse - Record a link in the Narrow IV def-use chain along with the
+/// WideIV that computes the same value as the Narrow IV def. This avoids
+/// caching Use* pointers.
+struct NarrowIVDefUse {
+ Instruction *NarrowDef;
+ Instruction *NarrowUse;
+ Instruction *WideDef;
+
+ NarrowIVDefUse(): NarrowDef(0), NarrowUse(0), WideDef(0) {}
+
+ NarrowIVDefUse(Instruction *ND, Instruction *NU, Instruction *WD):
+ NarrowDef(ND), NarrowUse(NU), WideDef(WD) {}
+};
+
/// WidenIV - The goal of this transform is to remove sign and zero extends
/// without creating any new induction variables. To do this, it creates a new
/// phi of the wider type and redirects all users, either removing extends or
@@ -787,7 +795,7 @@ namespace {
class WidenIV {
// Parameters
PHINode *OrigPhi;
- const Type *WideType;
+ Type *WideType;
bool IsSigned;
// Context
@@ -803,13 +811,13 @@ class WidenIV {
SmallVectorImpl<WeakVH> &DeadInsts;
SmallPtrSet<Instruction*,16> Widened;
- SmallVector<std::pair<Use *, Instruction *>, 8> NarrowIVUsers;
+ SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
public:
- WidenIV(PHINode *PN, const WideIVInfo &WI, LoopInfo *LInfo,
+ WidenIV(const WideIVInfo &WI, LoopInfo *LInfo,
ScalarEvolution *SEv, DominatorTree *DTree,
SmallVectorImpl<WeakVH> &DI) :
- OrigPhi(PN),
+ OrigPhi(WI.NarrowIV),
WideType(WI.WidestNativeType),
IsSigned(WI.IsSigned),
LI(LInfo),
@@ -826,21 +834,42 @@ public:
PHINode *CreateWideIV(SCEVExpander &Rewriter);
protected:
- Instruction *CloneIVUser(Instruction *NarrowUse,
- Instruction *NarrowDef,
- Instruction *WideDef);
+ Value *getExtend(Value *NarrowOper, Type *WideType, bool IsSigned,
+ Instruction *Use);
+
+ Instruction *CloneIVUser(NarrowIVDefUse DU);
const SCEVAddRecExpr *GetWideRecurrence(Instruction *NarrowUse);
- Instruction *WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef,
- Instruction *WideDef);
+ const SCEVAddRecExpr* GetExtendedOperandRecurrence(NarrowIVDefUse DU);
+
+ Instruction *WidenIVUse(NarrowIVDefUse DU);
void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
};
} // anonymous namespace
-static Value *getExtend( Value *NarrowOper, const Type *WideType,
- bool IsSigned, IRBuilder<> &Builder) {
+/// isLoopInvariant - Perform a quick domtree based check for loop invariance
+/// assuming that V is used within the loop. LoopInfo::isLoopInvariant() seems
+/// gratuitous for this purpose.
+static bool isLoopInvariant(Value *V, const Loop *L, const DominatorTree *DT) {
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return true;
+
+ return DT->properlyDominates(Inst->getParent(), L->getHeader());
+}
+
+Value *WidenIV::getExtend(Value *NarrowOper, Type *WideType, bool IsSigned,
+ Instruction *Use) {
+ // Set the debug location and conservative insertion point.
+ IRBuilder<> Builder(Use);
+ // Hoist the insertion point into loop preheaders as far as possible.
+ for (const Loop *L = LI->getLoopFor(Use->getParent());
+ L && L->getLoopPreheader() && isLoopInvariant(NarrowOper, L, DT);
+ L = L->getParentLoop())
+ Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+
return IsSigned ? Builder.CreateSExt(NarrowOper, WideType) :
Builder.CreateZExt(NarrowOper, WideType);
}
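// For instance, if the narrow operand is defined outside a two-deep loop nest,
// the hoisting loop above walks out through both preheaders and the extend is
// emitted once, in the outermost qualifying preheader (illustrative IR):
//
//   outer.preheader:
//     %wide = sext i32 %narrow to i64   ; executed once, not per iteration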
@@ -848,10 +877,8 @@ static Value *getExtend( Value *NarrowOper, const Type *WideType,
/// CloneIVUser - Instantiate a wide operation to replace a narrow
 /// operation. This only needs to handle operations that can evaluate to
/// SCEVAddRec. It can safely return 0 for any operation we decide not to clone.
-Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse,
- Instruction *NarrowDef,
- Instruction *WideDef) {
- unsigned Opcode = NarrowUse->getOpcode();
+Instruction *WidenIV::CloneIVUser(NarrowIVDefUse DU) {
+ unsigned Opcode = DU.NarrowUse->getOpcode();
switch (Opcode) {
default:
return 0;
@@ -865,24 +892,23 @@ Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse,
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
- DEBUG(dbgs() << "Cloning IVUser: " << *NarrowUse << "\n");
-
- IRBuilder<> Builder(NarrowUse);
+ DEBUG(dbgs() << "Cloning IVUser: " << *DU.NarrowUse << "\n");
// Replace NarrowDef operands with WideDef. Otherwise, we don't know
// anything about the narrow operand yet so must insert a [sz]ext. It is
// probably loop invariant and will be folded or hoisted. If it actually
// comes from a widened IV, it should be removed during a future call to
// WidenIVUse.
- Value *LHS = (NarrowUse->getOperand(0) == NarrowDef) ? WideDef :
- getExtend(NarrowUse->getOperand(0), WideType, IsSigned, Builder);
- Value *RHS = (NarrowUse->getOperand(1) == NarrowDef) ? WideDef :
- getExtend(NarrowUse->getOperand(1), WideType, IsSigned, Builder);
+ Value *LHS = (DU.NarrowUse->getOperand(0) == DU.NarrowDef) ? DU.WideDef :
+ getExtend(DU.NarrowUse->getOperand(0), WideType, IsSigned, DU.NarrowUse);
+ Value *RHS = (DU.NarrowUse->getOperand(1) == DU.NarrowDef) ? DU.WideDef :
+ getExtend(DU.NarrowUse->getOperand(1), WideType, IsSigned, DU.NarrowUse);
- BinaryOperator *NarrowBO = cast<BinaryOperator>(NarrowUse);
+ BinaryOperator *NarrowBO = cast<BinaryOperator>(DU.NarrowUse);
BinaryOperator *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(),
LHS, RHS,
NarrowBO->getName());
+ IRBuilder<> Builder(DU.NarrowUse);
Builder.Insert(WideBO);
if (const OverflowingBinaryOperator *OBO =
dyn_cast<OverflowingBinaryOperator>(NarrowBO)) {
@@ -894,45 +920,46 @@ Instruction *WidenIV::CloneIVUser(Instruction *NarrowUse,
llvm_unreachable(0);
}
-/// HoistStep - Attempt to hoist an IV increment above a potential use.
-///
-/// To successfully hoist, two criteria must be met:
-/// - IncV operands dominate InsertPos and
-/// - InsertPos dominates IncV
-///
-/// Meeting the second condition means that we don't need to check all of IncV's
-/// existing uses (it's moving up in the domtree).
-///
-/// This does not yet recursively hoist the operands, although that would
-/// not be difficult.
-static bool HoistStep(Instruction *IncV, Instruction *InsertPos,
- const DominatorTree *DT)
-{
- if (DT->dominates(IncV, InsertPos))
- return true;
+/// No-wrap operations can transfer sign extension of their result to their
+/// operands. Generate the SCEV value for the widened operation without
+/// actually modifying the IR yet. If the expression after extending the
+/// operands is an AddRec for this loop, return it.
+const SCEVAddRecExpr* WidenIV::GetExtendedOperandRecurrence(NarrowIVDefUse DU) {
+ // Handle the common case of add<nsw/nuw>
+ if (DU.NarrowUse->getOpcode() != Instruction::Add)
+ return 0;
- if (!DT->dominates(InsertPos->getParent(), IncV->getParent()))
- return false;
+ // One operand (NarrowDef) has already been extended to WideDef. Now determine
+ // if extending the other will lead to a recurrence.
+ unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
+ assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
+
+ const SCEV *ExtendOperExpr = 0;
+ const OverflowingBinaryOperator *OBO =
+ cast<OverflowingBinaryOperator>(DU.NarrowUse);
+ if (IsSigned && OBO->hasNoSignedWrap())
+ ExtendOperExpr = SE->getSignExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else if(!IsSigned && OBO->hasNoUnsignedWrap())
+ ExtendOperExpr = SE->getZeroExtendExpr(
+ SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
+ else
+ return 0;
- if (IncV->mayHaveSideEffects())
- return false;
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(
+ SE->getAddExpr(SE->getSCEV(DU.WideDef), ExtendOperExpr,
+ IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW));
- // Attempt to hoist IncV
- for (User::op_iterator OI = IncV->op_begin(), OE = IncV->op_end();
- OI != OE; ++OI) {
- Instruction *OInst = dyn_cast<Instruction>(OI);
- if (OInst && !DT->dominates(OInst, InsertPos))
- return false;
- }
- IncV->moveBefore(InsertPos);
- return true;
+ if (!AddRec || AddRec->getLoop() != L)
+ return 0;
+ return AddRec;
}
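// The identity being exploited (signed case shown; unsigned is analogous with
// zext/nuw): for "%a = add nsw i32 %iv, %x", the absence of signed wrap
// guarantees
//
//   sext(%iv + %x) == sext(%iv) + sext(%x)
//
// so extending only the non-IV operand and adding it to the already-widened
// WideDef produces the same AddRec that widening the whole add would.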
-// GetWideRecurrence - Is this instruction potentially interesting from IVUsers'
-// perspective after widening it's type? In other words, can the extend be
-// safely hoisted out of the loop with SCEV reducing the value to a recurrence
-// on the same loop. If so, return the sign or zero extended
-// recurrence. Otherwise return NULL.
+/// GetWideRecurrence - Is this instruction potentially interesting from
+/// IVUsers' perspective after widening its type? In other words, can the
+/// extend be safely hoisted out of the loop with SCEV reducing the value to a
+/// recurrence on the same loop? If so, return the sign or zero extended
+/// recurrence. Otherwise return NULL.
const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) {
if (!SE->isSCEVable(NarrowUse->getType()))
return 0;
@@ -951,47 +978,45 @@ const SCEVAddRecExpr *WidenIV::GetWideRecurrence(Instruction *NarrowUse) {
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
if (!AddRec || AddRec->getLoop() != L)
return 0;
-
return AddRec;
}
/// WidenIVUse - Determine whether an individual user of the narrow IV can be
/// widened. If so, return the wide clone of the user.
-Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef,
- Instruction *WideDef) {
- Instruction *NarrowUse = cast<Instruction>(NarrowDefUse.getUser());
+Instruction *WidenIV::WidenIVUse(NarrowIVDefUse DU) {
// Stop traversing the def-use chain at inner-loop phis or post-loop phis.
- if (isa<PHINode>(NarrowUse) && LI->getLoopFor(NarrowUse->getParent()) != L)
+ if (isa<PHINode>(DU.NarrowUse) &&
+ LI->getLoopFor(DU.NarrowUse->getParent()) != L)
return 0;
// Our raison d'etre! Eliminate sign and zero extension.
- if (IsSigned ? isa<SExtInst>(NarrowUse) : isa<ZExtInst>(NarrowUse)) {
- Value *NewDef = WideDef;
- if (NarrowUse->getType() != WideType) {
- unsigned CastWidth = SE->getTypeSizeInBits(NarrowUse->getType());
+ if (IsSigned ? isa<SExtInst>(DU.NarrowUse) : isa<ZExtInst>(DU.NarrowUse)) {
+ Value *NewDef = DU.WideDef;
+ if (DU.NarrowUse->getType() != WideType) {
+ unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType());
unsigned IVWidth = SE->getTypeSizeInBits(WideType);
if (CastWidth < IVWidth) {
// The cast isn't as wide as the IV, so insert a Trunc.
- IRBuilder<> Builder(NarrowDefUse);
- NewDef = Builder.CreateTrunc(WideDef, NarrowUse->getType());
+ IRBuilder<> Builder(DU.NarrowUse);
+ NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType());
}
else {
// A wider extend was hidden behind a narrower one. This may induce
// another round of IV widening in which the intermediate IV becomes
// dead. It should be very rare.
DEBUG(dbgs() << "INDVARS: New IV " << *WidePhi
- << " not wide enough to subsume " << *NarrowUse << "\n");
- NarrowUse->replaceUsesOfWith(NarrowDef, WideDef);
- NewDef = NarrowUse;
+ << " not wide enough to subsume " << *DU.NarrowUse << "\n");
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, DU.WideDef);
+ NewDef = DU.NarrowUse;
}
}
- if (NewDef != NarrowUse) {
- DEBUG(dbgs() << "INDVARS: eliminating " << *NarrowUse
- << " replaced by " << *WideDef << "\n");
+ if (NewDef != DU.NarrowUse) {
+ DEBUG(dbgs() << "INDVARS: eliminating " << *DU.NarrowUse
+ << " replaced by " << *DU.WideDef << "\n");
++NumElimExt;
- NarrowUse->replaceAllUsesWith(NewDef);
- DeadInsts.push_back(NarrowUse);
+ DU.NarrowUse->replaceAllUsesWith(NewDef);
+ DeadInsts.push_back(DU.NarrowUse);
}
  // Now that the extend is gone, we want to expose its uses for potential
// further simplification. We don't need to directly inform SimplifyIVUsers
@@ -1004,29 +1029,32 @@ Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef,
}
// Does this user itself evaluate to a recurrence after widening?
- const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(NarrowUse);
+ const SCEVAddRecExpr *WideAddRec = GetWideRecurrence(DU.NarrowUse);
+ if (!WideAddRec) {
+ WideAddRec = GetExtendedOperandRecurrence(DU);
+ }
if (!WideAddRec) {
    // This user does not evaluate to a recurrence after widening, so don't
// follow it. Instead insert a Trunc to kill off the original use,
// eventually isolating the original narrow IV so it can be removed.
- IRBuilder<> Builder(NarrowDefUse);
- Value *Trunc = Builder.CreateTrunc(WideDef, NarrowDef->getType());
- NarrowUse->replaceUsesOfWith(NarrowDef, Trunc);
+ IRBuilder<> Builder(getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT));
+ Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType());
+ DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc);
return 0;
}
- // We assume that block terminators are not SCEVable. We wouldn't want to
+  // Assume block terminators cannot evaluate to a recurrence. We can't
// insert a Trunc after a terminator if there happens to be a critical edge.
- assert(NarrowUse != NarrowUse->getParent()->getTerminator() &&
+ assert(DU.NarrowUse != DU.NarrowUse->getParent()->getTerminator() &&
"SCEV is not expected to evaluate a block terminator");
// Reuse the IV increment that SCEVExpander created as long as it dominates
// NarrowUse.
Instruction *WideUse = 0;
- if (WideAddRec == WideIncExpr && HoistStep(WideInc, NarrowUse, DT)) {
+ if (WideAddRec == WideIncExpr
+ && SCEVExpander::hoistStep(WideInc, DU.NarrowUse, DT))
WideUse = WideInc;
- }
else {
- WideUse = CloneIVUser(NarrowUse, NarrowDef, WideDef);
+ WideUse = CloneIVUser(DU);
if (!WideUse)
return 0;
}
@@ -1051,13 +1079,13 @@ Instruction *WidenIV::WidenIVUse(Use &NarrowDefUse, Instruction *NarrowDef,
void WidenIV::pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef) {
for (Value::use_iterator UI = NarrowDef->use_begin(),
UE = NarrowDef->use_end(); UI != UE; ++UI) {
- Use &U = UI.getUse();
+ Instruction *NarrowUse = cast<Instruction>(*UI);
// Handle data flow merges and bizarre phi cycles.
- if (!Widened.insert(cast<Instruction>(U.getUser())))
+ if (!Widened.insert(NarrowUse))
continue;
- NarrowIVUsers.push_back(std::make_pair(&UI.getUse(), WideDef));
+ NarrowIVUsers.push_back(NarrowIVDefUse(NarrowDef, NarrowUse, WideDef));
}
}
@@ -1124,23 +1152,19 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
pushNarrowIVUsers(OrigPhi, WidePhi);
while (!NarrowIVUsers.empty()) {
- Use *UsePtr;
- Instruction *WideDef;
- tie(UsePtr, WideDef) = NarrowIVUsers.pop_back_val();
- Use &NarrowDefUse = *UsePtr;
+ NarrowIVDefUse DU = NarrowIVUsers.pop_back_val();
// Process a def-use edge. This may replace the use, so don't hold a
// use_iterator across it.
- Instruction *NarrowDef = cast<Instruction>(NarrowDefUse.get());
- Instruction *WideUse = WidenIVUse(NarrowDefUse, NarrowDef, WideDef);
+ Instruction *WideUse = WidenIVUse(DU);
// Follow all def-use edges from the previous narrow use.
if (WideUse)
- pushNarrowIVUsers(cast<Instruction>(NarrowDefUse.getUser()), WideUse);
+ pushNarrowIVUsers(DU.NarrowUse, WideUse);
// WidenIVUse may have removed the def-use edge.
- if (NarrowDef->use_empty())
- DeadInsts.push_back(NarrowDef);
+ if (DU.NarrowDef->use_empty())
+ DeadInsts.push_back(DU.NarrowDef);
}
return WidePhi;
}
@@ -1149,187 +1173,17 @@ PHINode *WidenIV::CreateWideIV(SCEVExpander &Rewriter) {
// Simplification of IV users based on SCEV evaluation.
//===----------------------------------------------------------------------===//
-void IndVarSimplify::EliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
- unsigned IVOperIdx = 0;
- ICmpInst::Predicate Pred = ICmp->getPredicate();
- if (IVOperand != ICmp->getOperand(0)) {
- // Swapped
- assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
- IVOperIdx = 1;
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- // Get the SCEVs for the ICmp operands.
- const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx));
- const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx));
-
- // Simplify unnecessary loops away.
- const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
- S = SE->getSCEVAtScope(S, ICmpLoop);
- X = SE->getSCEVAtScope(X, ICmpLoop);
-
- // If the condition is always true or always false, replace it with
- // a constant value.
- if (SE->isKnownPredicate(Pred, S, X))
- ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
- else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X))
- ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
- else
- return;
-
- DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
- ++NumElimCmp;
- Changed = true;
- DeadInsts.push_back(ICmp);
-}
-
-void IndVarSimplify::EliminateIVRemainder(BinaryOperator *Rem,
- Value *IVOperand,
- bool IsSigned) {
- // We're only interested in the case where we know something about
- // the numerator.
- if (IVOperand != Rem->getOperand(0))
- return;
-
- // Get the SCEVs for the ICmp operands.
- const SCEV *S = SE->getSCEV(Rem->getOperand(0));
- const SCEV *X = SE->getSCEV(Rem->getOperand(1));
-
- // Simplify unnecessary loops away.
- const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
- S = SE->getSCEVAtScope(S, ICmpLoop);
- X = SE->getSCEVAtScope(X, ICmpLoop);
-
- // i % n --> i if i is in [0,n).
- if ((!IsSigned || SE->isKnownNonNegative(S)) &&
- SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
- S, X))
- Rem->replaceAllUsesWith(Rem->getOperand(0));
- else {
- // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n).
- const SCEV *LessOne =
- SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1));
- if (IsSigned && !SE->isKnownNonNegative(LessOne))
- return;
-
- if (!SE->isKnownPredicate(IsSigned ?
- ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
- LessOne, X))
- return;
-
- ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
- Rem->getOperand(0), Rem->getOperand(1),
- "tmp");
- SelectInst *Sel =
- SelectInst::Create(ICmp,
- ConstantInt::get(Rem->getType(), 0),
- Rem->getOperand(0), "tmp", Rem);
- Rem->replaceAllUsesWith(Sel);
- }
-
- // Inform IVUsers about the new users.
- if (IU) {
- if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0)))
- IU->AddUsersIfInteresting(I);
- }
- DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
- ++NumElimRem;
- Changed = true;
- DeadInsts.push_back(Rem);
-}
-
-/// EliminateIVUser - Eliminate an operation that consumes a simple IV and has
-/// no observable side-effect given the range of IV values.
-bool IndVarSimplify::EliminateIVUser(Instruction *UseInst,
- Instruction *IVOperand) {
- if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
- EliminateIVComparison(ICmp, IVOperand);
- return true;
- }
- if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
- bool IsSigned = Rem->getOpcode() == Instruction::SRem;
- if (IsSigned || Rem->getOpcode() == Instruction::URem) {
- EliminateIVRemainder(Rem, IVOperand, IsSigned);
- return true;
- }
- }
-
- // Eliminate any operation that SCEV can prove is an identity function.
- if (!SE->isSCEVable(UseInst->getType()) ||
- (UseInst->getType() != IVOperand->getType()) ||
- (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
- return false;
-
- DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
-
- UseInst->replaceAllUsesWith(IVOperand);
- ++NumElimIdentity;
- Changed = true;
- DeadInsts.push_back(UseInst);
- return true;
-}
-
-/// pushIVUsers - Add all uses of Def to the current IV's worklist.
-///
-static void pushIVUsers(
- Instruction *Def,
- SmallPtrSet<Instruction*,16> &Simplified,
- SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
-
- for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end();
- UI != E; ++UI) {
- Instruction *User = cast<Instruction>(*UI);
-
- // Avoid infinite or exponential worklist processing.
- // Also ensure unique worklist users.
- // If Def is a LoopPhi, it may not be in the Simplified set, so check for
- // self edges first.
- if (User != Def && Simplified.insert(User))
- SimpleIVUsers.push_back(std::make_pair(User, Def));
- }
-}
-
-/// isSimpleIVUser - Return true if this instruction generates a simple SCEV
-/// expression in terms of that IV.
-///
-/// This is similar to IVUsers' isInsteresting() but processes each instruction
-/// non-recursively when the operand is already known to be a simpleIVUser.
-///
-static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
- if (!SE->isSCEVable(I->getType()))
- return false;
-
- // Get the symbolic expression for this instruction.
- const SCEV *S = SE->getSCEV(I);
-
- // We assume that terminators are not SCEVable.
- assert((!S || I != I->getParent()->getTerminator()) &&
- "can't fold terminators");
-
- // Only consider affine recurrences.
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
- if (AR && AR->getLoop() == L)
- return true;
-
- return false;
-}
-/// SimplifyIVUsersNoRewrite - Iteratively perform simplification on a worklist
-/// of IV users. Each successive simplification may push more users which may
+/// SimplifyAndExtend - Iteratively perform simplification on a worklist of IV
+/// users. Each successive simplification may push more users which may
/// themselves be candidates for simplification.
///
-/// The "NoRewrite" algorithm does not require IVUsers analysis. Instead, it
-/// simplifies instructions in-place during analysis. Rather than rewriting
-/// induction variables bottom-up from their users, it transforms a chain of
-/// IVUsers top-down, updating the IR only when it encouters a clear
-/// optimization opportunitiy. A SCEVExpander "Rewriter" instance is still
-/// needed, but only used to generate a new IV (phi) of wider type for sign/zero
-/// extend elimination.
+/// Sign/Zero extend elimination is interleaved with IV simplification.
///
-/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.
-///
-void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) {
- std::map<PHINode *, WideIVInfo> WideIVMap;
+void IndVarSimplify::SimplifyAndExtend(Loop *L,
+ SCEVExpander &Rewriter,
+ LPPassManager &LPM) {
+ SmallVector<WideIVInfo, 8> WideIVs;
SmallVector<PHINode*, 8> LoopPhis;
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
@@ -1345,108 +1199,81 @@ void IndVarSimplify::SimplifyIVUsersNoRewrite(Loop *L, SCEVExpander &Rewriter) {
// extension. The first time SCEV attempts to normalize sign/zero extension,
// the result becomes final. So for the most predictable results, we delay
  // sign/zero extend evaluation until needed, and avoid running
- // other SCEV based analysis prior to SimplifyIVUsersNoRewrite.
+ // other SCEV based analysis prior to SimplifyAndExtend.
do {
PHINode *CurrIV = LoopPhis.pop_back_val();
// Information about sign/zero extensions of CurrIV.
- WideIVInfo WI;
-
- // Instructions processed by SimplifyIVUsers for CurrIV.
- SmallPtrSet<Instruction*,16> Simplified;
-
- // Use-def pairs if IV users waiting to be processed for CurrIV.
- SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
-
- // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
- // called multiple times for the same LoopPhi. This is the proper thing to
- // do for loop header phis that use each other.
- pushIVUsers(CurrIV, Simplified, SimpleIVUsers);
+ WideIVVisitor WIV(CurrIV, SE, TD);
- while (!SimpleIVUsers.empty()) {
- Instruction *UseInst, *Operand;
- tie(UseInst, Operand) = SimpleIVUsers.pop_back_val();
- // Bypass back edges to avoid extra work.
- if (UseInst == CurrIV) continue;
+ Changed |= simplifyUsersOfIV(CurrIV, SE, &LPM, DeadInsts, &WIV);
- if (EliminateIVUser(UseInst, Operand)) {
- pushIVUsers(Operand, Simplified, SimpleIVUsers);
- continue;
- }
- if (CastInst *Cast = dyn_cast<CastInst>(UseInst)) {
- bool IsSigned = Cast->getOpcode() == Instruction::SExt;
- if (IsSigned || Cast->getOpcode() == Instruction::ZExt) {
- CollectExtend(Cast, IsSigned, WI, SE, TD);
- }
- continue;
- }
- if (isSimpleIVUser(UseInst, L, SE)) {
- pushIVUsers(UseInst, Simplified, SimpleIVUsers);
- }
- }
- if (WI.WidestNativeType) {
- WideIVMap[CurrIV] = WI;
+ if (WIV.WI.WidestNativeType) {
+ WideIVs.push_back(WIV.WI);
}
} while(!LoopPhis.empty());
- for (std::map<PHINode *, WideIVInfo>::const_iterator I = WideIVMap.begin(),
- E = WideIVMap.end(); I != E; ++I) {
- WidenIV Widener(I->first, I->second, LI, SE, DT, DeadInsts);
+ for (; !WideIVs.empty(); WideIVs.pop_back()) {
+ WidenIV Widener(WideIVs.back(), LI, SE, DT, DeadInsts);
if (PHINode *WidePhi = Widener.CreateWideIV(Rewriter)) {
Changed = true;
LoopPhis.push_back(WidePhi);
}
}
- WideIVMap.clear();
}
}
-/// SimplifyCongruentIVs - Check for congruent phis in this loop header and
-/// populate ExprToIVMap for use later.
-///
-void IndVarSimplify::SimplifyCongruentIVs(Loop *L) {
- DenseMap<const SCEV *, PHINode *> ExprToIVMap;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- PHINode *Phi = cast<PHINode>(I);
- if (!SE->isSCEVable(Phi->getType()))
- continue;
+//===----------------------------------------------------------------------===//
+// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
- const SCEV *S = SE->getSCEV(Phi);
- DenseMap<const SCEV *, PHINode *>::const_iterator Pos;
- bool Inserted;
- tie(Pos, Inserted) = ExprToIVMap.insert(std::make_pair(S, Phi));
- if (Inserted)
- continue;
- PHINode *OrigPhi = Pos->second;
- // Replacing the congruent phi is sufficient because acyclic redundancy
- // elimination, CSE/GVN, should handle the rest. However, once SCEV proves
- // that a phi is congruent, it's almost certain to be the head of an IV
- // user cycle that is isomorphic with the original phi. So it's worth
- // eagerly cleaning up the common case of a single IV increment.
- if (BasicBlock *LatchBlock = L->getLoopLatch()) {
- Instruction *OrigInc =
- cast<Instruction>(OrigPhi->getIncomingValueForBlock(LatchBlock));
- Instruction *IsomorphicInc =
- cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
- if (OrigInc != IsomorphicInc &&
- SE->getSCEV(OrigInc) == SE->getSCEV(IsomorphicInc) &&
- HoistStep(OrigInc, IsomorphicInc, DT)) {
- DEBUG(dbgs() << "INDVARS: Eliminated congruent iv.inc: "
- << *IsomorphicInc << '\n');
- IsomorphicInc->replaceAllUsesWith(OrigInc);
- DeadInsts.push_back(IsomorphicInc);
- }
+/// Check for expressions that ScalarEvolution generates to compute
+/// BackedgeTakenInfo. If these expressions have not been reduced, then
+/// expanding them may incur additional cost (albeit in the loop preheader).
+static bool isHighCostExpansion(const SCEV *S, BranchInst *BI,
+ ScalarEvolution *SE) {
+ // If the backedge-taken count is a UDiv, it's very likely a UDiv that
+ // ScalarEvolution's HowFarToZero or HowManyLessThans produced to compute a
+ // precise expression, rather than a UDiv from the user's code. If we can't
+ // find a UDiv in the code with some simple searching, assume the former and
+ // forego rewriting the loop.
+ if (isa<SCEVUDivExpr>(S)) {
+ ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!OrigCond) return true;
+ const SCEV *R = SE->getSCEV(OrigCond->getOperand(1));
+ R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1));
+ if (R != S) {
+ const SCEV *L = SE->getSCEV(OrigCond->getOperand(0));
+ L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1));
+ if (L != S)
+ return true;
}
- DEBUG(dbgs() << "INDVARS: Eliminated congruent iv: " << *Phi << '\n');
- ++NumElimIV;
- Phi->replaceAllUsesWith(OrigPhi);
- DeadInsts.push_back(Phi);
}
-}
-//===----------------------------------------------------------------------===//
-// LinearFunctionTestReplace and its kin. Rewrite the loop exit condition.
-//===----------------------------------------------------------------------===//
+ if (EnableIVRewrite)
+ return false;
+
+ // Recurse past add expressions, which commonly occur in the
+ // BackedgeTakenCount. They may already exist in program code, and if not,
+  // they are not too expensive to rematerialize.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+ I != E; ++I) {
+ if (isHighCostExpansion(*I, BI, SE))
+ return true;
+ }
+ return false;
+ }
+
+ // HowManyLessThans uses a Max expression whenever the loop is not guarded by
+ // the exit condition.
+ if (isa<SCEVSMaxExpr>(S) || isa<SCEVUMaxExpr>(S))
+ return true;
+
+  // If we haven't recognized an expensive SCEV pattern, assume it's an
+  // expression produced by program code.
+ // produced by program code.
+ return false;
+}
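// Example of the UDiv heuristic (hypothetical loop): "while (n != 0) n -= s;"
// makes ScalarEvolution's HowFarToZero synthesize the backedge-taken count
// "n /u s". Neither operand of the original compare (%n, 0) equals that UDiv
// plus one, so the expansion is judged high cost and LFTR is skipped. Only
// when a compare operand's SCEV is exactly the UDiv plus one -- i.e. the
// division already exists in the user's code -- is it treated as cheap.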
/// canExpandBackedgeTakenCount - Return true if this loop's backedge taken
/// count expression can be safely and cheaply expanded into an instruction
@@ -1465,31 +1292,17 @@ static bool canExpandBackedgeTakenCount(Loop *L, ScalarEvolution *SE) {
if (!BI)
return false;
- // Special case: If the backedge-taken count is a UDiv, it's very likely a
- // UDiv that ScalarEvolution produced in order to compute a precise
- // expression, rather than a UDiv from the user's code. If we can't find a
- // UDiv in the code with some simple searching, assume the former and forego
- // rewriting the loop.
- if (isa<SCEVUDivExpr>(BackedgeTakenCount)) {
- ICmpInst *OrigCond = dyn_cast<ICmpInst>(BI->getCondition());
- if (!OrigCond) return false;
- const SCEV *R = SE->getSCEV(OrigCond->getOperand(1));
- R = SE->getMinusSCEV(R, SE->getConstant(R->getType(), 1));
- if (R != BackedgeTakenCount) {
- const SCEV *L = SE->getSCEV(OrigCond->getOperand(0));
- L = SE->getMinusSCEV(L, SE->getConstant(L->getType(), 1));
- if (L != BackedgeTakenCount)
- return false;
- }
- }
+ if (isHighCostExpansion(BackedgeTakenCount, BI, SE))
+ return false;
+
return true;
}
/// getBackedgeIVType - Get the widest type used by the loop test after peeking
/// through Truncs.
///
-/// TODO: Unnecessary if LFTR does not force a canonical IV.
-static const Type *getBackedgeIVType(Loop *L) {
+/// TODO: Unnecessary when ForceLFTR is removed.
+static Type *getBackedgeIVType(Loop *L) {
if (!L->getExitingBlock())
return 0;
@@ -1502,7 +1315,7 @@ static const Type *getBackedgeIVType(Loop *L) {
if (!Cond)
return 0;
- const Type *Ty = 0;
+ Type *Ty = 0;
for(User::op_iterator OI = Cond->op_begin(), OE = Cond->op_end();
OI != OE; ++OI) {
assert((!Ty || Ty == (*OI)->getType()) && "bad icmp operand types");
@@ -1515,12 +1328,187 @@ static const Type *getBackedgeIVType(Loop *L) {
return Ty;
}
+/// getLoopPhiForCounter - Return the loop header phi IFF IncV adds a loop
+/// invariant value to the phi.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) {
+ Instruction *IncI = dyn_cast<Instruction>(IncV);
+ if (!IncI)
+ return 0;
+
+ switch (IncI->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ break;
+ case Instruction::GetElementPtr:
+ // An IV counter must preserve its type.
+ if (IncI->getNumOperands() == 2)
+ break;
+ default:
+ return 0;
+ }
+
+ PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (isLoopInvariant(IncI->getOperand(1), L, DT))
+ return Phi;
+ return 0;
+ }
+ if (IncI->getOpcode() == Instruction::GetElementPtr)
+ return 0;
+
+ // Allow add/sub to be commuted.
+ Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (isLoopInvariant(IncI->getOperand(0), L, DT))
+ return Phi;
+ }
+ return 0;
+}
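// The shape being matched (illustrative IR): a header phi whose latch value
// adds a loop-invariant step back onto the phi itself,
//
//   loop:
//     %iv = phi i32 [ %start, %preheader ], [ %iv.next, %loop ]
//     %iv.next = add i32 %iv, %step          ; %step is loop-invariant
//
// Add/sub may have the phi in either operand; a GEP counter must keep the phi
// as its pointer operand and have exactly one index.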
+
+/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show
+/// that the current exit test is already sufficiently canonical.
+static bool needsLFTR(Loop *L, DominatorTree *DT) {
+ assert(L->getExitingBlock() && "expected loop exit");
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ // Don't bother with LFTR if the loop is not properly simplified.
+ if (!LatchBlock)
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ assert(BI && "expected exit branch");
+
+ // Do LFTR to simplify the exit condition to an ICMP.
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return true;
+
+ // Do LFTR to simplify the exit ICMP to EQ/NE
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
+ return true;
+
+ // Look for a loop invariant RHS
+ Value *LHS = Cond->getOperand(0);
+ Value *RHS = Cond->getOperand(1);
+ if (!isLoopInvariant(RHS, L, DT)) {
+ if (!isLoopInvariant(LHS, L, DT))
+ return true;
+ std::swap(LHS, RHS);
+ }
+ // Look for a simple IV counter LHS
+ PHINode *Phi = dyn_cast<PHINode>(LHS);
+ if (!Phi)
+ Phi = getLoopPhiForCounter(LHS, L, DT);
+
+ if (!Phi)
+ return true;
+
+ // Do LFTR if the exit condition's IV is *not* a simple counter.
+ Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch());
+ return Phi != getLoopPhiForCounter(IncV, L, DT);
+}
+
+/// AlmostDeadIV - Return true if this IV has no uses other than the (soon to
+/// be rewritten) loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+ for (Value::use_iterator UI = Phi->use_begin(), UE = Phi->use_end();
+ UI != UE; ++UI) {
+ if (*UI != Cond && *UI != IncV) return false;
+ }
+
+ for (Value::use_iterator UI = IncV->use_begin(), UE = IncV->use_end();
+ UI != UE; ++UI) {
+ if (*UI != Cond && *UI != Phi) return false;
+ }
+ return true;
+}
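// E.g. a counter whose only remaining uses are its own increment and the exit
// compare (illustrative IR):
//
//   %iv = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
//   %iv.next = add i64 %iv, 1
//   %exitcond = icmp eq i64 %iv.next, %n
//
// is "almost dead": once LFTR rewrites the exit test against another counter,
// the entire cycle becomes trivially dead code.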
+
+/// FindLoopCounter - Find an affine IV in canonical form.
+///
+/// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount
+///
+/// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride.
+/// This is difficult in general for SCEV because of potential overflow. But we
+/// could at least handle constant BECounts.
+static PHINode *
+FindLoopCounter(Loop *L, const SCEV *BECount,
+ ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) {
+ // I'm not sure how BECount could be a pointer type, but we definitely don't
+ // want to LFTR that.
+ if (BECount->getType()->isPointerTy())
+ return 0;
+
+ uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+ Value *Cond =
+ cast<BranchInst>(L->getExitingBlock()->getTerminator())->getCondition();
+
+ // Loop over all of the PHI nodes, looking for a simple counter.
+ PHINode *BestPhi = 0;
+ const SCEV *BestInit = 0;
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "needsLFTR should guarantee a loop latch");
+
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *Phi = cast<PHINode>(I);
+ if (!SE->isSCEVable(Phi->getType()))
+ continue;
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+ if (!AR || AR->getLoop() != L || !AR->isAffine())
+ continue;
+
+ // AR may be a pointer type, while BECount is an integer type.
+ // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+ // AR may not be a narrower type, or we may never exit.
+ uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+ if (PhiWidth < BCWidth || (TD && !TD->isLegalInteger(PhiWidth)))
+ continue;
+
+ const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ if (!Step || !Step->isOne())
+ continue;
+
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+ if (getLoopPhiForCounter(IncV, L, DT) != Phi)
+ continue;
+
+ const SCEV *Init = AR->getStart();
+
+ if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+ // Don't force a live loop counter if another IV can be used.
+ if (AlmostDeadIV(Phi, LatchBlock, Cond))
+ continue;
+
+      // Prefer to count from zero. This is a more "canonical" counter form. It
+ // also prefers integer to pointer IVs.
+ if (BestInit->isZero() != Init->isZero()) {
+ if (BestInit->isZero())
+ continue;
+ }
+ // If two IVs both count from zero or both count from nonzero then the
+ // narrower is likely a dead phi that has been widened. Use the wider phi
+ // to allow the other to be eliminated.
+ if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+ continue;
+ }
+ BestPhi = Phi;
+ BestInit = Init;
+ }
+ return BestPhi;
+}
+
/// LinearFunctionTestReplace - This method rewrites the exit condition of the
/// loop to be a canonical != comparison against the incremented loop induction
/// variable. This pass is able to rewrite the exit tests of any loop where the
/// SCEV analysis can determine a loop-invariant trip count of the loop, which
/// is actually a much broader range than just linear tests.
-ICmpInst *IndVarSimplify::
+Value *IndVarSimplify::
LinearFunctionTestReplace(Loop *L,
const SCEV *BackedgeTakenCount,
PHINode *IndVar,
@@ -1528,62 +1516,117 @@ LinearFunctionTestReplace(Loop *L,
assert(canExpandBackedgeTakenCount(L, SE) && "precondition");
BranchInst *BI = cast<BranchInst>(L->getExitingBlock()->getTerminator());
+ // LFTR can ignore IV overflow and truncate to the width of
+ // BECount. This avoids materializing the add(zext(add)) expression.
+ Type *CntTy = !EnableIVRewrite ?
+ BackedgeTakenCount->getType() : IndVar->getType();
+
+ const SCEV *IVLimit = BackedgeTakenCount;
+
// If the exiting block is not the same as the backedge block, we must compare
// against the preincremented value, otherwise we prefer to compare against
// the post-incremented value.
Value *CmpIndVar;
- const SCEV *RHS = BackedgeTakenCount;
if (L->getExitingBlock() == L->getLoopLatch()) {
// Add one to the "backedge-taken" count to get the trip count.
// If this addition may overflow, we have to be more pessimistic and
// cast the induction variable before doing the add.
- const SCEV *Zero = SE->getConstant(BackedgeTakenCount->getType(), 0);
const SCEV *N =
- SE->getAddExpr(BackedgeTakenCount,
- SE->getConstant(BackedgeTakenCount->getType(), 1));
- if ((isa<SCEVConstant>(N) && !N->isZero()) ||
- SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) {
- // No overflow. Cast the sum.
- RHS = SE->getTruncateOrZeroExtend(N, IndVar->getType());
- } else {
- // Potential overflow. Cast before doing the add.
- RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
- IndVar->getType());
- RHS = SE->getAddExpr(RHS,
- SE->getConstant(IndVar->getType(), 1));
+ SE->getAddExpr(IVLimit, SE->getConstant(IVLimit->getType(), 1));
+ if (CntTy == IVLimit->getType())
+ IVLimit = N;
+ else {
+ const SCEV *Zero = SE->getConstant(IVLimit->getType(), 0);
+ if ((isa<SCEVConstant>(N) && !N->isZero()) ||
+ SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) {
+ // No overflow. Cast the sum.
+ IVLimit = SE->getTruncateOrZeroExtend(N, CntTy);
+ } else {
+ // Potential overflow. Cast before doing the add.
+ IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy);
+ IVLimit = SE->getAddExpr(IVLimit, SE->getConstant(CntTy, 1));
+ }
}
-
// The BackedgeTaken expression contains the number of times that the
// backedge branches to the loop header. This is one less than the
// number of times the loop executes, so use the incremented indvar.
CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
} else {
// We have to use the preincremented value...
- RHS = SE->getTruncateOrZeroExtend(BackedgeTakenCount,
- IndVar->getType());
+ IVLimit = SE->getTruncateOrZeroExtend(IVLimit, CntTy);
CmpIndVar = IndVar;
}
+ // For unit stride, IVLimit = Start + BECount with 2's complement overflow.
+  // So for non-zero start, compute the IVLimit here.
+ bool isPtrIV = false;
+ Type *CmpTy = CntTy;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter");
+ if (!AR->getStart()->isZero()) {
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+ const SCEV *IVInit = AR->getStart();
+
+ // For pointer types, sign extend BECount in order to materialize a GEP.
+  // Note that without EnableIVRewrite, we never run SCEVExpander on a
+ // pointer type, because we must preserve the existing GEPs. Instead we
+ // directly generate a GEP later.
+ if (IVInit->getType()->isPointerTy()) {
+ isPtrIV = true;
+ CmpTy = SE->getEffectiveSCEVType(IVInit->getType());
+ IVLimit = SE->getTruncateOrSignExtend(IVLimit, CmpTy);
+ }
+ // For integer types, truncate the IV before computing IVInit + BECount.
+ else {
+ if (SE->getTypeSizeInBits(IVInit->getType())
+ > SE->getTypeSizeInBits(CmpTy))
+ IVInit = SE->getTruncateExpr(IVInit, CmpTy);
+
+ IVLimit = SE->getAddExpr(IVInit, IVLimit);
+ }
+ }
// Expand the code for the iteration count.
- assert(SE->isLoopInvariant(RHS, L) &&
+ IRBuilder<> Builder(BI);
+
+ assert(SE->isLoopInvariant(IVLimit, L) &&
"Computed iteration count is not loop invariant!");
- Value *ExitCnt = Rewriter.expandCodeFor(RHS, IndVar->getType(), BI);
+ Value *ExitCnt = Rewriter.expandCodeFor(IVLimit, CmpTy, BI);
+
+  // Create a gep for IVInit + IVLimit from an existing pointer base.
+ assert(isPtrIV == IndVar->getType()->isPointerTy() &&
+ "IndVar type must match IVInit type");
+ if (isPtrIV) {
+ Value *IVStart = IndVar->getIncomingValueForBlock(L->getLoopPreheader());
+ assert(AR->getStart() == SE->getSCEV(IVStart) && "bad loop counter");
+ assert(SE->getSizeOfExpr(
+ cast<PointerType>(IVStart->getType())->getElementType())->isOne()
+ && "unit stride pointer IV must be i8*");
+
+ Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+ ExitCnt = Builder.CreateGEP(IVStart, ExitCnt, "lftr.limit");
+ Builder.SetInsertPoint(BI);
+ }
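  // Illustrative result for an i8* counter (hypothetical names): the limit is
  // materialized in the preheader as
  //
  //   %lftr.limit = getelementptr i8* %ivstart, i64 %exitcnt
  //
  // so the exit test below can compare the pointer IV directly against this
  // loop-invariant pointer bound.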
// Insert a new icmp_ne or icmp_eq instruction before the branch.
- ICmpInst::Predicate Opcode;
+ ICmpInst::Predicate P;
if (L->contains(BI->getSuccessor(0)))
- Opcode = ICmpInst::ICMP_NE;
+ P = ICmpInst::ICMP_NE;
else
- Opcode = ICmpInst::ICMP_EQ;
+ P = ICmpInst::ICMP_EQ;
DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
<< " LHS:" << *CmpIndVar << '\n'
<< " op:\t"
- << (Opcode == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
- << " RHS:\t" << *RHS << "\n");
+ << (P == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << " Expr:\t" << *IVLimit << "\n");
+
+ if (SE->getTypeSizeInBits(CmpIndVar->getType())
+ > SE->getTypeSizeInBits(CmpTy)) {
+ CmpIndVar = Builder.CreateTrunc(CmpIndVar, CmpTy, "lftr.wideiv");
+ }
- ICmpInst *Cond = new ICmpInst(BI, Opcode, CmpIndVar, ExitCnt, "exitcond");
- Cond->setDebugLoc(BI->getDebugLoc());
+ Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
Value *OrigCond = BI->getCondition();
// It's tempting to use replaceAllUsesWith here to fully replace the old
// comparison, but that's not immediately safe, since users of the old
@@ -1612,7 +1655,7 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) return;
- Instruction *InsertPt = ExitBlock->getFirstNonPHI();
+ Instruction *InsertPt = ExitBlock->getFirstInsertionPt();
BasicBlock::iterator I = Preheader->getTerminator();
while (I != Preheader->begin()) {
--I;
@@ -1633,6 +1676,10 @@ void IndVarSimplify::SinkUnusedInvariants(Loop *L) {
if (isa<DbgInfoIntrinsic>(I))
continue;
+ // Skip landingpad instructions.
+ if (isa<LandingPadInst>(I))
+ continue;
+
// Don't sink static AllocaInsts out of the entry block, which would
// turn them into dynamic allocas!
if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
@@ -1699,7 +1746,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
if (!L->isLoopSimplifyForm())
return false;
- if (!DisableIVRewrite)
+ if (EnableIVRewrite)
IU = &getAnalysis<IVUsers>();
LI = &getAnalysis<LoopInfo>();
SE = &getAnalysis<ScalarEvolution>();
@@ -1717,6 +1764,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// Create a rewriter object which we'll use to transform the code with.
SCEVExpander Rewriter(*SE, "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
// Eliminate redundant IV users.
//
@@ -1724,9 +1774,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// attempt to avoid evaluating SCEVs for sign/zero extend operations until
// other expressions involving loop IVs have been evaluated. This helps SCEV
// set no-wrap flags before normalizing sign/zero extension.
- if (DisableIVRewrite) {
+ if (!EnableIVRewrite) {
Rewriter.disableCanonicalMode();
- SimplifyIVUsersNoRewrite(L, Rewriter);
+ SimplifyAndExtend(L, Rewriter, LPM);
}
// Check to see if this loop has a computable loop-invariant execution count.
@@ -1739,25 +1789,25 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
RewriteLoopExitValues(L, Rewriter);
// Eliminate redundant IV users.
- if (!DisableIVRewrite)
- SimplifyIVUsers(Rewriter);
+ if (EnableIVRewrite)
+ Changed |= simplifyIVUsers(IU, SE, &LPM, DeadInsts);
// Eliminate redundant IV cycles.
- if (DisableIVRewrite)
- SimplifyCongruentIVs(L);
+ if (!EnableIVRewrite)
+ NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
// Compute the type of the largest recurrence expression, and decide whether
// a canonical induction variable should be inserted.
- const Type *LargestType = 0;
+ Type *LargestType = 0;
bool NeedCannIV = false;
bool ExpandBECount = canExpandBackedgeTakenCount(L, SE);
- if (ExpandBECount) {
+ if (EnableIVRewrite && ExpandBECount) {
// If we have a known trip count and a single exit block, we'll be
// rewriting the loop exit test condition below, which requires a
// canonical induction variable.
NeedCannIV = true;
- const Type *Ty = BackedgeTakenCount->getType();
- if (DisableIVRewrite) {
+ Type *Ty = BackedgeTakenCount->getType();
+ if (!EnableIVRewrite) {
// In this mode, SimplifyIVUsers may have already widened the IV used by
// the backedge test and inserted a Trunc on the compare's operand. Get
// the wider type to avoid creating a redundant narrow IV only used by the
@@ -1769,10 +1819,10 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
SE->getTypeSizeInBits(LargestType))
LargestType = SE->getEffectiveSCEVType(Ty);
}
- if (!DisableIVRewrite) {
+ if (EnableIVRewrite) {
for (IVUsers::const_iterator I = IU->begin(), E = IU->end(); I != E; ++I) {
NeedCannIV = true;
- const Type *Ty =
+ Type *Ty =
SE->getEffectiveSCEVType(I->getOperandValToReplace()->getType());
if (!LargestType ||
SE->getTypeSizeInBits(Ty) >
@@ -1811,18 +1861,16 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// the end of the pass.
while (!OldCannIVs.empty()) {
PHINode *OldCannIV = OldCannIVs.pop_back_val();
- OldCannIV->insertBefore(L->getHeader()->getFirstNonPHI());
+ OldCannIV->insertBefore(L->getHeader()->getFirstInsertionPt());
}
}
-
+ else if (!EnableIVRewrite && ExpandBECount && needsLFTR(L, DT)) {
+ IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT, TD);
+ }
// If we have a trip count expression, rewrite the loop's exit condition
// using it. We can currently only handle loops with a single exit.
- ICmpInst *NewICmp = 0;
- if (ExpandBECount) {
- assert(canExpandBackedgeTakenCount(L, SE) &&
- "canonical IV disrupted BackedgeTaken expansion");
- assert(NeedCannIV &&
- "LinearFunctionTestReplace requires a canonical induction variable");
+ Value *NewICmp = 0;
+ if (ExpandBECount && IndVar) {
// Check preconditions for proper SCEVExpander operation. SCEV does not
// express SCEVExpander's dependencies, such as LoopSimplify. Instead any
// pass that uses the SCEVExpander must do it. This does not work well for
@@ -1837,7 +1885,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar, Rewriter);
}
// Rewrite IV-derived expressions.
- if (!DisableIVRewrite)
+ if (EnableIVRewrite)
RewriteIVExpressions(L, Rewriter);
// Clear the rewriter cache, because values that are in the rewriter's cache
@@ -1860,12 +1908,34 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// For completeness, inform IVUsers of the IV use in the newly-created
// loop exit test instruction.
- if (NewICmp && IU)
- IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)));
-
+ if (IU && NewICmp) {
+ ICmpInst *NewICmpInst = dyn_cast<ICmpInst>(NewICmp);
+ if (NewICmpInst)
+ IU->AddUsersIfInteresting(cast<Instruction>(NewICmpInst->getOperand(0)));
+ }
// Clean up dead instructions.
Changed |= DeleteDeadPHIs(L->getHeader());
// Check a post-condition.
- assert(L->isLCSSAForm(*DT) && "Indvars did not leave the loop in lcssa form!");
+ assert(L->isLCSSAForm(*DT) &&
+ "Indvars did not leave the loop in lcssa form!");
+
+ // Verify that LFTR and any other changes have not interfered with SCEV's
+ // ability to compute a trip count.
+#ifndef NDEBUG
+ if (!EnableIVRewrite && VerifyIndvars &&
+ !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ SE->forgetLoop(L);
+ const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+ SE->getTypeSizeInBits(NewBECount->getType()))
+ NewBECount = SE->getTruncateOrNoop(NewBECount,
+ BackedgeTakenCount->getType());
+ else
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+ NewBECount->getType());
+ assert(BackedgeTakenCount == NewBECount && "indvars must preserve SCEV");
+ }
+#endif
+
return Changed;
}
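
For the LFTR hunks above: with a unit-stride i8* induction variable, the exit limit is simply the start pointer advanced by the expanded exit count, and the exit test becomes a pointer equality check. A standalone C++ sketch of the resulting semantics (illustrative names, not LLVM API):

    #include <cstddef>
    #include <cstdint>

    // Model of the rewritten exit test: the "lftr.limit" GEP computes
    // base + exitCount in the preheader; the latch then compares the IV
    // against it with eq/ne rather than an ordered comparison.
    const uint8_t *lftrLimit(const uint8_t *base, std::size_t exitCount) {
      return base + exitCount; // the GEP emitted in the preheader
    }

    bool exitTaken(const uint8_t *iv, const uint8_t *limit) {
      return iv == limit; // icmp eq form: taken when the IV reaches the limit
    }
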
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index b500d5b..f410af3 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -811,8 +811,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
/// important optimization that encourages jump threading, and needs to be run
/// interlaced with other jump threading tasks.
bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
- // Don't hack volatile loads.
- if (LI->isVolatile()) return false;
+ // Don't hack volatile/atomic loads.
+ if (!LI->isSimple()) return false;
// If the load is defined in a block with exactly one predecessor, it can't be
// partially redundant.
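
The isVolatile() to isSimple() changes here and in the hunks below tighten the guard from "not volatile" to "not volatile and not atomic". A minimal sketch of the predicate as these passes use it (hypothetical struct, not the LLVM class):

    struct MemAccess { bool Volatile; bool Atomic; };

    // "Simple" accesses are the only ones these passes may freely
    // duplicate, hoist, or merge; volatile or atomic ones are rejected.
    bool isSimple(const MemAccess &A) { return !A.Volatile && !A.Atomic; }
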
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 66add6c..b79bb13 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -151,6 +151,11 @@ namespace {
///
bool isSafeToExecuteUnconditionally(Instruction &I);
+ /// isGuaranteedToExecute - Check that the instruction is guaranteed to
+ /// execute.
+ ///
+ bool isGuaranteedToExecute(Instruction &I);
+
/// pointerInvalidatedByLoop - Return true if the body of this loop may
/// store into the memory location pointed to by V.
///
@@ -357,8 +362,8 @@ void LICM::HoistRegion(DomTreeNode *N) {
bool LICM::canSinkOrHoistInst(Instruction &I) {
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (LI->isVolatile())
- return false; // Don't hoist volatile loads!
+ if (!LI->isUnordered())
+ return false; // Don't hoist volatile/atomic loads!
// Loads from constant memory are always safe to move, even if they end up
// in the same alias set as something that ends up being modified.
@@ -461,7 +466,7 @@ void LICM::sink(Instruction &I) {
} else {
// Move the instruction to the start of the exit block, after any PHI
// nodes in it.
- I.moveBefore(ExitBlocks[0]->getFirstNonPHI());
+ I.moveBefore(ExitBlocks[0]->getFirstInsertionPt());
// This instruction is no longer in the AST for the current loop, because
// we just sunk it out of the loop. If we just sunk it into an outer
@@ -504,7 +509,7 @@ void LICM::sink(Instruction &I) {
continue;
// Insert the code after the last PHI node.
- BasicBlock::iterator InsertPt = ExitBlock->getFirstNonPHI();
+ BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
// If this is the first exit block processed, just move the original
// instruction, otherwise clone the original instruction and insert
@@ -577,6 +582,10 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
if (Inst.isSafeToSpeculativelyExecute())
return true;
+ return isGuaranteedToExecute(Inst);
+}
+
+bool LICM::isGuaranteedToExecute(Instruction &Inst) {
// Otherwise we have to check to make sure that the instruction dominates all
// of the exit blocks. If it doesn't, then there is a path out of the loop
// which does not execute this instruction, so we can't hoist it.
@@ -635,7 +644,7 @@ namespace {
for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
BasicBlock *ExitBlock = LoopExitBlocks[i];
Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
- Instruction *InsertPos = ExitBlock->getFirstNonPHI();
+ Instruction *InsertPos = ExitBlock->getFirstInsertionPt();
StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos);
NewSI->setAlignment(Alignment);
NewSI->setDebugLoc(DL);
@@ -713,34 +722,41 @@ void LICM::PromoteAliasSet(AliasSet &AS) {
// If there is a non-load/store instruction in the loop, we can't promote
// it.
- unsigned InstAlignment;
if (LoadInst *load = dyn_cast<LoadInst>(Use)) {
- assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken");
- InstAlignment = load->getAlignment();
+ assert(!load->isVolatile() && "AST broken");
+ if (!load->isSimple())
+ return;
} else if (StoreInst *store = dyn_cast<StoreInst>(Use)) {
// Stores *of* the pointer are not interesting, only stores *to* the
// pointer.
if (Use->getOperand(1) != ASIV)
continue;
- InstAlignment = store->getAlignment();
- assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken");
+ assert(!store->isVolatile() && "AST broken");
+ if (!store->isSimple())
+ return;
+
+ // Note that we only check GuaranteedToExecute inside the store case
+ // so that we do not introduce stores where they did not exist before
+ // (which would break the LLVM concurrency model).
+
+ // If the alignment of this instruction allows us to specify a more
+ // restrictive (and performant) alignment and if we are sure this
+ // instruction will be executed, update the alignment.
+ // Larger is better, with the exception of 0 being the best alignment.
+ unsigned InstAlignment = store->getAlignment();
+ if ((InstAlignment > Alignment || InstAlignment == 0)
+ && (Alignment != 0))
+ if (isGuaranteedToExecute(*Use)) {
+ GuaranteedToExecute = true;
+ Alignment = InstAlignment;
+ }
+
+ if (!GuaranteedToExecute)
+ GuaranteedToExecute = isGuaranteedToExecute(*Use);
+
} else
return; // Not a load or store.
- // If the alignment of this instruction allows us to specify a more
- // restrictive (and performant) alignment and if we are sure this
- // instruction will be executed, update the alignment.
- // Larger is better, with the exception of 0 being the best alignment.
- if ((InstAlignment > Alignment || InstAlignment == 0)
- && (Alignment != 0))
- if (isSafeToExecuteUnconditionally(*Use)) {
- GuaranteedToExecute = true;
- Alignment = InstAlignment;
- }
-
- if (!GuaranteedToExecute)
- GuaranteedToExecute = isSafeToExecuteUnconditionally(*Use);
-
LoopUses.push_back(Use);
}
}
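
The promotion code above now folds the alignment update into the store case so promotion never introduces stores that did not exist before. The alignment-merging rule itself, as a standalone sketch (assuming LLVM's convention that alignment 0 means the best, ABI-preferred alignment):

    // New promoted alignment after seeing a store with alignment
    // InstAlign that is guaranteed to execute.
    unsigned mergeAlignment(unsigned CurAlign, unsigned InstAlign,
                            bool GuaranteedToExecute) {
      if (!GuaranteedToExecute || CurAlign == 0)
        return CurAlign;                  // 0 is already the best choice
      if (InstAlign > CurAlign || InstAlign == 0)
        return InstAlign;                 // adopt the stricter alignment
      return CurAlign;
    }
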
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a0e41d9..ad15cbb 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -267,7 +267,7 @@ bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
/// processLoopStore - See if this store can be promoted to a memset or memcpy.
bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
- if (SI->isVolatile()) return false;
+ if (!SI->isSimple()) return false;
Value *StoredVal = SI->getValueOperand();
Value *StorePtr = SI->getPointerOperand();
@@ -314,7 +314,7 @@ bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) {
const SCEVAddRecExpr *LoadEv =
dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0)));
if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() &&
- StoreEv->getOperand(1) == LoadEv->getOperand(1) && !LI->isVolatile())
+ StoreEv->getOperand(1) == LoadEv->getOperand(1) && LI->isSimple())
if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount))
return true;
}
@@ -463,7 +463,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
SplatValue = 0;
} else {
// Otherwise, this isn't an idiom we can transform. For example, we can't
- // do anything with a 3-byte store, for example.
+ // do anything with a 3-byte store.
return false;
}
@@ -498,7 +498,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
// pointer size if it isn't already.
- const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
+ Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
@@ -604,7 +604,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
// The # stored bytes is (BECount+1)*Size. Expand the trip count out to
// pointer size if it isn't already.
- const Type *IntPtr = TD->getIntPtrType(SI->getContext());
+ Type *IntPtr = TD->getIntPtrType(SI->getContext());
BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
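
As the comments in the hunks above note, the byte count expanded for the memset/memcpy is (BECount+1)*StoreSize, because the backedge-taken count is one less than the trip count. In plain arithmetic:

    #include <cstdint>

    // Bytes written by a loop storing StoreSize bytes per iteration
    // whose backedge runs BECount times (trip count BECount + 1).
    uint64_t storedBytes(uint64_t BECount, uint64_t StoreSize) {
      return (BECount + 1) * StoreSize;
    }
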
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 509d026..3e122c2 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -70,12 +70,27 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;
+namespace llvm {
+cl::opt<bool> EnableNested(
+ "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
+
+cl::opt<bool> EnableRetry(
+ "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
+
+// Temporary flag to clean up congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+cl::opt<bool> EnablePhiElim(
+ "enable-lsr-phielim", cl::Hidden, cl::desc("Enable LSR phi elimination"));
+}
+
namespace {
/// RegSortData - This class holds data which is used to order reuse candidates.
@@ -219,7 +234,7 @@ struct Formula {
void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
unsigned getNumRegs() const;
- const Type *getType() const;
+ Type *getType() const;
void DeleteBaseReg(const SCEV *&S);
@@ -319,7 +334,7 @@ unsigned Formula::getNumRegs() const {
/// getType - Return the type of this formula, if it has one, or null
/// otherwise. This type is meaningless except for the bit size.
-const Type *Formula::getType() const {
+Type *Formula::getType() const {
return !BaseRegs.empty() ? BaseRegs.front()->getType() :
ScaledReg ? ScaledReg->getType() :
AM.BaseGV ? AM.BaseGV->getType() :
@@ -397,7 +412,7 @@ void Formula::dump() const {
/// isAddRecSExtable - Return true if the given addrec can be sign-extended
/// without changing its value.
static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
- const Type *WideTy =
+ Type *WideTy =
IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
}
@@ -405,7 +420,7 @@ static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
/// isAddSExtable - Return true if the given add can be sign-extended
/// without changing its value.
static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
- const Type *WideTy =
+ Type *WideTy =
IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
}
@@ -413,7 +428,7 @@ static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
/// isMulSExtable - Return true if the given mul can be sign-extended
/// without changing its value.
static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
- const Type *WideTy =
+ Type *WideTy =
IntegerType::get(SE.getContext(),
SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
@@ -594,8 +609,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
}
/// getAccessType - Return the type of the memory being accessed.
-static const Type *getAccessType(const Instruction *Inst) {
- const Type *AccessTy = Inst->getType();
+static Type *getAccessType(const Instruction *Inst) {
+ Type *AccessTy = Inst->getType();
if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
AccessTy = SI->getOperand(0)->getType();
else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
@@ -614,7 +629,7 @@ static const Type *getAccessType(const Instruction *Inst) {
// All pointers have the same requirements, so canonicalize them to an
// arbitrary pointer type to minimize variation.
- if (const PointerType *PTy = dyn_cast<PointerType>(AccessTy))
+ if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
PTy->getAddressSpace());
@@ -670,6 +685,21 @@ public:
void Loose();
+#ifndef NDEBUG
+ // Once any of the metrics loses, they must all remain losers.
+ bool isValid() {
+ return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+ | ImmCost | SetupCost) != ~0u)
+ || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+ & ImmCost & SetupCost) == ~0u);
+ }
+#endif
+
+ bool isLoser() {
+ assert(isValid() && "invalid cost");
+ return NumRegs == ~0u;
+ }
+
void RateFormula(const Formula &F,
SmallPtrSet<const SCEV *, 16> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
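
The isValid/isLoser pair encodes a saturating cost: once Loose() sets any metric to ~0u it sets all of them, so isLoser() only needs to test one field. A self-contained version of the invariant (same bit trick as the hunk):

    #include <cstdint>

    struct CostSketch {
      uint32_t NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ImmCost,
               SetupCost;
      // Valid states: either the OR of all metrics is not all-ones
      // (hence no single metric is ~0u) or their AND is all-ones
      // (every metric is ~0u).
      bool isValid() const {
        return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
                 | ImmCost | SetupCost) != ~0u)
            || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
                 & ImmCost & SetupCost) == ~0u);
      }
      bool isLoser() const { return NumRegs == ~0u; }
    };
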
@@ -702,34 +732,48 @@ void Cost::RateRegister(const SCEV *Reg,
if (AR->getLoop() == L)
AddRecCost += 1; /// TODO: This should be a function of the stride.
- // If this is an addrec for a loop that's already been visited by LSR,
- // don't second-guess its addrec phi nodes. LSR isn't currently smart
- // enough to reason about more than one loop at a time. Consider these
- // registers free and leave them alone.
- else if (L->contains(AR->getLoop()) ||
+ // If this is an addrec for another loop, don't second-guess its addrec phi
+ // nodes. LSR isn't currently smart enough to reason about more than one
+ // loop at a time. LSR has either already run on inner loops or will not
+ // run on other loops, and cannot be expected to change sibling loops. If
+ // the AddRec exists, consider its register free and leave it alone.
+ // Otherwise, do not consider this formula at all.
+ // FIXME: Why do we need to generate such formulae?
+ else if (!EnableNested || L->contains(AR->getLoop()) ||
(!AR->getLoop()->contains(L) &&
DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
- PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PHINode *PN = dyn_cast<PHINode>(I); ++I) {
if (SE.isSCEVable(PN->getType()) &&
(SE.getEffectiveSCEVType(PN->getType()) ==
SE.getEffectiveSCEVType(AR->getType())) &&
SE.getSCEV(PN) == AR)
return;
-
+ }
+ if (!EnableNested) {
+ Loose();
+ return;
+ }
// If this isn't one of the addrecs that the loop already has, it
// would require a costly new phi and add. TODO: This isn't
// precisely modeled right now.
++NumBaseAdds;
- if (!Regs.count(AR->getStart()))
+ if (!Regs.count(AR->getStart())) {
RateRegister(AR->getStart(), Regs, L, SE, DT);
+ if (isLoser())
+ return;
+ }
}
// Add the step value register, if it needs one.
// TODO: The non-affine case isn't precisely modeled here.
- if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
- if (!Regs.count(AR->getStart()))
+ if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+ if (!Regs.count(AR->getOperand(1))) {
RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+ if (isLoser())
+ return;
+ }
+ }
}
++NumRegs;
@@ -769,6 +813,8 @@ void Cost::RateFormula(const Formula &F,
return;
}
RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
+ if (isLoser())
+ return;
}
for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
E = F.BaseRegs.end(); I != E; ++I) {
@@ -778,6 +824,8 @@ void Cost::RateFormula(const Formula &F,
return;
}
RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
+ if (isLoser())
+ return;
}
// Determine how many (unfolded) adds we'll need inside the loop.
@@ -795,6 +843,7 @@ void Cost::RateFormula(const Formula &F,
else if (Offset != 0)
ImmCost += APInt(64, Offset, true).getMinSignedBits();
}
+ assert(isValid() && "invalid cost");
}
/// Loose - Set this cost to a losing value.
@@ -980,7 +1029,7 @@ public:
};
KindType Kind;
- const Type *AccessTy;
+ Type *AccessTy;
SmallVector<int64_t, 8> Offsets;
int64_t MinOffset;
@@ -995,7 +1044,7 @@ public:
/// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
/// max fixup widths to be equivalent, because the narrower one may be relying
/// on the implicit truncation to truncate away bogus bits.
- const Type *WidestFixupType;
+ Type *WidestFixupType;
/// Formulae - A list of ways to build a value that can satisfy this user.
/// After the list is populated, one of these is selected heuristically and
@@ -1005,7 +1054,7 @@ public:
/// Regs - The set of register candidates used by all formulae in this LSRUse.
SmallPtrSet<const SCEV *, 4> Regs;
- LSRUse(KindType K, const Type *T) : Kind(K), AccessTy(T),
+ LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
MinOffset(INT64_MAX),
MaxOffset(INT64_MIN),
AllFixupsOutsideLoop(true),
@@ -1127,7 +1176,7 @@ void LSRUse::dump() const {
/// be completely folded into the user instruction at isel time. This includes
/// address-mode folding and special icmp tricks.
static bool isLegalUse(const TargetLowering::AddrMode &AM,
- LSRUse::KindType Kind, const Type *AccessTy,
+ LSRUse::KindType Kind, Type *AccessTy,
const TargetLowering *TLI) {
switch (Kind) {
case LSRUse::Address:
@@ -1156,7 +1205,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM,
// If we have low-level target information, ask the target if it can fold an
// integer immediate on an icmp.
if (AM.BaseOffs != 0) {
- if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
+ if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs);
return false;
}
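
The -(uint64_t) cast here (and the matching one in Expand further down) sidesteps signed-negation overflow: negating INT64_MIN as a signed value is undefined behavior, while negating in unsigned arithmetic wraps cleanly. A minimal illustration:

    #include <cstdint>

    // Well-defined negation of a 64-bit offset, even for INT64_MIN,
    // where a plain -v on the signed type would overflow.
    uint64_t negateOffset(int64_t v) {
      return -(uint64_t)v;
    }
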
@@ -1176,7 +1225,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM,
static bool isLegalUse(TargetLowering::AddrMode AM,
int64_t MinOffset, int64_t MaxOffset,
- LSRUse::KindType Kind, const Type *AccessTy,
+ LSRUse::KindType Kind, Type *AccessTy,
const TargetLowering *TLI) {
// Check for overflow.
if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
@@ -1198,7 +1247,7 @@ static bool isLegalUse(TargetLowering::AddrMode AM,
static bool isAlwaysFoldable(int64_t BaseOffs,
GlobalValue *BaseGV,
bool HasBaseReg,
- LSRUse::KindType Kind, const Type *AccessTy,
+ LSRUse::KindType Kind, Type *AccessTy,
const TargetLowering *TLI) {
// Fast-path: zero is always foldable.
if (BaseOffs == 0 && !BaseGV) return true;
@@ -1224,7 +1273,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
static bool isAlwaysFoldable(const SCEV *S,
int64_t MinOffset, int64_t MaxOffset,
bool HasBaseReg,
- LSRUse::KindType Kind, const Type *AccessTy,
+ LSRUse::KindType Kind, Type *AccessTy,
const TargetLowering *TLI,
ScalarEvolution &SE) {
// Fast-path: zero is always foldable.
@@ -1299,7 +1348,7 @@ class LSRInstance {
SmallSetVector<int64_t, 8> Factors;
/// Types - Interesting use types, to facilitate truncation reuse.
- SmallSetVector<const Type *, 4> Types;
+ SmallSetVector<Type *, 4> Types;
/// Fixups - The list of operands which are to be replaced.
SmallVector<LSRFixup, 16> Fixups;
@@ -1330,11 +1379,11 @@ class LSRInstance {
UseMapTy UseMap;
bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
- LSRUse::KindType Kind, const Type *AccessTy);
+ LSRUse::KindType Kind, Type *AccessTy);
std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
LSRUse::KindType Kind,
- const Type *AccessTy);
+ Type *AccessTy);
void DeleteUse(LSRUse &LU, size_t LUIdx);
@@ -1426,7 +1475,8 @@ void LSRInstance::OptimizeShadowIV() {
IVUsers::const_iterator CandidateUI = UI;
++UI;
Instruction *ShadowUse = CandidateUI->getUser();
- const Type *DestTy = NULL;
+ Type *DestTy = NULL;
+ bool IsSigned = false;
/* If shadow use is a int->float cast then insert a second IV
to eliminate this cast.
@@ -1440,10 +1490,14 @@ void LSRInstance::OptimizeShadowIV() {
for (unsigned i = 0; i < n; ++i, ++d)
foo(d);
*/
- if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = false;
DestTy = UCast->getDestTy();
- else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
+ }
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = true;
DestTy = SCast->getDestTy();
+ }
if (!DestTy) continue;
if (TLI) {
@@ -1457,7 +1511,7 @@ void LSRInstance::OptimizeShadowIV() {
if (!PH) continue;
if (PH->getNumIncomingValues() != 2) continue;
- const Type *SrcTy = PH->getType();
+ Type *SrcTy = PH->getType();
int Mantissa = DestTy->getFPMantissaWidth();
if (Mantissa == -1) continue;
if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
@@ -1474,7 +1528,9 @@ void LSRInstance::OptimizeShadowIV() {
ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
if (!Init) continue;
- Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
+ Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+ (double)Init->getSExtValue() :
+ (double)Init->getZExtValue());
BinaryOperator *Incr =
dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
@@ -1776,7 +1832,7 @@ LSRInstance::OptimizeLoopTermCond() {
if (!TLI)
goto decline_post_inc;
// Check for possible scaled-address reuse.
- const Type *AccessTy = getAccessType(UI->getUser());
+ Type *AccessTy = getAccessType(UI->getUser());
TargetLowering::AddrMode AM;
AM.Scale = C->getSExtValue();
if (TLI->isLegalAddressingMode(AM, AccessTy))
@@ -1840,10 +1896,10 @@ LSRInstance::OptimizeLoopTermCond() {
/// return true.
bool
LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
- LSRUse::KindType Kind, const Type *AccessTy) {
+ LSRUse::KindType Kind, Type *AccessTy) {
int64_t NewMinOffset = LU.MinOffset;
int64_t NewMaxOffset = LU.MaxOffset;
- const Type *NewAccessTy = AccessTy;
+ Type *NewAccessTy = AccessTy;
// Check for a mismatched kind. It's tempting to collapse mismatched kinds to
// something conservative, however this can pessimize in the case that one of
@@ -1882,7 +1938,7 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
/// Either reuse an existing use or create a new one, as needed.
std::pair<size_t, int64_t>
LSRInstance::getUse(const SCEV *&Expr,
- LSRUse::KindType Kind, const Type *AccessTy) {
+ LSRUse::KindType Kind, Type *AccessTy) {
const SCEV *Copy = Expr;
int64_t Offset = ExtractImmediate(Expr, SE);
@@ -2044,7 +2100,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
LF.PostIncLoops = UI->getPostIncLoops();
LSRUse::KindType Kind = LSRUse::Basic;
- const Type *AccessTy = 0;
+ Type *AccessTy = 0;
if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
Kind = LSRUse::Address;
AccessTy = getAccessType(LF.UserInst);
@@ -2464,7 +2520,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
if (LU.Kind != LSRUse::ICmpZero) return;
// Determine the integer type for the base formula.
- const Type *IntTy = Base.getType();
+ Type *IntTy = Base.getType();
if (!IntTy) return;
if (SE.getTypeSizeInBits(IntTy) > 64) return;
@@ -2538,7 +2594,7 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
/// scaled-offset address modes, for example.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
// Determine the integer type for the base formula.
- const Type *IntTy = Base.getType();
+ Type *IntTy = Base.getType();
if (!IntTy) return;
// If this Formula already has a scaled register, we can't add another one.
@@ -2598,13 +2654,13 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
if (Base.AM.BaseGV) return;
// Determine the integer type for the base formula.
- const Type *DstTy = Base.getType();
+ Type *DstTy = Base.getType();
if (!DstTy) return;
DstTy = SE.getEffectiveSCEVType(DstTy);
- for (SmallSetVector<const Type *, 4>::const_iterator
+ for (SmallSetVector<Type *, 4>::const_iterator
I = Types.begin(), E = Types.end(); I != E; ++I) {
- const Type *SrcTy = *I;
+ Type *SrcTy = *I;
if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
Formula F = Base;
@@ -2741,7 +2797,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
int64_t Imm = WI.Imm;
const SCEV *OrigReg = WI.OrigReg;
- const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+ Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
@@ -3275,6 +3331,9 @@ retry:
skip:;
}
+ if (!EnableRetry && !AnySatisfiedReqRegs)
+ return;
+
// If none of the formulae had all of the required registers, relax the
// constraint so that we don't exclude all formulae.
if (!AnySatisfiedReqRegs) {
@@ -3298,6 +3357,10 @@ void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
// SolveRecurse does all the work.
SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
CurRegs, VisitedRegs);
+ if (Solution.empty()) {
+ DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ return;
+ }
// Ok, we've now made all our decisions.
DEBUG(dbgs() << "\n"
@@ -3416,6 +3479,9 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
// Don't insert instructions before PHI nodes.
while (isa<PHINode>(IP)) ++IP;
+ // Ignore landingpad instructions.
+ while (isa<LandingPadInst>(IP)) ++IP;
+
// Ignore debug intrinsics.
while (isa<DbgInfoIntrinsic>(IP)) ++IP;
@@ -3440,9 +3506,9 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
Rewriter.setPostInc(LF.PostIncLoops);
// This is the type that the user actually needs.
- const Type *OpTy = LF.OperandValToReplace->getType();
+ Type *OpTy = LF.OperandValToReplace->getType();
// This will be the type that we'll initially expand to.
- const Type *Ty = F.getType();
+ Type *Ty = F.getType();
if (!Ty)
// No type known; just expand directly to the ultimate type.
Ty = OpTy;
@@ -3450,7 +3516,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
// Expand directly to the ultimate type if it's the right size.
Ty = OpTy;
// This is the type to do integer arithmetic in.
- const Type *IntTy = SE.getEffectiveSCEVType(Ty);
+ Type *IntTy = SE.getEffectiveSCEVType(Ty);
// Build up a list of operands to add together to form the full base.
SmallVector<const SCEV *, 8> Ops;
@@ -3527,7 +3593,7 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
// The other interesting way of "folding" with an ICmpZero is to use a
// negated immediate.
if (!ICmpScaledV)
- ICmpScaledV = ConstantInt::get(IntTy, -Offset);
+ ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
else {
Ops.push_back(SE.getUnknown(ICmpScaledV));
ICmpScaledV = ConstantInt::get(IntTy, Offset);
@@ -3611,10 +3677,20 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
// users.
if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
!isa<IndirectBrInst>(BB->getTerminator())) {
- Loop *PNLoop = LI.getLoopFor(PN->getParent());
- if (!PNLoop || PN->getParent() != PNLoop->getHeader()) {
+ BasicBlock *Parent = PN->getParent();
+ Loop *PNLoop = LI.getLoopFor(Parent);
+ if (!PNLoop || Parent != PNLoop->getHeader()) {
// Split the critical edge.
- BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
+ BasicBlock *NewBB = 0;
+ if (!Parent->isLandingPad()) {
+ NewBB = SplitCriticalEdge(BB, Parent, P,
+ /*MergeIdenticalEdges=*/true,
+ /*DontDeleteUselessPhis=*/true);
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
+ NewBB = NewBBs[0];
+ }
// If PN is outside of the loop and BB is in the loop, we want to
// move the block to be immediately before the PHI block, not
@@ -3637,7 +3713,7 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);
// If this is reuse-by-noop-cast, insert the noop cast.
- const Type *OpTy = LF.OperandValToReplace->getType();
+ Type *OpTy = LF.OperandValToReplace->getType();
if (FullV->getType() != OpTy)
FullV =
CastInst::Create(CastInst::getCastOpcode(FullV, false,
@@ -3667,7 +3743,7 @@ void LSRInstance::Rewrite(const LSRFixup &LF,
Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);
// If this is reuse-by-noop-cast, insert the noop cast.
- const Type *OpTy = LF.OperandValToReplace->getType();
+ Type *OpTy = LF.OperandValToReplace->getType();
if (FullV->getType() != OpTy) {
Instruction *Cast =
CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
@@ -3700,6 +3776,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
SCEVExpander Rewriter(SE, "lsr");
Rewriter.disableCanonicalMode();
+ Rewriter.enableLSRMode();
Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
// Expand the new value definitions and update the users.
@@ -3740,6 +3817,23 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
OptimizeShadowIV();
OptimizeLoopTermCond();
+ // If loop preparation eliminates all interesting IV users, bail.
+ if (IU.empty()) return;
+
+ // Skip nested loops until we can model them better with formulae.
+ if (!EnableNested && !L->empty()) {
+
+ if (EnablePhiElim) {
+ // Remove any extra phis created by processing inner loops.
+ SmallVector<WeakVH, 16> DeadInsts;
+ SCEVExpander Rewriter(SE, "lsr");
+ Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
+ Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+ }
+ DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ return;
+ }
+
// Start collecting data and preparing for the solver.
CollectInterestingTypesAndFactors();
CollectFixupsAndInitialFormulae();
@@ -3763,6 +3857,9 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
Types.clear();
RegUses.clear();
+ if (Solution.empty())
+ return;
+
#ifndef NDEBUG
// Formulae should be legal.
for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
@@ -3778,6 +3875,14 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
// Now that we've decided what we want, make it so.
ImplementSolution(Solution, P);
+
+ if (EnablePhiElim) {
+ // Remove any extra phis created by processing inner loops.
+ SmallVector<WeakVH, 16> DeadInsts;
+ SCEVExpander Rewriter(SE, "lsr");
+ Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
+ Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+ }
}
void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
@@ -3793,7 +3898,7 @@ void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
OS << '*' << *I;
}
- for (SmallSetVector<const Type *, 4>::const_iterator
+ for (SmallSetVector<Type *, 4>::const_iterator
I = Types.begin(), E = Types.end(); I != E; ++I) {
if (!First) OS << ", ";
First = false;
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index fef6bc3..91395b2 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Target/TargetData.h"
#include <climits>
using namespace llvm;
@@ -39,6 +40,11 @@ UnrollAllowPartial("unroll-allow-partial", cl::init(false), cl::Hidden,
cl::desc("Allows loops to be partially unrolled until "
"-unroll-threshold loop size is reached."));
+// Temporary flag to be removed in 3.0
+static cl::opt<bool>
+NoSCEVUnroll("disable-unroll-scev", cl::init(false), cl::Hidden,
+ cl::desc("Use ScalarEvolution to analyze loop trip counts for unrolling"));
+
namespace {
class LoopUnroll : public LoopPass {
public:
@@ -49,7 +55,7 @@ namespace {
CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
-
+
initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
}
@@ -57,11 +63,11 @@ namespace {
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
static const unsigned NoThreshold = UINT_MAX;
-
+
// Threshold to use when optsize is specified (and there is no
// explicit -unroll-threshold).
static const unsigned OptSizeUnrollThreshold = 50;
-
+
unsigned CurrentCount;
unsigned CurrentThreshold;
bool CurrentAllowPartial;
@@ -79,6 +85,7 @@ namespace {
AU.addPreservedID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
AU.addPreservedID(LCSSAID);
+ AU.addRequired<ScalarEvolution>();
AU.addPreserved<ScalarEvolution>();
// FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
// If loop unroll does not preserve dom info then LCSSA pass on next
@@ -101,45 +108,62 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
}
/// ApproximateLoopSize - Approximate the size of the loop.
-static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) {
+static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
+ const TargetData *TD) {
CodeMetrics Metrics;
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
I != E; ++I)
- Metrics.analyzeBasicBlock(*I);
+ Metrics.analyzeBasicBlock(*I, TD);
NumCalls = Metrics.NumInlineCandidates;
-
+
unsigned LoopSize = Metrics.NumInsts;
-
+
// Don't allow an estimate of size zero. This would allow unrolling of loops
// with huge iteration counts, which is a compile time problem even if it's
// not a problem for code quality.
if (LoopSize == 0) LoopSize = 1;
-
+
return LoopSize;
}
bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
LoopInfo *LI = &getAnalysis<LoopInfo>();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
BasicBlock *Header = L->getHeader();
DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName()
<< "] Loop %" << Header->getName() << "\n");
(void)Header;
-
+
// Determine the current unrolling threshold. While this is normally set
// from UnrollThreshold, it is overridden to a smaller value if the current
// function is marked as optimize-for-size, and the unroll threshold was
// not user specified.
unsigned Threshold = CurrentThreshold;
- if (!UserThreshold &&
+ if (!UserThreshold &&
Header->getParent()->hasFnAttr(Attribute::OptimizeForSize))
Threshold = OptSizeUnrollThreshold;
- // Find trip count
- unsigned TripCount = L->getSmallConstantTripCount();
- unsigned Count = CurrentCount;
-
+ // Find the trip count, and the trip multiple if the count is not available.
+ unsigned TripCount = 0;
+ unsigned TripMultiple = 1;
+ if (!NoSCEVUnroll) {
+ // Find "latch trip count". UnrollLoop assumes that control cannot exit
+ // via the loop latch on any iteration prior to TripCount. The loop may exit
+ // early via an earlier branch.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ if (LatchBlock) {
+ TripCount = SE->getSmallConstantTripCount(L, LatchBlock);
+ TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
+ }
+ }
+ else {
+ TripCount = L->getSmallConstantTripCount();
+ if (TripCount == 0)
+ TripMultiple = L->getSmallConstantTripMultiple();
+ }
// Automatically select an unroll count.
+ unsigned Count = CurrentCount;
if (Count == 0) {
// Conservative heuristic: if we know the trip count, see if we can
// completely unroll (subject to the threshold, checked below); otherwise
@@ -152,8 +176,9 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// Enforce the threshold.
if (Threshold != NoThreshold) {
+ const TargetData *TD = getAnalysisIfAvailable<TargetData>();
unsigned NumInlineCandidates;
- unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates);
+ unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
if (NumInlineCandidates != 0) {
DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
@@ -182,12 +207,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
}
// Unroll the loop.
- Function *F = L->getHeader()->getParent();
- if (!UnrollLoop(L, Count, LI, &LPM))
+ if (!UnrollLoop(L, Count, TripCount, TripMultiple, LI, &LPM))
return false;
- // FIXME: Reconstruct dom info, because it is not preserved properly.
- if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>())
- DT->runOnFunction(*F);
return true;
}
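
The new SCEV path queries both a latch trip count and a trip multiple: when the exact count is unknown but provably a multiple of M, unrolling by a factor that divides M leaves no remainder iterations. A sketch of that decision under the assumed meaning of the two values (not the UnrollLoop API):

    // TripCount == 0 means "unknown"; TripMultiple is then a proven
    // divisor of the real trip count (at least 1). Count is the
    // unroll factor, assumed nonzero.
    bool needsRemainderLoop(unsigned TripCount, unsigned TripMultiple,
                            unsigned Count) {
      if (TripCount != 0)
        return TripCount % Count != 0;
      return TripMultiple % Count != 0;
    }
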
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 840c4b6..458949c 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -492,7 +492,7 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
Value *BranchVal = LIC;
if (!isa<ConstantInt>(Val) ||
Val->getType() != Type::getInt1Ty(LIC->getContext()))
- BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val, "tmp");
+ BranchVal = new ICmpInst(InsertPt, ICmpInst::ICMP_EQ, LIC, Val);
else if (Val != ConstantInt::getTrue(Val->getContext()))
// We want to enter the new loop when the condition is true.
std::swap(TrueDest, FalseDest);
@@ -561,10 +561,17 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
BasicBlock *ExitBlock = ExitBlocks[i];
SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
pred_end(ExitBlock));
+
// Although SplitBlockPredecessors doesn't preserve loop-simplify in
// general, if we call it on all predecessors of all exits then it does.
- SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(),
- ".us-lcssa", this);
+ if (!ExitBlock->isLandingPad()) {
+ SplitBlockPredecessors(ExitBlock, Preds.data(), Preds.size(),
+ ".us-lcssa", this);
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa",
+ this, NewBBs);
+ }
}
}
@@ -632,7 +639,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// as well.
ParentLoop->addBasicBlockToLoop(NewBlocks[0], LI->getBase());
}
-
+
for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
// The new exit block should be in the same loop as the old one.
@@ -653,6 +660,19 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
if (It != VMap.end()) V = It->second;
PN->addIncoming(V, NewExit);
}
+
+ if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+ PN = PHINode::Create(LPad->getType(), 0, "",
+ ExitSucc->getFirstInsertionPt());
+
+ for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ LandingPadInst *LPI = BB->getLandingPadInst();
+ LPI->replaceAllUsesWith(PN);
+ PN->addIncoming(LPI, BB);
+ }
+ }
}
// Rewrite the code to refer to itself.
diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp
index 9087b46..689bbe9 100644
--- a/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -20,98 +20,88 @@
#include "llvm/Support/IRBuilder.h"
using namespace llvm;
-static bool LowerAtomicIntrinsic(IntrinsicInst *II) {
- IRBuilder<> Builder(II->getParent(), II);
- unsigned IID = II->getIntrinsicID();
- switch (IID) {
- case Intrinsic::memory_barrier:
- break;
+static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+ IRBuilder<> Builder(CXI->getParent(), CXI);
+ Value *Ptr = CXI->getPointerOperand();
+ Value *Cmp = CXI->getCompareOperand();
+ Value *Val = CXI->getNewValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Ptr);
+ Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+ Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+ Builder.CreateStore(Res, Ptr);
+
+ CXI->replaceAllUsesWith(Orig);
+ CXI->eraseFromParent();
+ return true;
+}
- case Intrinsic::atomic_load_add:
- case Intrinsic::atomic_load_sub:
- case Intrinsic::atomic_load_and:
- case Intrinsic::atomic_load_nand:
- case Intrinsic::atomic_load_or:
- case Intrinsic::atomic_load_xor:
- case Intrinsic::atomic_load_max:
- case Intrinsic::atomic_load_min:
- case Intrinsic::atomic_load_umax:
- case Intrinsic::atomic_load_umin: {
- Value *Ptr = II->getArgOperand(0), *Delta = II->getArgOperand(1);
+static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+ IRBuilder<> Builder(RMWI->getParent(), RMWI);
+ Value *Ptr = RMWI->getPointerOperand();
+ Value *Val = RMWI->getValOperand();
- LoadInst *Orig = Builder.CreateLoad(Ptr);
- Value *Res = NULL;
- switch (IID) {
- default: assert(0 && "Unrecognized atomic modify operation");
- case Intrinsic::atomic_load_add:
- Res = Builder.CreateAdd(Orig, Delta);
- break;
- case Intrinsic::atomic_load_sub:
- Res = Builder.CreateSub(Orig, Delta);
- break;
- case Intrinsic::atomic_load_and:
- Res = Builder.CreateAnd(Orig, Delta);
- break;
- case Intrinsic::atomic_load_nand:
- Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta));
- break;
- case Intrinsic::atomic_load_or:
- Res = Builder.CreateOr(Orig, Delta);
- break;
- case Intrinsic::atomic_load_xor:
- Res = Builder.CreateXor(Orig, Delta);
- break;
- case Intrinsic::atomic_load_max:
- Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta),
- Delta, Orig);
- break;
- case Intrinsic::atomic_load_min:
- Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta),
- Orig, Delta);
- break;
- case Intrinsic::atomic_load_umax:
- Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta),
- Delta, Orig);
- break;
- case Intrinsic::atomic_load_umin:
- Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta),
- Orig, Delta);
- break;
- }
- Builder.CreateStore(Res, Ptr);
+ LoadInst *Orig = Builder.CreateLoad(Ptr);
+ Value *Res = NULL;
- II->replaceAllUsesWith(Orig);
+ switch (RMWI->getOperation()) {
+ default: llvm_unreachable("Unexpected RMW operation");
+ case AtomicRMWInst::Xchg:
+ Res = Val;
break;
- }
-
- case Intrinsic::atomic_swap: {
- Value *Ptr = II->getArgOperand(0), *Val = II->getArgOperand(1);
- LoadInst *Orig = Builder.CreateLoad(Ptr);
- Builder.CreateStore(Val, Ptr);
- II->replaceAllUsesWith(Orig);
+ case AtomicRMWInst::Add:
+ Res = Builder.CreateAdd(Orig, Val);
break;
- }
-
- case Intrinsic::atomic_cmp_swap: {
- Value *Ptr = II->getArgOperand(0), *Cmp = II->getArgOperand(1);
- Value *Val = II->getArgOperand(2);
-
- LoadInst *Orig = Builder.CreateLoad(Ptr);
- Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
- Value *Res = Builder.CreateSelect(Equal, Val, Orig);
- Builder.CreateStore(Res, Ptr);
- II->replaceAllUsesWith(Orig);
+ case AtomicRMWInst::Sub:
+ Res = Builder.CreateSub(Orig, Val);
+ break;
+ case AtomicRMWInst::And:
+ Res = Builder.CreateAnd(Orig, Val);
+ break;
+ case AtomicRMWInst::Nand:
+ Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
+ break;
+ case AtomicRMWInst::Or:
+ Res = Builder.CreateOr(Orig, Val);
+ break;
+ case AtomicRMWInst::Xor:
+ Res = Builder.CreateXor(Orig, Val);
+ break;
+ case AtomicRMWInst::Max:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::Min:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Orig, Val);
+ break;
+ case AtomicRMWInst::UMax:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::UMin:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Orig, Val);
break;
}
+ Builder.CreateStore(Res, Ptr);
+ RMWI->replaceAllUsesWith(Orig);
+ RMWI->eraseFromParent();
+ return true;
+}
- default:
- return false;
- }
+static bool LowerFenceInst(FenceInst *FI) {
+ FI->eraseFromParent();
+ return true;
+}
- assert(II->use_empty() &&
- "Lowering should have eliminated any uses of the intrinsic call!");
- II->eraseFromParent();
+static bool LowerLoadInst(LoadInst *LI) {
+ LI->setAtomic(NotAtomic);
+ return true;
+}
+static bool LowerStoreInst(StoreInst *SI) {
+ SI->setAtomic(NotAtomic);
return true;
}
@@ -123,9 +113,22 @@ namespace {
}
bool runOnBasicBlock(BasicBlock &BB) {
bool Changed = false;
- for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; )
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI++))
- Changed |= LowerAtomicIntrinsic(II);
+ for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) {
+ Instruction *Inst = DI++;
+ if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ Changed |= LowerFenceInst(FI);
+ else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ Changed |= LowerAtomicCmpXchgInst(CXI);
+ else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+ Changed |= LowerAtomicRMWInst(RMWI);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ if (LI->isAtomic())
+ LowerLoadInst(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->isAtomic())
+ LowerStoreInst(SI);
+ }
+ }
return Changed;
}
};
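
Because this pass runs only where atomicity is not actually required (e.g. provably single-threaded code), each new atomic instruction lowers to a plain load/compute/store sequence. The cmpxchg case, modeled as ordinary C++:

    #include <cstdint>

    // What LowerAtomicCmpXchgInst emits: load the original value,
    // select between the new and original value on equality, store
    // the result, and hand the original value to the cmpxchg's users.
    uint32_t loweredCmpXchg(uint32_t *ptr, uint32_t cmp, uint32_t val) {
      uint32_t orig = *ptr;                      // LoadInst
      uint32_t res = (orig == cmp) ? val : orig; // icmp eq + select
      *ptr = res;                                // StoreInst
      return orig;                               // cmpxchg result
    }
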
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 7ed3db6..eeb8931 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -54,7 +54,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
if (OpC->isZero()) continue; // No offset.
// Handle struct indices, which add their field offset to the pointer.
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
continue;
}
@@ -384,7 +384,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
// If this is a store, see if we can merge it in.
- if (NextStore->isVolatile()) break;
+ if (!NextStore->isSimple()) break;
// Check to see if this stored value is of the same byte-splattable value.
if (ByteVal != isBytewiseValue(NextStore->getOperand(0)))
@@ -448,7 +448,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
// Determine alignment
unsigned Alignment = Range.Alignment;
if (Alignment == 0) {
- const Type *EltType =
+ Type *EltType =
cast<PointerType>(StartPtr->getType())->getElementType();
Alignment = TD->getABITypeAlignment(EltType);
}
@@ -479,7 +479,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
- if (SI->isVolatile()) return false;
+ if (!SI->isSimple()) return false;
if (TD == 0) return false;
@@ -487,7 +487,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
// happen to be using a load-store pair to implement it, rather than
// a memcpy.
if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) {
- if (!LI->isVolatile() && LI->hasOneUse() &&
+ if (LI->isSimple() && LI->hasOneUse() &&
LI->getParent() == SI->getParent()) {
MemDepResult ldep = MD->getDependency(LI);
CallInst *C = 0;
@@ -616,7 +616,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
if (!A->hasStructRetAttr())
return false;
- const Type *StructTy = cast<PointerType>(A->getType())->getElementType();
+ Type *StructTy = cast<PointerType>(A->getType())->getElementType();
uint64_t destSize = TD->getTypeAllocSize(StructTy);
if (destSize < srcSize)
@@ -860,7 +860,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) {
// Find out what feeds this byval argument.
Value *ByValArg = CS.getArgument(ArgNo);
- const Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType();
+ Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
uint64_t ByValSize = TD->getTypeAllocSize(ByValTy);
MemDepResult DepInfo =
MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize),
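
tryMergingIntoMemset (touched above) only folds stores whose value is a single repeated byte, which is what isBytewiseValue checks. A minimal 32-bit version of that test, illustrating the idea:

    #include <cstdint>

    // True when v is one byte repeated four times, e.g. 0x00000000 or
    // 0xABABABAB; that byte is the value memset would write.
    bool isBytewiseValue32(uint32_t v, uint8_t &byteOut) {
      byteOut = (uint8_t)v;
      return v == 0x01010101u * byteOut;
    }
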
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
index ee132d3..da74e9c 100644
--- a/lib/Transforms/Scalar/ObjCARC.cpp
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -180,7 +180,7 @@ static bool IsPotentialUse(const Value *Op) {
Arg->hasStructRetAttr())
return false;
// Only consider values with pointer types, and not function pointers.
- const PointerType *Ty = dyn_cast<PointerType>(Op->getType());
+ PointerType *Ty = dyn_cast<PointerType>(Op->getType());
if (!Ty || isa<FunctionType>(Ty->getElementType()))
return false;
// Conservatively assume anything else is a potential use.
@@ -213,8 +213,8 @@ static InstructionClass GetFunctionClass(const Function *F) {
const Argument *A0 = AI++;
if (AI == AE)
// Argument is a pointer.
- if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
- const Type *ETy = PTy->getElementType();
+ if (PointerType *PTy = dyn_cast<PointerType>(A0->getType())) {
+ Type *ETy = PTy->getElementType();
// Argument is i8*.
if (ETy->isIntegerTy(8))
return StringSwitch<InstructionClass>(F->getName())
@@ -234,7 +234,7 @@ static InstructionClass GetFunctionClass(const Function *F) {
.Default(IC_CallOrUser);
// Argument is i8**
- if (const PointerType *Pte = dyn_cast<PointerType>(ETy))
+ if (PointerType *Pte = dyn_cast<PointerType>(ETy))
if (Pte->getElementType()->isIntegerTy(8))
return StringSwitch<InstructionClass>(F->getName())
.Case("objc_loadWeakRetained", IC_LoadWeakRetained)
@@ -246,11 +246,11 @@ static InstructionClass GetFunctionClass(const Function *F) {
// Two arguments, first is i8**.
const Argument *A1 = AI++;
if (AI == AE)
- if (const PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
- if (const PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
+ if (PointerType *PTy = dyn_cast<PointerType>(A0->getType()))
+ if (PointerType *Pte = dyn_cast<PointerType>(PTy->getElementType()))
if (Pte->getElementType()->isIntegerTy(8))
- if (const PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
- const Type *ETy1 = PTy1->getElementType();
+ if (PointerType *PTy1 = dyn_cast<PointerType>(A1->getType())) {
+ Type *ETy1 = PTy1->getElementType();
// Second argument is i8*
if (ETy1->isIntegerTy(8))
return StringSwitch<InstructionClass>(F->getName())
@@ -258,7 +258,7 @@ static InstructionClass GetFunctionClass(const Function *F) {
.Case("objc_initWeak", IC_InitWeak)
.Default(IC_CallOrUser);
// Second argument is i8**.
- if (const PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
+ if (PointerType *Pte1 = dyn_cast<PointerType>(ETy1))
if (Pte1->getElementType()->isIntegerTy(8))
return StringSwitch<InstructionClass>(F->getName())
.Case("objc_moveWeak", IC_MoveWeak)
@@ -344,6 +344,10 @@ static InstructionClass GetInstructionClass(const Value *V) {
break;
default:
// For anything else, check all the operands.
+ // Note that this includes both operands of a Store: while the first
+ // operand isn't actually being dereferenced, it is being stored to
+ // memory where we can no longer track who might read it and dereference
+ // it, so we have to consider it potentially used.
for (User::const_op_iterator OI = I->op_begin(), OE = I->op_end();
OI != OE; ++OI)
if (IsPotentialUse(*OI))
@@ -421,9 +425,10 @@ static bool IsAlwaysTail(InstructionClass Class) {
/// IsNoThrow - Test if the given class represents instructions which are always
/// safe to mark with the nounwind attribute.
static bool IsNoThrow(InstructionClass Class) {
+ // objc_retainBlock is not nounwind because it calls user copy constructors
+ // which could theoretically throw.
return Class == IC_Retain ||
Class == IC_RetainRV ||
- Class == IC_RetainBlock ||
Class == IC_Release ||
Class == IC_Autorelease ||
Class == IC_AutoreleaseRV ||
@@ -515,6 +520,10 @@ static bool IsObjCIdentifiedObject(const Value *V) {
const Value *Pointer =
StripPointerCastsAndObjCCalls(LI->getPointerOperand());
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Pointer)) {
+ // A constant pointer can't be pointing to an object on the heap. It may
+ // be reference-counted, but it won't be deleted.
+ if (GV->isConstant())
+ return true;
StringRef Name = GV->getName();
// These special variables are known to hold values which are not
// reference-counted pointers.
@@ -738,7 +747,6 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
switch (GetBasicInstructionClass(CS.getInstruction())) {
case IC_Retain:
case IC_RetainRV:
- case IC_RetainBlock:
case IC_Autorelease:
case IC_AutoreleaseRV:
case IC_NoopCast:
@@ -746,6 +754,8 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) {
case IC_FusedRetainAutorelease:
case IC_FusedRetainAutoreleaseRV:
// These functions don't access any memory visible to the compiler.
+ // Note that this doesn't include objc_retainBlock, because it updates
+ // pointers when it copies block data.
return NoModRef;
default:
break;
@@ -877,7 +887,9 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
// usually can't sink them past other calls, which would be the main
// case where it would be useful.
-/// TODO: The pointer returned from objc_loadWeakRetained is retained.
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
#include "llvm/GlobalAlias.h"
#include "llvm/Constants.h"
@@ -1098,16 +1110,16 @@ static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
if (A == S_None || B == S_None)
return S_None;
- // Note that we can't merge S_CanRelease and S_Use.
if (A > B) std::swap(A, B);
if (TopDown) {
// Choose the side which is further along in the sequence.
- if (A == S_Retain && (B == S_CanRelease || B == S_Use))
+ if ((A == S_Retain || A == S_CanRelease) &&
+ (B == S_CanRelease || B == S_Use))
return B;
} else {
// Choose the side which is further along in the sequence.
if ((A == S_Use || A == S_CanRelease) &&
- (B == S_Release || B == S_Stop || B == S_MovableRelease))
+ (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
return A;
// If both sides are releases, choose the more conservative one.
if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
@@ -1124,13 +1136,19 @@ namespace {
/// retain-decrement-use-release sequence or release-use-decrement-retain
/// reverse sequence.
struct RRInfo {
- /// KnownIncremented - After an objc_retain, the reference count of the
- /// referenced object is known to be positive. Similarly, before an
- /// objc_release, the reference count of the referenced object is known to
- /// be positive. If there are retain-release pairs in code regions where the
- /// retain count is known to be positive, they can be eliminated, regardless
- /// of any side effects between them.
- bool KnownIncremented;
+ /// KnownSafe - After an objc_retain, the reference count of the referenced
+ /// object is known to be positive. Similarly, before an objc_release, the
+ /// reference count of the referenced object is known to be positive. If
+ /// there are retain-release pairs in code regions where the retain count
+ /// is known to be positive, they can be eliminated, regardless of any side
+ /// effects between them.
+ ///
+ /// Also, a retain+release pair nested within another retain+release
+ /// pair all on the known same pointer value can be eliminated, regardless
+ /// of any intervening side effects.
+ ///
+ /// KnownSafe is true when either of these conditions is satisfied.
+ bool KnownSafe;
/// IsRetainBlock - True if the Calls are objc_retainBlock calls (as
/// opposed to objc_retain calls).
@@ -1153,7 +1171,7 @@ namespace {
SmallPtrSet<Instruction *, 2> ReverseInsertPts;
RRInfo() :
- KnownIncremented(false), IsRetainBlock(false), IsTailCallRelease(false),
+ KnownSafe(false), IsRetainBlock(false), IsTailCallRelease(false),
ReleaseMetadata(0) {}
void clear();
@@ -1161,7 +1179,7 @@ namespace {
}
void RRInfo::clear() {
- KnownIncremented = false;
+ KnownSafe = false;
IsRetainBlock = false;
IsTailCallRelease = false;
ReleaseMetadata = 0;
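The nesting condition described in the KnownSafe comment above covers shapes like this (signatures simplified to void*; a hedged illustration, not compiler output):

    // Illustration: the inner retain/release pair on 'p' is nested inside an
    // outer pair on the same pointer, so the count is known positive across
    // Work() and the inner pair is removable whatever Work() does.
    extern "C" void *objc_retain(void *);
    extern "C" void objc_release(void *);
    void Work();

    void Example(void *p) {
      objc_retain(p);   // outer retain
      objc_retain(p);   // inner retain   \
      Work();           //                 > removable: count known > 0
      objc_release(p);  // inner release  /
      objc_release(p);  // outer release
    }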
@@ -1176,6 +1194,9 @@ namespace {
/// RefCount - The known minimum number of reference count increments.
unsigned RefCount;
+ /// NestCount - The known minimum level of retain+release nesting.
+ unsigned NestCount;
+
/// Seq - The current position in the sequence.
Sequence Seq;
@@ -1184,7 +1205,11 @@ namespace {
/// TODO: Encapsulate this better.
RRInfo RRI;
- PtrState() : RefCount(0), Seq(S_None) {}
+ PtrState() : RefCount(0), NestCount(0), Seq(S_None) {}
+
+ void SetAtLeastOneRefCount() {
+ if (RefCount == 0) RefCount = 1;
+ }
void IncrementRefCount() {
if (RefCount != UINT_MAX) ++RefCount;
@@ -1194,14 +1219,22 @@ namespace {
if (RefCount != 0) --RefCount;
}
- void ClearRefCount() {
- RefCount = 0;
- }
-
bool IsKnownIncremented() const {
return RefCount > 0;
}
+ void IncrementNestCount() {
+ if (NestCount != UINT_MAX) ++NestCount;
+ }
+
+ void DecrementNestCount() {
+ if (NestCount != 0) --NestCount;
+ }
+
+ bool IsKnownNested() const {
+ return NestCount > 0;
+ }
+
void SetSeq(Sequence NewSeq) {
Seq = NewSeq;
}
@@ -1233,6 +1266,7 @@ void
PtrState::Merge(const PtrState &Other, bool TopDown) {
Seq = MergeSeqs(Seq, Other.Seq, TopDown);
RefCount = std::min(RefCount, Other.RefCount);
+ NestCount = std::min(NestCount, Other.NestCount);
// We can't merge a plain objc_retain with an objc_retainBlock.
if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
@@ -1245,7 +1279,7 @@ PtrState::Merge(const PtrState &Other, bool TopDown) {
if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
RRI.ReleaseMetadata = 0;
- RRI.KnownIncremented = RRI.KnownIncremented && Other.RRI.KnownIncremented;
+ RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease;
RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
RRI.ReverseInsertPts.insert(Other.RRI.ReverseInsertPts.begin(),
@@ -1316,7 +1350,7 @@ namespace {
}
void clearBottomUpPointers() {
- PerPtrTopDown.clear();
+ PerPtrBottomUp.clear();
}
void clearTopDownPointers() {
@@ -1334,6 +1368,12 @@ namespace {
unsigned GetAllPathCount() const {
return TopDownPathCount * BottomUpPathCount;
}
+
+ /// isVisitedTopDown - Test whether the block for this BBState has been
+ /// visited by the top-down portion of the algorithm.
+ bool isVisitedTopDown() const {
+ return TopDownPathCount != 0;
+ }
};
}
@@ -1364,7 +1404,7 @@ void BBState::MergePred(const BBState &Other) {
/*TopDown=*/true);
}
- // For each entry in our set, if the other set doens't have an entry with the
+ // For each entry in our set, if the other set doesn't have an entry with the
// same key, force it to merge with an empty entry.
for (ptr_iterator MI = top_down_ptr_begin(),
ME = top_down_ptr_end(); MI != ME; ++MI)
@@ -1389,7 +1429,7 @@ void BBState::MergeSucc(const BBState &Other) {
/*TopDown=*/false);
}
- // For each entry in our set, if the other set doens't have an entry
+ // For each entry in our set, if the other set doesn't have an entry
// with the same key, force it to merge with an empty entry.
for (ptr_iterator MI = bottom_up_ptr_begin(),
ME = bottom_up_ptr_end(); MI != ME; ++MI)
@@ -1406,15 +1446,11 @@ namespace {
/// Run - A flag indicating whether this optimization pass should run.
bool Run;
- /// RetainFunc, RelaseFunc - Declarations for objc_retain,
- /// objc_retainBlock, and objc_release.
- Function *RetainFunc, *RetainBlockFunc, *RetainRVFunc, *ReleaseFunc;
-
/// RetainRVCallee, etc. - Declarations for ObjC runtime
/// functions, for use in creating calls to them. These are initialized
/// lazily to avoid cluttering up the Module with unused declarations.
Constant *RetainRVCallee, *AutoreleaseRVCallee, *ReleaseCallee,
- *RetainCallee, *AutoreleaseCallee;
+ *RetainCallee, *RetainBlockCallee, *AutoreleaseCallee;
/// UsedInThisFunction - Flags which determine whether each of the
/// interesting runtime functions is in fact used in the current function.
@@ -1428,6 +1464,7 @@ namespace {
Constant *getAutoreleaseRVCallee(Module *M);
Constant *getReleaseCallee(Module *M);
Constant *getRetainCallee(Module *M);
+ Constant *getRetainBlockCallee(Module *M);
Constant *getAutoreleaseCallee(Module *M);
void OptimizeRetainCall(Function &F, Instruction *Retain);
@@ -1452,11 +1489,13 @@ namespace {
void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
MapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases,
- SmallVectorImpl<Instruction *> &DeadInsts);
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ Module *M);
bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases);
+ DenseMap<Value *, RRInfo> &Releases,
+ Module *M);
void OptimizeWeakCalls(Function &F);
@@ -1501,7 +1540,7 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
std::vector<Type *> Params;
Params.push_back(I8X);
- const FunctionType *FTy =
+ FunctionType *FTy =
FunctionType::get(I8X, Params, /*isVarArg=*/false);
AttrListPtr Attributes;
Attributes.addAttr(~0u, Attribute::NoUnwind);
@@ -1518,7 +1557,7 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
std::vector<Type *> Params;
Params.push_back(I8X);
- const FunctionType *FTy =
+ FunctionType *FTy =
FunctionType::get(I8X, Params, /*isVarArg=*/false);
AttrListPtr Attributes;
Attributes.addAttr(~0u, Attribute::NoUnwind);
@@ -1561,6 +1600,23 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) {
return RetainCallee;
}
+Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
+ if (!RetainBlockCallee) {
+ LLVMContext &C = M->getContext();
+ std::vector<Type *> Params;
+ Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C)));
+ AttrListPtr Attributes;
+ // objc_retainBlock is not nounwind because it calls user copy constructors
+ // which could theoretically throw.
+ RetainBlockCallee =
+ M->getOrInsertFunction(
+ "objc_retainBlock",
+ FunctionType::get(Params[0], Params, /*isVarArg=*/false),
+ Attributes);
+ }
+ return RetainBlockCallee;
+}
+
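The lazy-declaration pattern shared by these getters generalizes; a minimal standalone sketch (the runtime function name is a placeholder):

    // Sketch: create the runtime declaration only on first use, so unused
    // functions never clutter the Module with dead declarations.
    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/Module.h"
    #include <vector>
    using namespace llvm;

    static Constant *getLazyCallee(Module *M, Constant *&Cache) {
      if (!Cache) {
        LLVMContext &C = M->getContext();
        Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
        std::vector<Type *> Params(1, I8X);
        Cache = M->getOrInsertFunction(
            "my_runtime_fn",
            FunctionType::get(I8X, Params, /*isVarArg=*/false));
      }
      return Cache;
    }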
Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
if (!AutoreleaseCallee) {
LLVMContext &C = M->getContext();
@@ -1904,12 +1960,19 @@ void
ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV) {
// Check for a return of the pointer value.
const Value *Ptr = GetObjCArg(AutoreleaseRV);
- for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
- UI != UE; ++UI) {
- const User *I = *UI;
- if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
- return;
- }
+ SmallVector<const Value *, 2> Users;
+ Users.push_back(Ptr);
+ do {
+ Ptr = Users.pop_back_val();
+ for (Value::const_use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
+ UI != UE; ++UI) {
+ const User *I = *UI;
+ if (isa<ReturnInst>(I) || GetBasicInstructionClass(I) == IC_RetainRV)
+ return;
+ if (isa<BitCastInst>(I))
+ Users.push_back(I);
+ }
+ } while (!Users.empty());
Changed = true;
++NumPeeps;
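The bitcast-transparent user walk introduced above is a small reusable idiom; restated standalone (Pred stands in for the return/retainRV test):

    // Sketch: scan all users of V, looking through bitcasts, and report
    // whether any transitive user satisfies Pred.
    #include "llvm/Instructions.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    static bool AnyUserThroughBitCasts(const Value *V,
                                       bool (*Pred)(const User *)) {
      SmallVector<const Value *, 4> Worklist;
      Worklist.push_back(V);
      do {
        const Value *Cur = Worklist.pop_back_val();
        for (Value::const_use_iterator UI = Cur->use_begin(),
             UE = Cur->use_end(); UI != UE; ++UI) {
          const User *U = *UI;
          if (Pred(U))
            return true;
          if (isa<BitCastInst>(U))
            Worklist.push_back(U);
        }
      } while (!Worklist.empty());
      return false;
    }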
@@ -1953,7 +2016,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
case IC_DestroyWeak: {
CallInst *CI = cast<CallInst>(Inst);
if (isNullOrUndef(CI->getArgOperand(0))) {
- const Type *Ty = CI->getArgOperand(0)->getType();
+ Type *Ty = CI->getArgOperand(0)->getType();
new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
Constant::getNullValue(Ty),
CI);
@@ -1968,7 +2031,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
CallInst *CI = cast<CallInst>(Inst);
if (isNullOrUndef(CI->getArgOperand(0)) ||
isNullOrUndef(CI->getArgOperand(1))) {
- const Type *Ty = CI->getArgOperand(0)->getType();
+ Type *Ty = CI->getArgOperand(0)->getType();
new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
Constant::getNullValue(Ty),
CI);
@@ -2090,7 +2153,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
++NumPartialNoops;
// Clone the call into each predecessor that has a non-null value.
CallInst *CInst = cast<CallInst>(Inst);
- const Type *ParamTy = CInst->getArgOperand(0)->getType();
+ Type *ParamTy = CInst->getArgOperand(0)->getType();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *Incoming =
StripPointerCastsAndObjCCalls(PN->getIncomingValue(i));
@@ -2132,41 +2195,49 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
bool SomeSuccHasSame = false;
bool AllSuccsHaveSame = true;
- for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI)
- switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) {
+ PtrState &S = MyStates.getPtrTopDownState(Arg);
+ for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
+ PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg);
+ switch (SuccS.GetSeq()) {
case S_None:
- case S_CanRelease:
- MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
- SomeSuccHasSame = false;
- break;
+ case S_CanRelease: {
+ if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe)
+ S.ClearSequenceProgress();
+ continue;
+ }
case S_Use:
SomeSuccHasSame = true;
break;
case S_Stop:
case S_Release:
case S_MovableRelease:
- AllSuccsHaveSame = false;
+ if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe)
+ AllSuccsHaveSame = false;
break;
case S_Retain:
llvm_unreachable("bottom-up pointer in retain state!");
}
+ }
// If the state at the other end of any of the successor edges
// matches the current state, require all edges to match. This
// guards against loops in the middle of a sequence.
if (SomeSuccHasSame && !AllSuccsHaveSame)
- MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+ S.ClearSequenceProgress();
}
case S_CanRelease: {
const Value *Arg = I->first;
const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
bool SomeSuccHasSame = false;
bool AllSuccsHaveSame = true;
- for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI)
- switch (BBStates[*SI].getPtrBottomUpState(Arg).GetSeq()) {
- case S_None:
- MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
- SomeSuccHasSame = false;
- break;
+ PtrState &S = MyStates.getPtrTopDownState(Arg);
+ for (succ_const_iterator SI(TI), SE(TI, false); SI != SE; ++SI) {
+ PtrState &SuccS = BBStates[*SI].getPtrBottomUpState(Arg);
+ switch (SuccS.GetSeq()) {
+ case S_None: {
+ if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe)
+ S.ClearSequenceProgress();
+ continue;
+ }
case S_CanRelease:
SomeSuccHasSame = true;
break;
@@ -2174,16 +2245,18 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
case S_Release:
case S_MovableRelease:
case S_Use:
- AllSuccsHaveSame = false;
+ if (!S.RRI.KnownSafe && !SuccS.RRI.KnownSafe)
+ AllSuccsHaveSame = false;
break;
case S_Retain:
llvm_unreachable("bottom-up pointer in retain state!");
}
+ }
// If the state at the other end of any of the successor edges
// matches the current state, require all edges to match. This
// guards against loops in the middle of a sequence.
if (SomeSuccHasSame && !AllSuccsHaveSame)
- MyStates.getPtrTopDownState(Arg).ClearSequenceProgress();
+ S.ClearSequenceProgress();
}
}
}
@@ -2207,6 +2280,8 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
if (Succ == BB)
continue;
DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+ // If we haven't seen this node yet, then we've found a CFG cycle.
+ // Be optimistic here; it's CheckForCFGHazards' job to detect trouble.
if (I == BBStates.end())
continue;
MyStates.InitFromSucc(I->second);
@@ -2245,11 +2320,12 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
S.SetSeqToRelease(Inst->getMetadata(ImpreciseReleaseMDKind));
S.RRI.clear();
- S.RRI.KnownIncremented = S.IsKnownIncremented();
+ S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented();
S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
S.RRI.Calls.insert(Inst);
S.IncrementRefCount();
+ S.IncrementNestCount();
break;
}
case IC_RetainBlock:
@@ -2259,6 +2335,13 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
PtrState &S = MyStates.getPtrBottomUpState(Arg);
S.DecrementRefCount();
+ S.SetAtLeastOneRefCount();
+ S.DecrementNestCount();
+
+ // An objc_retainBlock call with just a use still needs to be kept,
+ // because it may be copying a block from the stack to the heap.
+ if (Class == IC_RetainBlock && S.GetSeq() == S_Use)
+ S.SetSeq(S_CanRelease);
switch (S.GetSeq()) {
case S_Stop:
@@ -2281,7 +2364,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
case S_Retain:
llvm_unreachable("bottom-up pointer in retain state!");
}
- break;
+ continue;
}
case IC_AutoreleasepoolPop:
// Conservatively, clear MyStates for all known pointers.
@@ -2305,26 +2388,22 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
PtrState &S = MI->second;
Sequence Seq = S.GetSeq();
- // Check for possible retains and releases.
+ // Check for possible releases.
if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
- // Check for a retain (we're going bottom-up here).
S.DecrementRefCount();
-
- // Check for a release.
- if (!IsRetain(Class) && Class != IC_RetainBlock)
- switch (Seq) {
- case S_Use:
- S.SetSeq(S_CanRelease);
- continue;
- case S_CanRelease:
- case S_Release:
- case S_MovableRelease:
- case S_Stop:
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
+ switch (Seq) {
+ case S_Use:
+ S.SetSeq(S_CanRelease);
+ continue;
+ case S_CanRelease:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Stop:
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
}
// Check for possible direct uses.
@@ -2332,14 +2411,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
case S_Release:
case S_MovableRelease:
if (CanUse(Inst, Ptr, PA, Class)) {
- S.RRI.ReverseInsertPts.clear();
+ assert(S.RRI.ReverseInsertPts.empty());
S.RRI.ReverseInsertPts.insert(Inst);
S.SetSeq(S_Use);
} else if (Seq == S_Release &&
(Class == IC_User || Class == IC_CallOrUser)) {
// Non-movable releases depend on any possible objc pointer use.
S.SetSeq(S_Stop);
- S.RRI.ReverseInsertPts.clear();
+ assert(S.RRI.ReverseInsertPts.empty());
S.RRI.ReverseInsertPts.insert(Inst);
}
break;
@@ -2378,14 +2457,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
if (Pred == BB)
continue;
DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
- if (I == BBStates.end())
+ assert(I != BBStates.end());
+ // If we haven't seen this node yet, then we've found a CFG cycle.
+ // Be optimistic here; it's CheckForCFGHazards' job to detect trouble.
+ if (!I->second.isVisitedTopDown())
continue;
MyStates.InitFromPred(I->second);
while (PI != PE) {
Pred = *PI++;
if (Pred != BB) {
I = BBStates.find(Pred);
- if (I != BBStates.end())
+ assert(I != BBStates.end());
+ if (I->second.isVisitedTopDown())
MyStates.MergePred(I->second);
}
}
@@ -2422,18 +2505,23 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
S.SetSeq(S_Retain);
S.RRI.clear();
S.RRI.IsRetainBlock = Class == IC_RetainBlock;
- S.RRI.KnownIncremented = S.IsKnownIncremented();
+ // Don't check S.IsKnownIncremented() here because it's not
+ // sufficient.
+ S.RRI.KnownSafe = S.IsKnownNested();
S.RRI.Calls.insert(Inst);
}
+ S.SetAtLeastOneRefCount();
S.IncrementRefCount();
- break;
+ S.IncrementNestCount();
+ continue;
}
case IC_Release: {
Arg = GetObjCArg(Inst);
PtrState &S = MyStates.getPtrTopDownState(Arg);
S.DecrementRefCount();
+ S.DecrementNestCount();
switch (S.GetSeq()) {
case S_Retain:
@@ -2478,16 +2566,12 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
Sequence Seq = S.GetSeq();
// Check for possible releases.
- if (!IsRetain(Class) && Class != IC_RetainBlock &&
- CanAlterRefCount(Inst, Ptr, PA, Class)) {
- // Check for a release.
+ if (CanAlterRefCount(Inst, Ptr, PA, Class)) {
S.DecrementRefCount();
-
- // Check for a release.
switch (Seq) {
case S_Retain:
S.SetSeq(S_CanRelease);
- S.RRI.ReverseInsertPts.clear();
+ assert(S.RRI.ReverseInsertPts.empty());
S.RRI.ReverseInsertPts.insert(Inst);
// One call can't cause a transition from S_Retain to S_CanRelease
@@ -2511,8 +2595,18 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
if (CanUse(Inst, Ptr, PA, Class))
S.SetSeq(S_Use);
break;
- case S_Use:
case S_Retain:
+ // An objc_retainBlock call may be responsible for copying the block
+ // data from the stack to the heap. Model this by moving it straight
+ // from S_Retain to S_Use.
+ if (S.RRI.IsRetainBlock &&
+ CanUse(Inst, Ptr, PA, Class)) {
+ assert(S.RRI.ReverseInsertPts.empty());
+ S.RRI.ReverseInsertPts.insert(Inst);
+ S.SetSeq(S_Use);
+ }
+ break;
+ case S_Use:
case S_None:
break;
case S_Stop:
@@ -2533,28 +2627,43 @@ ObjCARCOpt::Visit(Function &F,
DenseMap<const BasicBlock *, BBState> &BBStates,
MapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases) {
- // Use postorder for bottom-up, and reverse-postorder for top-down, because we
+ // Use reverse-postorder on the reverse CFG for bottom-up, because we
// magically know that loops will be well behaved, i.e. they won't repeatedly
- // call retain on a single pointer without doing a release.
+ // call retain on a single pointer without doing a release. We can't use
+ // ReversePostOrderTraversal here because we want to walk up from each
+ // function exit point.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> Stack;
+ SmallVector<BasicBlock *, 16> Order;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ BasicBlock *BB = I;
+ if (BB->getTerminator()->getNumSuccessors() == 0)
+ Stack.push_back(std::make_pair(BB, pred_begin(BB)));
+ }
+ while (!Stack.empty()) {
+ pred_iterator End = pred_end(Stack.back().first);
+ while (Stack.back().second != End) {
+ BasicBlock *BB = *Stack.back().second++;
+ if (Visited.insert(BB))
+ Stack.push_back(std::make_pair(BB, pred_begin(BB)));
+ }
+ Order.push_back(Stack.pop_back_val().first);
+ }
bool BottomUpNestingDetected = false;
- SmallVector<BasicBlock *, 8> PostOrder;
- for (po_iterator<Function *> I = po_begin(&F), E = po_end(&F); I != E; ++I) {
+ for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator I =
+ Order.rbegin(), E = Order.rend(); I != E; ++I) {
BasicBlock *BB = *I;
- PostOrder.push_back(BB);
-
BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
}
- // Iterate through the post-order in reverse order, achieving a
- // reverse-postorder traversal. We don't use the ReversePostOrderTraversal
- // class here because it works by computing its own full postorder iteration,
- // recording the sequence, and playing it back in reverse. Since we're already
- // doing a full iteration above, we can just record the sequence manually and
- // avoid the cost of having ReversePostOrderTraversal compute it.
+ // Use regular reverse-postorder for top-down.
bool TopDownNestingDetected = false;
- for (SmallVectorImpl<BasicBlock *>::const_reverse_iterator
- RI = PostOrder.rbegin(), RE = PostOrder.rend(); RI != RE; ++RI)
- TopDownNestingDetected |= VisitTopDown(*RI, BBStates, Releases);
+ typedef ReversePostOrderTraversal<Function *> RPOTType;
+ RPOTType RPOT(&F);
+ for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases);
+ }
return TopDownNestingDetected && BottomUpNestingDetected;
}
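The hand-rolled traversal above computes a postorder of the reverse CFG seeded at every exit block. A standalone restatement, recomputing pred_end() at each step so the comparison always refers to the current stack top:

    // Sketch: iterative DFS over predecessor edges from every exit block
    // (a block with no successors). Walking Order in reverse then gives
    // the reverse-postorder on the reverse CFG used by the bottom-up pass.
    #include "llvm/Function.h"
    #include "llvm/Support/CFG.h"
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    static void ReverseCFGPostOrder(Function &F,
                                    SmallVectorImpl<BasicBlock *> &Order) {
      SmallPtrSet<BasicBlock *, 16> Visited;
      SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> Stack;
      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
        BasicBlock *BB = I;
        if (BB->getTerminator()->getNumSuccessors() == 0 &&
            Visited.insert(BB))
          Stack.push_back(std::make_pair(BB, pred_begin(BB)));
      }
      while (!Stack.empty()) {
        if (Stack.back().second == pred_end(Stack.back().first)) {
          Order.push_back(Stack.pop_back_val().first);
          continue;
        }
        BasicBlock *Pred = *Stack.back().second++;
        if (Visited.insert(Pred))
          Stack.push_back(std::make_pair(Pred, pred_begin(Pred)));
      }
    }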
@@ -2565,12 +2674,10 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
RRInfo &ReleasesToMove,
MapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases,
- SmallVectorImpl<Instruction *> &DeadInsts) {
- const Type *ArgTy = Arg->getType();
- const Type *ParamTy =
- (RetainRVFunc ? RetainRVFunc :
- RetainFunc ? RetainFunc :
- RetainBlockFunc)->arg_begin()->getType();
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ Module *M) {
+ Type *ArgTy = Arg->getType();
+ Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
// Insert the new retain and release calls.
for (SmallPtrSet<Instruction *, 2>::const_iterator
@@ -2581,7 +2688,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
new BitCastInst(Arg, ParamTy, "", InsertPt);
CallInst *Call =
CallInst::Create(RetainsToMove.IsRetainBlock ?
- RetainBlockFunc : RetainFunc,
+ getRetainBlockCallee(M) : getRetainCallee(M),
MyArg, "", InsertPt);
Call->setDoesNotThrow();
if (!RetainsToMove.IsRetainBlock)
@@ -2598,8 +2705,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
// The invoke's return value isn't available in the unwind block,
// but our releases will never depend on it, because they must be
// paired with retains from before the invoke.
- InsertPts[0] = II->getNormalDest()->getFirstNonPHI();
- InsertPts[1] = II->getUnwindDest()->getFirstNonPHI();
+ InsertPts[0] = II->getNormalDest()->getFirstInsertionPt();
+ InsertPts[1] = II->getUnwindDest()->getFirstInsertionPt();
} else {
// Insert code immediately after the last use.
InsertPts[0] = llvm::next(BasicBlock::iterator(LastUse));
@@ -2609,7 +2716,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
Instruction *InsertPt = *I;
Value *MyArg = ArgTy == ParamTy ? Arg :
new BitCastInst(Arg, ParamTy, "", InsertPt);
- CallInst *Call = CallInst::Create(ReleaseFunc, MyArg, "", InsertPt);
+ CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg,
+ "", InsertPt);
// Attach a clang.imprecise_release metadata tag, if appropriate.
if (MDNode *M = ReleasesToMove.ReleaseMetadata)
Call->setMetadata(ImpreciseReleaseMDKind, M);
@@ -2640,7 +2748,8 @@ bool
ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
&BBStates,
MapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases) {
+ DenseMap<Value *, RRInfo> &Releases,
+ Module *M) {
bool AnyPairsCompletelyEliminated = false;
RRInfo RetainsToMove;
RRInfo ReleasesToMove;
@@ -2649,21 +2758,36 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
SmallVector<Instruction *, 8> DeadInsts;
for (MapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
- E = Retains.end(); I != E; ) {
- Value *V = (I++)->first;
+ E = Retains.end(); I != E; ++I) {
+ Value *V = I->first;
if (!V) continue; // blotted
Instruction *Retain = cast<Instruction>(V);
Value *Arg = GetObjCArg(Retain);
- // If the object being released is in static or stack storage, we know it's
+ // If the object being released is in static storage, we know it's
// not being managed by ObjC reference counting, so we can delete pairs
// regardless of what possible decrements or uses lie between them.
- bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
+ bool KnownSafe = isa<Constant>(Arg);
+
+ // Same for stack storage, unless this is an objc_retainBlock call,
+ // which is responsible for copying the block data from the stack to
+ // the heap.
+ if (!I->second.IsRetainBlock && isa<AllocaInst>(Arg))
+ KnownSafe = true;
+
+ // A constant pointer can't be pointing to an object on the heap. It may
+ // be reference-counted, but it won't be deleted.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(
+ StripPointerCastsAndObjCCalls(LI->getPointerOperand())))
+ if (GV->isConstant())
+ KnownSafe = true;
// If a pair happens in a region where it is known that the reference count
// is already incremented, we can similarly ignore possible decrements.
- bool KnownIncrementedTD = true, KnownIncrementedBU = true;
+ bool KnownSafeTD = true, KnownSafeBU = true;
// Connect the dots between the top-down-collected RetainsToMove and
// bottom-up-collected ReleasesToMove to form sets of related calls.
@@ -2683,7 +2807,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
MapVector<Value *, RRInfo>::const_iterator It = Retains.find(NewRetain);
assert(It != Retains.end());
const RRInfo &NewRetainRRI = It->second;
- KnownIncrementedTD &= NewRetainRRI.KnownIncremented;
+ KnownSafeTD &= NewRetainRRI.KnownSafe;
for (SmallPtrSet<Instruction *, 2>::const_iterator
LI = NewRetainRRI.Calls.begin(),
LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
@@ -2739,7 +2863,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
Releases.find(NewRelease);
assert(It != Releases.end());
const RRInfo &NewReleaseRRI = It->second;
- KnownIncrementedBU &= NewReleaseRRI.KnownIncremented;
+ KnownSafeBU &= NewReleaseRRI.KnownSafe;
for (SmallPtrSet<Instruction *, 2>::const_iterator
LI = NewReleaseRRI.Calls.begin(),
LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
@@ -2787,12 +2911,19 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
if (NewRetains.empty()) break;
}
- // If the pointer is known incremented, we can safely delete the pair
- // regardless of what's between them.
- if (KnownIncrementedTD || KnownIncrementedBU) {
+ // If the pointer is known incremented or nested, we can safely delete the
+ // pair regardless of what's between them.
+ if (KnownSafeTD || KnownSafeBU) {
RetainsToMove.ReverseInsertPts.clear();
ReleasesToMove.ReverseInsertPts.clear();
NewCount = 0;
+ } else {
+ // Determine whether the new insertion points we computed preserve the
+ // balance of retain and release calls through the program.
+ // TODO: If the fully aggressive solution isn't valid, try to find a
+ // less aggressive solution which is.
+ if (NewDelta != 0)
+ goto next_retain;
}
// Determine whether the original call points are balanced in the retain and
@@ -2803,18 +2934,12 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState>
if (OldDelta != 0)
goto next_retain;
- // Determine whether the new insertion points we computed preserve the
- // balance of retain and release calls through the program.
- // TODO: If the fully aggressive solution isn't valid, try to find a
- // less aggressive solution which is.
- if (NewDelta != 0)
- goto next_retain;
-
// Ok, everything checks out and we're all set. Let's move some code!
Changed = true;
AnyPairsCompletelyEliminated = NewCount == 0;
NumRRs += OldCount - NewCount;
- MoveCalls(Arg, RetainsToMove, ReleasesToMove, Retains, Releases, DeadInsts);
+ MoveCalls(Arg, RetainsToMove, ReleasesToMove,
+ Retains, Releases, DeadInsts, M);
next_retain:
NewReleases.clear();
@@ -2993,7 +3118,8 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
bool NestingDetected = Visit(F, BBStates, Retains, Releases);
// Transform.
- return PerformCodePlacement(BBStates, Retains, Releases) && NestingDetected;
+ return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
+ NestingDetected;
}
/// OptimizeReturns - Look for this pattern:
@@ -3072,7 +3198,8 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
// Check that there is nothing that can affect the reference
// count between the retain and the call.
- FindDependencies(CanChangeRetainCount, Arg, BB, Retain,
+ // Note that Retain need not be in BB.
+ FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain,
DependingInstructions, Visited, PA);
if (DependingInstructions.size() != 1)
goto next_block;
@@ -3117,12 +3244,6 @@ bool ObjCARCOpt::doInitialization(Module &M) {
ImpreciseReleaseMDKind =
M.getContext().getMDKindID("clang.imprecise_release");
- // Identify the declarations for objc_retain and friends.
- RetainFunc = M.getFunction("objc_retain");
- RetainBlockFunc = M.getFunction("objc_retainBlock");
- RetainRVFunc = M.getFunction("objc_retainAutoreleasedReturnValue");
- ReleaseFunc = M.getFunction("objc_release");
-
// Intuitively, objc_retain and others are nocapture; however, in practice
// they are not, because they return their argument value. And objc_release
// calls finalizers.
@@ -3132,6 +3253,7 @@ bool ObjCARCOpt::doInitialization(Module &M) {
AutoreleaseRVCallee = 0;
ReleaseCallee = 0;
RetainCallee = 0;
+ RetainBlockCallee = 0;
AutoreleaseCallee = 0;
return false;
@@ -3294,7 +3416,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
std::vector<Type *> Params;
Params.push_back(I8X);
- const FunctionType *FTy =
+ FunctionType *FTy =
FunctionType::get(I8X, Params, /*isVarArg=*/false);
AttrListPtr Attributes;
Attributes.addAttr(~0u, Attribute::NoUnwind);
@@ -3310,7 +3432,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
std::vector<Type *> Params;
Params.push_back(I8X);
- const FunctionType *FTy =
+ FunctionType *FTy =
FunctionType::get(I8X, Params, /*isVarArg=*/false);
AttrListPtr Attributes;
Attributes.addAttr(~0u, Attribute::NoUnwind);
@@ -3377,7 +3499,7 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
void ObjCARCContract::ContractRelease(Instruction *Release,
inst_iterator &Iter) {
LoadInst *Load = dyn_cast<LoadInst>(GetObjCArg(Release));
- if (!Load || Load->isVolatile()) return;
+ if (!Load || !Load->isSimple()) return;
// For now, require everything to be in one basic block.
BasicBlock *BB = Release->getParent();
@@ -3393,7 +3515,7 @@ void ObjCARCContract::ContractRelease(Instruction *Release,
!(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod)))
++I;
StoreInst *Store = dyn_cast<StoreInst>(I);
- if (!Store || Store->isVolatile()) return;
+ if (!Store || !Store->isSimple()) return;
if (Store->getPointerOperand() != Loc.Ptr) return;
Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand());
@@ -3411,8 +3533,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release,
++NumStoreStrongs;
LLVMContext &C = Release->getContext();
- const Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- const Type *I8XX = PointerType::getUnqual(I8X);
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
Value *Args[] = { Load->getPointerOperand(), New };
if (Args[0]->getType() != I8XX)
@@ -3548,7 +3670,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
if (Inst != UserInst && DT->dominates(Inst, UserInst)) {
Changed = true;
Instruction *Replacement = Inst;
- const Type *UseTy = U.get()->getType();
+ Type *UseTy = U.get()->getType();
if (PHINode *PHI = dyn_cast<PHINode>(UserInst)) {
// For PHI nodes, insert the bitcast in the predecessor block.
unsigned ValNo =
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index e6341ae..8f98a5b 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -309,7 +309,7 @@ void Reassociate::LinearizeExprTree(BinaryOperator *I,
std::swap(LHS, RHS);
bool Success = !I->swapOperands();
assert(Success && "swapOperands failed");
- Success = false;
+ (void)Success;
MadeChange = true;
} else if (RHSBO) {
// Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 083412e..196a847 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -156,7 +156,7 @@ namespace {
///
class SCCPSolver : public InstVisitor<SCCPSolver> {
const TargetData *TD;
- SmallPtrSet<BasicBlock*, 8> BBExecutable;// The BBs that are executable.
+ SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
DenseMap<Value*, LatticeVal> ValueState; // The state each value is in.
/// StructValueState - This maintains ValueState for values that have
@@ -241,7 +241,7 @@ public:
/// this method must be called.
void AddTrackedFunction(Function *F) {
// Add an entry, F -> undef.
- if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
MRVFunctionsTracked.insert(F);
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
@@ -302,7 +302,7 @@ public:
/// markAnythingOverdefined - Mark the specified value overdefined. This
/// works with both scalars and structs.
void markAnythingOverdefined(Value *V) {
- if (const StructType *STy = dyn_cast<StructType>(V->getType()))
+ if (StructType *STy = dyn_cast<StructType>(V->getType()))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
markOverdefined(getStructValueState(V, i), V);
else
@@ -417,7 +417,7 @@ private:
else if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C))
LV.markConstant(CS->getOperand(i)); // Constants are constant.
else if (isa<ConstantAggregateZero>(C)) {
- const Type *FieldTy = cast<StructType>(V->getType())->getElementType(i);
+ Type *FieldTy = cast<StructType>(V->getType())->getElementType(i);
LV.markConstant(Constant::getNullValue(FieldTy));
} else
LV.markOverdefined(); // Unknown sort of constant.
@@ -471,9 +471,9 @@ private:
/// UsersOfOverdefinedPHIs map for PN, remove them now.
void RemoveFromOverdefinedPHIs(Instruction *I, PHINode *PN) {
if (UsersOfOverdefinedPHIs.empty()) return;
- std::multimap<PHINode*, Instruction*>::iterator It, E;
- tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN);
- while (It != E) {
+ typedef std::multimap<PHINode*, Instruction*>::iterator ItTy;
+ std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN);
+ for (ItTy It = Range.first, E = Range.second; It != E;) {
if (It->second == I)
UsersOfOverdefinedPHIs.erase(It++);
else
@@ -486,9 +486,9 @@ private:
/// (Duplicate entries do not break anything directly, but can lead to
/// exponential growth of the table in rare cases.)
void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) {
- std::multimap<PHINode*, Instruction*>::iterator J, E;
- tie(J, E) = UsersOfOverdefinedPHIs.equal_range(PN);
- for (; J != E; ++J)
+ typedef std::multimap<PHINode*, Instruction*>::iterator ItTy;
+ std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(PN);
+ for (ItTy J = Range.first, E = Range.second; J != E; ++J)
if (J->second == I)
return;
UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I));
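The equal_range rewrite in these two helpers is the standard multimap scan; in isolation (a generic sketch, not part of the pass):

    // Sketch: visit every value stored under Key in a std::multimap,
    // using the iterator pair from equal_range instead of llvm::tie.
    #include <map>

    template <typename K, typename V, typename Fn>
    void ForEachValue(std::multimap<K, V> &M, const K &Key, Fn Visit) {
      typedef typename std::multimap<K, V>::iterator ItTy;
      std::pair<ItTy, ItTy> Range = M.equal_range(Key);
      for (ItTy I = Range.first, E = Range.second; I != E; ++I)
        Visit(I->second);
    }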
@@ -515,6 +515,7 @@ private:
void visitShuffleVectorInst(ShuffleVectorInst &I);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
+ void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
// Instructions that cannot be folded away.
void visitStoreInst (StoreInst &I);
@@ -528,8 +529,12 @@ private:
visitTerminatorInst(II);
}
void visitCallSite (CallSite CS);
+ void visitResumeInst (TerminatorInst &I) { /*returns void*/ }
void visitUnwindInst (TerminatorInst &I) { /*returns void*/ }
void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+ void visitFenceInst (FenceInst &I) { /*returns void*/ }
+ void visitAtomicCmpXchgInst (AtomicCmpXchgInst &I) { markOverdefined(&I); }
+ void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
void visitAllocaInst (Instruction &I) { markOverdefined(&I); }
void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); }
@@ -577,6 +582,10 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
}
if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) {
+ if (TI.getNumSuccessors() < 2) {
+ Succs[0] = true;
+ return;
+ }
LatticeVal SCValue = getValueState(SI->getCondition());
ConstantInt *CI = SCValue.getConstantInt();
@@ -637,6 +646,9 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
return true;
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (SI->getNumSuccessors() < 2)
+ return true;
+
LatticeVal SCValue = getValueState(SI->getCondition());
ConstantInt *CI = SCValue.getConstantInt();
@@ -692,13 +704,14 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
// There may be instructions using this PHI node that are not overdefined
// themselves. If so, make sure that they know that the PHI node operand
// changed.
- std::multimap<PHINode*, Instruction*>::iterator I, E;
- tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN);
- if (I == E)
+ typedef std::multimap<PHINode*, Instruction*>::iterator ItTy;
+ std::pair<ItTy, ItTy> Range = UsersOfOverdefinedPHIs.equal_range(&PN);
+
+ if (Range.first == Range.second)
return;
SmallVector<Instruction*, 16> Users;
- for (; I != E; ++I)
+ for (ItTy I = Range.first, E = Range.second; I != E; ++I)
Users.push_back(I->second);
while (!Users.empty())
visit(Users.pop_back_val());
@@ -772,7 +785,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) {
// Handle functions that return multiple values.
if (!TrackedMultipleRetVals.empty()) {
- if (const StructType *STy = dyn_cast<StructType>(ResultOp->getType()))
+ if (StructType *STy = dyn_cast<StructType>(ResultOp->getType()))
if (MRVFunctionsTracked.count(F))
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
@@ -825,7 +838,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
}
void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
- const StructType *STy = dyn_cast<StructType>(IVI.getType());
+ StructType *STy = dyn_cast<StructType>(IVI.getType());
if (STy == 0)
return markOverdefined(&IVI);
@@ -925,7 +938,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) {
// Could annihilate value.
if (I.getOpcode() == Instruction::And)
markConstant(IV, &I, Constant::getNullValue(I.getType()));
- else if (const VectorType *PT = dyn_cast<VectorType>(I.getType()))
+ else if (VectorType *PT = dyn_cast<VectorType>(I.getType()))
markConstant(IV, &I, Constant::getAllOnesValue(PT));
else
markConstant(IV, &I,
@@ -1179,8 +1192,8 @@ void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
}
Constant *Ptr = Operands[0];
- markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, &Operands[0]+1,
- Operands.size()-1));
+ ArrayRef<Constant *> Indices(Operands.begin() + 1, Operands.end());
+ markConstant(&I, ConstantExpr::getGetElementPtr(Ptr, Indices));
}
void SCCPSolver::visitStoreInst(StoreInst &SI) {
@@ -1278,7 +1291,7 @@ CallOverdefined:
// If we can constant fold this, mark the result of the call as a
// constant.
- if (Constant *C = ConstantFoldCall(F, Operands.data(), Operands.size()))
+ if (Constant *C = ConstantFoldCall(F, Operands))
return markConstant(I, C);
}
@@ -1303,7 +1316,7 @@ CallOverdefined:
continue;
}
- if (const StructType *STy = dyn_cast<StructType>(AI->getType())) {
+ if (StructType *STy = dyn_cast<StructType>(AI->getType())) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
LatticeVal CallArg = getStructValueState(*CAI, i);
mergeInValue(getStructValueState(AI, i), AI, CallArg);
@@ -1315,7 +1328,7 @@ CallOverdefined:
}
// If this is a single/zero retval case, see if we're tracking the function.
- if (const StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (StructType *STy = dyn_cast<StructType>(F->getReturnType())) {
if (!MRVFunctionsTracked.count(F))
goto CallOverdefined; // Not tracking this callee.
@@ -1419,67 +1432,116 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// Look for instructions which produce undef values.
if (I->getType()->isVoidTy()) continue;
- if (const StructType *STy = dyn_cast<StructType>(I->getType())) {
- // Only a few things that can be structs matter for undef. Just send
- // all their results to overdefined. We could be more precise than this
- // but it isn't worth bothering.
- if (isa<CallInst>(I) || isa<SelectInst>(I)) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- LatticeVal &LV = getStructValueState(I, i);
- if (LV.isUndefined())
- markOverdefined(LV, I);
- }
+ if (StructType *STy = dyn_cast<StructType>(I->getType())) {
+ // Only a few things that can be structs matter for undef.
+
+ // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+ if (CallSite CS = CallSite(I))
+ if (Function *F = CS.getCalledFunction())
+ if (MRVFunctionsTracked.count(F))
+ continue;
+
+ // extractvalue and insertvalue don't need to be marked; they are
+ // tracked as precisely as their operands.
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+ continue;
+
+ // Send the results of everything else to overdefined. We could be
+ // more precise than this but it isn't worth bothering.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ LatticeVal &LV = getStructValueState(I, i);
+ if (LV.isUndefined())
+ markOverdefined(LV, I);
}
continue;
}
-
+
LatticeVal &LV = getValueState(I);
if (!LV.isUndefined()) continue;
- // No instructions using structs need disambiguation.
- if (I->getOperand(0)->getType()->isStructTy())
+ // extractvalue is safe; check here because the argument is a struct.
+ if (isa<ExtractValueInst>(I))
continue;
- // Get the lattice values of the first two operands for use below.
+ // Compute the operand LatticeVals, for convenience below.
+ // Anything taking a struct is conservatively assumed to require
+ // overdefined markings.
+ if (I->getOperand(0)->getType()->isStructTy()) {
+ markOverdefined(I);
+ return true;
+ }
LatticeVal Op0LV = getValueState(I->getOperand(0));
LatticeVal Op1LV;
if (I->getNumOperands() == 2) {
- // No instructions using structs need disambiguation.
- if (I->getOperand(1)->getType()->isStructTy())
- continue;
-
- // If this is a two-operand instruction, and if both operands are
- // undefs, the result stays undef.
+ if (I->getOperand(1)->getType()->isStructTy()) {
+ markOverdefined(I);
+ return true;
+ }
+
Op1LV = getValueState(I->getOperand(1));
- if (Op0LV.isUndefined() && Op1LV.isUndefined())
- continue;
}
-
// If this is an instruction whose result is defined even if the input is
// not fully defined, propagate the information.
- const Type *ITy = I->getType();
+ Type *ITy = I->getType();
switch (I->getOpcode()) {
- default: break; // Leave the instruction as an undef.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ break; // Any undef -> undef
+ case Instruction::FSub:
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ // Floating-point binary operation: be conservative.
+ if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ markForcedConstant(I, Constant::getNullValue(ITy));
+ else
+ markOverdefined(I);
+ return true;
case Instruction::ZExt:
- // After a zero extend, we know the top part is zero. SExt doesn't have
- // to be handled here, because we don't know whether the top part is 1's
- // or 0's.
- case Instruction::SIToFP: // some FP values are not possible, just use 0.
- case Instruction::UIToFP: // some FP values are not possible, just use 0.
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ // undef -> 0; some outputs are impossible
markForcedConstant(I, Constant::getNullValue(ITy));
return true;
case Instruction::Mul:
case Instruction::And:
+ // Both operands undef -> undef
+ if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ break;
// undef * X -> 0. X could be zero.
// undef & X -> 0. X could be zero.
markForcedConstant(I, Constant::getNullValue(ITy));
return true;
case Instruction::Or:
+ // Both operands undef -> undef
+ if (Op0LV.isUndefined() && Op1LV.isUndefined())
+ break;
// undef | X -> -1. X could be -1.
markForcedConstant(I, Constant::getAllOnesValue(ITy));
return true;
+ case Instruction::Xor:
+ // undef ^ undef -> 0; strictly speaking, this is not
+ // necessary, but we try to be nice to people who expect this
+ // behavior in simple cases.
+ if (Op0LV.isUndefined() && Op1LV.isUndefined()) {
+ markForcedConstant(I, Constant::getNullValue(ITy));
+ return true;
+ }
+ // undef ^ X -> undef
+ break;
+
case Instruction::SDiv:
case Instruction::UDiv:
case Instruction::SRem:
@@ -1494,26 +1556,24 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
return true;
case Instruction::AShr:
- // undef >>s X -> undef. No change.
- if (Op0LV.isUndefined()) break;
-
- // X >>s undef -> X. X could be 0, X could have the high-bit known set.
- if (Op0LV.isConstant())
- markForcedConstant(I, Op0LV.getConstant());
- else
- markOverdefined(I);
+ // X >>a undef -> undef.
+ if (Op1LV.isUndefined()) break;
+
+ // undef >>a X -> all ones
+ markForcedConstant(I, Constant::getAllOnesValue(ITy));
return true;
case Instruction::LShr:
case Instruction::Shl:
- // undef >> X -> undef. No change.
- // undef << X -> undef. No change.
- if (Op0LV.isUndefined()) break;
-
- // X >> undef -> 0. X could be 0.
- // X << undef -> 0. X could be 0.
+ // X << undef -> undef.
+ // X >> undef -> undef.
+ if (Op1LV.isUndefined()) break;
+
+ // undef << X -> 0
+ // undef >> X -> 0
markForcedConstant(I, Constant::getNullValue(ITy));
return true;
case Instruction::Select:
+ Op1LV = getValueState(I->getOperand(1));
// undef ? X : Y -> X or Y. There could be commonality between X/Y.
if (Op0LV.isUndefined()) {
if (!Op1LV.isConstant()) // Pick the constant one if there is any.
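For quick reference, the shift-related rules introduced above reduce to the following table (a reading aid, not source text):

    instruction    undef shift amount    undef value operand
    shl, lshr      result stays undef    forced to 0
    ashr           result stays undef    forced to all-ones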
@@ -1533,9 +1593,35 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
else
markOverdefined(I);
return true;
+ case Instruction::Load:
+ // A load here means one of two things: a load of undef from a global,
+ // or a load from an unknown pointer. Either way, having it return undef
+ // is okay.
+ break;
+ case Instruction::ICmp:
+ // X == undef -> undef. Other comparisons get more complicated.
+ if (cast<ICmpInst>(I)->isEquality())
+ break;
+ markOverdefined(I);
+ return true;
case Instruction::Call:
- // If a call has an undef result, it is because it is constant foldable
- // but one of the inputs was undef. Just force the result to
+ case Instruction::Invoke: {
+ // There are two reasons a call can have an undef result:
+ // 1. It could be tracked.
+ // 2. It could be constant-foldable.
+ // Because of the way we solve return values, tracked calls must
+ // never be marked overdefined in ResolvedUndefsIn.
+ if (Function *F = CallSite(I).getCalledFunction())
+ if (TrackedRetVals.count(F))
+ break;
+
+ // If the call is constant-foldable, we mark it overdefined because
+ // we do not know what return values are valid.
+ markOverdefined(I);
+ return true;
+ }
+ default:
+ // If we don't know what should happen here, conservatively mark it
// overdefined.
markOverdefined(I);
return true;
@@ -1621,15 +1707,25 @@ FunctionPass *llvm::createSCCPPass() {
static void DeleteInstructionInBlock(BasicBlock *BB) {
DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
++NumDeadBlocks;
-
- // Delete the instructions backwards, as it has a reduced likelihood of
- // having to update as many def-use and use-def chains.
- while (!isa<TerminatorInst>(BB->begin())) {
- Instruction *I = --BasicBlock::iterator(BB->getTerminator());
-
- if (!I->use_empty())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- BB->getInstList().erase(I);
+
+ // Check to see if there are non-terminating instructions to delete.
+ if (isa<TerminatorInst>(BB->begin()))
+ return;
+
+ // Delete the instructions backwards, as doing so reduces the likelihood of
+ // having to update as many def-use and use-def chains.
+ Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+ while (EndInst != BB->begin()) {
+ // Delete the next to last instruction.
+ BasicBlock::iterator I = EndInst;
+ Instruction *Inst = --I;
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ if (isa<LandingPadInst>(Inst)) {
+ EndInst = Inst;
+ continue;
+ }
+ BB->getInstList().erase(Inst);
++NumInstRemoved;
}
}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 302c287..f6918de 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -63,7 +63,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeCFGSimplifyPassPass(Registry);
initializeSimplifyLibCallsPass(Registry);
initializeSinkingPass(Registry);
- initializeTailDupPass(Registry);
initializeTailCallElimPass(Registry);
}
@@ -187,3 +186,7 @@ void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createBasicAliasAnalysisPass());
}
+
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerExpectIntrinsicPass());
+}
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 7d6349c..c6d9123 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -129,11 +129,11 @@ namespace {
AllocaInfo &Info);
void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info);
void isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
- const Type *MemOpType, bool isStore, AllocaInfo &Info,
+ Type *MemOpType, bool isStore, AllocaInfo &Info,
Instruction *TheAccess, bool AllowWholeAccess);
- bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size);
- uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset,
- const Type *&IdxTy);
+ bool TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size);
+ uint64_t FindElementAndOffset(Type *&T, uint64_t &Offset,
+ Type *&IdxTy);
void DoScalarReplacement(AllocaInst *AI,
std::vector<AllocaInst*> &WorkList);
@@ -145,6 +145,9 @@ namespace {
SmallVector<AllocaInst*, 32> &NewElts);
void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
SmallVector<AllocaInst*, 32> &NewElts);
+ void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+ uint64_t Offset,
+ SmallVector<AllocaInst*, 32> &NewElts);
void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
AllocaInst *AI,
SmallVector<AllocaInst*, 32> &NewElts);
@@ -253,7 +256,7 @@ class ConvertToScalarInfo {
/// VectorTy - This tracks the type that we should promote the vector to if
/// it is possible to turn it into a vector. This starts out null, and if it
/// isn't possible to turn into a vector type, it gets set to VoidTy.
- const VectorType *VectorTy;
+ VectorType *VectorTy;
/// HadNonMemTransferAccess - True if there is at least one access to the
/// alloca that is not a MemTransferInst. We don't want to turn structs into
@@ -269,11 +272,11 @@ public:
private:
bool CanConvertToScalar(Value *V, uint64_t Offset);
- void MergeInTypeForLoadOrStore(const Type *In, uint64_t Offset);
- bool MergeInVectorType(const VectorType *VInTy, uint64_t Offset);
+ void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset);
+ bool MergeInVectorType(VectorType *VInTy, uint64_t Offset);
void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
- Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType,
+ Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType,
uint64_t Offset, IRBuilder<> &Builder);
Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal,
uint64_t Offset, IRBuilder<> &Builder);
@@ -295,8 +298,6 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
if (ScalarKind == Unknown)
ScalarKind = Integer;
- // FIXME: It should be possible to promote the vector type up to the alloca's
- // size.
if (ScalarKind == Vector && VectorTy->getBitWidth() != AllocaSize * 8)
ScalarKind = Integer;
@@ -306,7 +307,7 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
// random stuff that doesn't use vectors (e.g. <9 x double>) because then
// we just get a lot of insert/extracts. If at least one vector is
// involved, then we probably really do have a union of vector/array.
- const Type *NewTy;
+ Type *NewTy;
if (ScalarKind == Vector) {
assert(VectorTy && "Missing type for vector scalar.");
DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = "
@@ -331,20 +332,16 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) {
/// (VectorTy) so far at the offset specified by Offset (which is specified in
/// bytes).
///
-/// There are three cases we handle here:
+/// There are two cases we handle here:
/// 1) A union of vector types of the same size and potentially its elements.
/// Here we turn element accesses into insert/extract element operations.
/// This promotes a <4 x float> with a store of float to the third element
/// into a <4 x float> that uses insert element.
-/// 2) A union of vector types with power-of-2 size differences, e.g. a float,
-/// <2 x float> and <4 x float>. Here we turn element accesses into insert
-/// and extract element operations, and <2 x float> accesses into a cast to
-/// <2 x double>, an extract, and a cast back to <2 x float>.
-/// 3) A fully general blob of memory, which we turn into some (potentially
+/// 2) A fully general blob of memory, which we turn into some (potentially
/// large) integer type with extract and insert operations where the loads
/// and stores would mutate the memory. We mark this by setting VectorTy
/// to VoidTy.
-void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In,
+void ConvertToScalarInfo::MergeInTypeForLoadOrStore(Type *In,
uint64_t Offset) {
// If we already decided to turn this into a blob of integer memory, there is
// nothing to be done.
@@ -355,7 +352,7 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In,
// If the In type is a vector that is the same size as the alloca, see if it
// matches the existing VecTy.
- if (const VectorType *VInTy = dyn_cast<VectorType>(In)) {
+ if (VectorType *VInTy = dyn_cast<VectorType>(In)) {
if (MergeInVectorType(VInTy, Offset))
return;
} else if (In->isFloatTy() || In->isDoubleTy() ||
@@ -371,20 +368,13 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In,
// if the implied vector agrees with what we already have and if Offset is
// compatible with it.
if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 &&
- (!VectorTy || Offset * 8 < VectorTy->getPrimitiveSizeInBits())) {
+ (!VectorTy || EltSize == VectorTy->getElementType()
+ ->getPrimitiveSizeInBits()/8)) {
if (!VectorTy) {
ScalarKind = ImplicitVector;
VectorTy = VectorType::get(In, AllocaSize/EltSize);
- return;
}
-
- unsigned CurrentEltSize = VectorTy->getElementType()
- ->getPrimitiveSizeInBits()/8;
- if (EltSize == CurrentEltSize)
- return;
-
- if (In->isIntegerTy() && isPowerOf2_32(AllocaSize / EltSize))
- return;
+ return;
}
}
@@ -395,74 +385,21 @@ void ConvertToScalarInfo::MergeInTypeForLoadOrStore(const Type *In,
/// MergeInVectorType - Handles the vector case of MergeInTypeForLoadOrStore,
/// returning true if the type was successfully merged and false otherwise.
-bool ConvertToScalarInfo::MergeInVectorType(const VectorType *VInTy,
+bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy,
uint64_t Offset) {
- // TODO: Support nonzero offsets?
- if (Offset != 0)
- return false;
-
- // Only allow vectors that are a power-of-2 away from the size of the alloca.
- if (!isPowerOf2_64(AllocaSize / (VInTy->getBitWidth() / 8)))
- return false;
-
- // If this the first vector we see, remember the type so that we know the
- // element size.
- if (!VectorTy) {
+ if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) {
+ // If we're storing/loading a vector of the right size, allow it as a
+    // vector. If this is the first vector we see, remember the type so that
+ // we know the element size. If this is a subsequent access, ignore it
+ // even if it is a differing type but the same size. Worst case we can
+ // bitcast the resultant vectors.
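+    // For example (illustrative): on a 16-byte alloca, both <4 x float> and
+    // <2 x i64> accesses are accepted here; the non-matching accesses are
+    // later bitcast to the recorded VectorTy.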
+ if (!VectorTy)
+ VectorTy = VInTy;
ScalarKind = Vector;
- VectorTy = VInTy;
return true;
}
- unsigned BitWidth = VectorTy->getBitWidth();
- unsigned InBitWidth = VInTy->getBitWidth();
-
- // Vectors of the same size can be converted using a simple bitcast.
- if (InBitWidth == BitWidth && AllocaSize == (InBitWidth / 8)) {
- ScalarKind = Vector;
- return true;
- }
-
- const Type *ElementTy = VectorTy->getElementType();
- const Type *InElementTy = VInTy->getElementType();
-
- // Do not allow mixed integer and floating-point accesses from vectors of
- // different sizes.
- if (ElementTy->isFloatingPointTy() != InElementTy->isFloatingPointTy())
- return false;
-
- if (ElementTy->isFloatingPointTy()) {
- // Only allow floating-point vectors of different sizes if they have the
- // same element type.
- // TODO: This could be loosened a bit, but would anything benefit?
- if (ElementTy != InElementTy)
- return false;
-
- // There are no arbitrary-precision floating-point types, which limits the
- // number of legal vector types with larger element types that we can form
- // to bitcast and extract a subvector.
- // TODO: We could support some more cases with mixed fp128 and double here.
- if (!(BitWidth == 64 || BitWidth == 128) ||
- !(InBitWidth == 64 || InBitWidth == 128))
- return false;
- } else {
- assert(ElementTy->isIntegerTy() && "Vector elements must be either integer "
- "or floating-point.");
- unsigned BitWidth = ElementTy->getPrimitiveSizeInBits();
- unsigned InBitWidth = InElementTy->getPrimitiveSizeInBits();
-
- // Do not allow integer types smaller than a byte or types whose widths are
- // not a multiple of a byte.
- if (BitWidth < 8 || InBitWidth < 8 ||
- BitWidth % 8 != 0 || InBitWidth % 8 != 0)
- return false;
- }
-
- // Pick the largest of the two vector types.
- ScalarKind = Vector;
- if (InBitWidth > BitWidth)
- VectorTy = VInTy;
-
- return true;
+ return false;
}
/// CanConvertToScalar - V is a pointer. If we can convert the pointee and all
@@ -480,7 +417,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
// Don't break volatile or atomic loads.
- if (LI->isVolatile())
+ if (!LI->isSimple())
return false;
// Don't touch MMX operations.
if (LI->getType()->isX86_MMXTy())
@@ -492,7 +429,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
// Storing the pointer, not into the value?
- if (SI->getOperand(0) == V || SI->isVolatile()) return false;
+ if (SI->getOperand(0) == V || !SI->isSimple()) return false;
// Don't touch MMX operations.
if (SI->getOperand(0)->getType()->isX86_MMXTy())
return false;
@@ -502,7 +439,8 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
}
if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
- IsNotTrivial = true; // Can't be mem2reg'd.
+ if (!onlyUsedByLifetimeMarkers(BCI))
+ IsNotTrivial = true; // Can't be mem2reg'd.
if (!CanConvertToScalar(BCI, Offset))
return false;
continue;
@@ -516,7 +454,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
// Compute the offset that this GEP adds to the pointer.
SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
- &Indices[0], Indices.size());
+ Indices);
// See if all uses can be converted.
if (!CanConvertToScalar(GEP, Offset+GEPOffset))
return false;
@@ -560,6 +498,14 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) {
continue;
}
+ // If this is a lifetime intrinsic, we can handle it.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ continue;
+ }
+ }
+
// Otherwise, we cannot handle this!
return false;
}
@@ -589,7 +535,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// Compute the offset that this GEP adds to the pointer.
SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(),
- &Indices[0], Indices.size());
+ Indices);
ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
GEP->eraseFromParent();
continue;
@@ -599,7 +545,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
// The load is a bit extract from NewAI shifted right by Offset bits.
- Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp");
+ Value *LoadedVal = Builder.CreateLoad(NewAI);
Value *NewLoadVal
= ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder);
LI->replaceAllUsesWith(NewLoadVal);
@@ -668,8 +614,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
// pointer (bitcasted), then a store to our new alloca.
assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?");
Value *SrcPtr = MTI->getSource();
- const PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
- const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+ PointerType* SPTy = cast<PointerType>(SrcPtr->getType());
+ PointerType* AIPTy = cast<PointerType>(NewAI->getType());
if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
AIPTy = PointerType::get(AIPTy->getElementType(),
SPTy->getAddressSpace());
@@ -685,8 +631,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?");
LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval");
- const PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
- const PointerType* AIPTy = cast<PointerType>(NewAI->getType());
+ PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType());
+ PointerType* AIPTy = cast<PointerType>(NewAI->getType());
if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) {
AIPTy = PointerType::get(AIPTy->getElementType(),
DPTy->getAddressSpace());
@@ -703,65 +649,18 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI,
continue;
}
- llvm_unreachable("Unsupported operation!");
- }
-}
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ // There's no need to preserve these, as the resulting alloca will be
+        // converted to a register anyway.
+ II->eraseFromParent();
+ continue;
+ }
+ }
-/// getScaledElementType - Gets a scaled element type for a partial vector
-/// access of an alloca. The input types must be integer or floating-point
-/// scalar or vector types, and the resulting type is an integer, float or
-/// double.
-static const Type *getScaledElementType(const Type *Ty1, const Type *Ty2,
- unsigned NewBitWidth) {
- bool IsFP1 = Ty1->isFloatingPointTy() ||
- (Ty1->isVectorTy() &&
- cast<VectorType>(Ty1)->getElementType()->isFloatingPointTy());
- bool IsFP2 = Ty2->isFloatingPointTy() ||
- (Ty2->isVectorTy() &&
- cast<VectorType>(Ty2)->getElementType()->isFloatingPointTy());
-
- LLVMContext &Context = Ty1->getContext();
-
- // Prefer floating-point types over integer types, as integer types may have
- // been created by earlier scalar replacement.
- if (IsFP1 || IsFP2) {
- if (NewBitWidth == 32)
- return Type::getFloatTy(Context);
- if (NewBitWidth == 64)
- return Type::getDoubleTy(Context);
+ llvm_unreachable("Unsupported operation!");
}
-
- return Type::getIntNTy(Context, NewBitWidth);
-}
-
-/// CreateShuffleVectorCast - Creates a shuffle vector to convert one vector
-/// to another vector of the same element type which has the same allocation
-/// size but different primitive sizes (e.g. <3 x i32> and <4 x i32>).
-static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType,
- IRBuilder<> &Builder) {
- const Type *FromType = FromVal->getType();
- const VectorType *FromVTy = cast<VectorType>(FromType);
- const VectorType *ToVTy = cast<VectorType>(ToType);
- assert((ToVTy->getElementType() == FromVTy->getElementType()) &&
- "Vectors must have the same element type");
- Value *UnV = UndefValue::get(FromType);
- unsigned numEltsFrom = FromVTy->getNumElements();
- unsigned numEltsTo = ToVTy->getNumElements();
-
- SmallVector<Constant*, 3> Args;
- const Type* Int32Ty = Builder.getInt32Ty();
- unsigned minNumElts = std::min(numEltsFrom, numEltsTo);
- unsigned i;
- for (i=0; i != minNumElts; ++i)
- Args.push_back(ConstantInt::get(Int32Ty, i));
-
- if (i < numEltsTo) {
- Constant* UnC = UndefValue::get(Int32Ty);
- for (; i != numEltsTo; ++i)
- Args.push_back(UnC);
- }
- Constant *Mask = ConstantVector::get(Args);
- return Builder.CreateShuffleVector(FromVal, UnV, Mask, "tmpV");
}
/// ConvertScalar_ExtractValue - Extract a value of type ToType from an integer
@@ -775,50 +674,20 @@ static Value *CreateShuffleVectorCast(Value *FromVal, const Type *ToType,
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.
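+/// For instance (illustrative), on a little-endian target an i16 load at bit
+/// offset 16 of an i32-backed alloca becomes roughly:
+///   %shr = lshr i32 %FromVal, 16
+///   %res = trunc i32 %shr to i16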
Value *ConvertToScalarInfo::
-ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
+ConvertScalar_ExtractValue(Value *FromVal, Type *ToType,
uint64_t Offset, IRBuilder<> &Builder) {
// If the load is of the whole new alloca, no conversion is needed.
- const Type *FromType = FromVal->getType();
+ Type *FromType = FromVal->getType();
if (FromType == ToType && Offset == 0)
return FromVal;
// If the result alloca is a vector type, this is either an element
// access or a bitcast to another vector type of the same size.
- if (const VectorType *VTy = dyn_cast<VectorType>(FromType)) {
+ if (VectorType *VTy = dyn_cast<VectorType>(FromType)) {
unsigned FromTypeSize = TD.getTypeAllocSize(FromType);
unsigned ToTypeSize = TD.getTypeAllocSize(ToType);
- if (FromTypeSize == ToTypeSize) {
- // If the two types have the same primitive size, use a bit cast.
- // Otherwise, it is two vectors with the same element type that has
- // the same allocation size but different number of elements so use
- // a shuffle vector.
- if (FromType->getPrimitiveSizeInBits() ==
- ToType->getPrimitiveSizeInBits())
- return Builder.CreateBitCast(FromVal, ToType, "tmp");
- else
- return CreateShuffleVectorCast(FromVal, ToType, Builder);
- }
-
- if (isPowerOf2_64(FromTypeSize / ToTypeSize)) {
- assert(!(ToType->isVectorTy() && Offset != 0) && "Can't extract a value "
- "of a smaller vector type at a nonzero offset.");
-
- const Type *CastElementTy = getScaledElementType(FromType, ToType,
- ToTypeSize * 8);
- unsigned NumCastVectorElements = FromTypeSize / ToTypeSize;
-
- LLVMContext &Context = FromVal->getContext();
- const Type *CastTy = VectorType::get(CastElementTy,
- NumCastVectorElements);
- Value *Cast = Builder.CreateBitCast(FromVal, CastTy, "tmp");
-
- unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy);
- unsigned Elt = Offset/EltSize;
- assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
- Value *Extract = Builder.CreateExtractElement(Cast, ConstantInt::get(
- Type::getInt32Ty(Context), Elt), "tmp");
- return Builder.CreateBitCast(Extract, ToType, "tmp");
- }
+ if (FromTypeSize == ToTypeSize)
+ return Builder.CreateBitCast(FromVal, ToType);
// Otherwise it must be an element access.
unsigned Elt = 0;
@@ -828,40 +697,39 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
}
// Return the element extracted out of it.
- Value *V = Builder.CreateExtractElement(FromVal, ConstantInt::get(
- Type::getInt32Ty(FromVal->getContext()), Elt), "tmp");
+ Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt));
if (V->getType() != ToType)
- V = Builder.CreateBitCast(V, ToType, "tmp");
+ V = Builder.CreateBitCast(V, ToType);
return V;
}
// If ToType is a first class aggregate, extract out each of the pieces and
// use insertvalue's to form the FCA.
- if (const StructType *ST = dyn_cast<StructType>(ToType)) {
+ if (StructType *ST = dyn_cast<StructType>(ToType)) {
const StructLayout &Layout = *TD.getStructLayout(ST);
Value *Res = UndefValue::get(ST);
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i),
Offset+Layout.getElementOffsetInBits(i),
Builder);
- Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+ Res = Builder.CreateInsertValue(Res, Elt, i);
}
return Res;
}
- if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) {
uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
Value *Res = UndefValue::get(AT);
for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(),
Offset+i*EltSize, Builder);
- Res = Builder.CreateInsertValue(Res, Elt, i, "tmp");
+ Res = Builder.CreateInsertValue(Res, Elt, i);
}
return Res;
}
// Otherwise, this must be a union that was converted to an integer value.
- const IntegerType *NTy = cast<IntegerType>(FromVal->getType());
+ IntegerType *NTy = cast<IntegerType>(FromVal->getType());
// If this is a big-endian system and the load is narrower than the
// full alloca type, we need to do a shift to get the right bits.
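+  // For example (illustrative): on a big-endian target, loading the i8 at
+  // byte offset 0 of an i32-backed alloca requires an lshr by 24 before the
+  // truncation below.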
@@ -881,33 +749,31 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType,
// only some bits are used.
if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
FromVal = Builder.CreateLShr(FromVal,
- ConstantInt::get(FromVal->getType(),
- ShAmt), "tmp");
+ ConstantInt::get(FromVal->getType(), ShAmt));
else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
FromVal = Builder.CreateShl(FromVal,
- ConstantInt::get(FromVal->getType(),
- -ShAmt), "tmp");
+ ConstantInt::get(FromVal->getType(), -ShAmt));
// Finally, unconditionally truncate the integer to the right width.
unsigned LIBitWidth = TD.getTypeSizeInBits(ToType);
if (LIBitWidth < NTy->getBitWidth())
FromVal =
Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(),
- LIBitWidth), "tmp");
+ LIBitWidth));
else if (LIBitWidth > NTy->getBitWidth())
FromVal =
Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(),
- LIBitWidth), "tmp");
+ LIBitWidth));
// If the result is an integer, this is a trunc or bitcast.
if (ToType->isIntegerTy()) {
// Should be done.
} else if (ToType->isFloatingPointTy() || ToType->isVectorTy()) {
// Just do a bitcast, we know the sizes match up.
- FromVal = Builder.CreateBitCast(FromVal, ToType, "tmp");
+ FromVal = Builder.CreateBitCast(FromVal, ToType);
} else {
// Otherwise must be a pointer.
- FromVal = Builder.CreateIntToPtr(FromVal, ToType, "tmp");
+ FromVal = Builder.CreateIntToPtr(FromVal, ToType);
}
assert(FromVal->getType() == ToType && "Didn't convert right?");
return FromVal;
@@ -927,65 +793,30 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
uint64_t Offset, IRBuilder<> &Builder) {
// Convert the stored type to the actual type, shift it left to insert
// then 'or' into place.
- const Type *AllocaType = Old->getType();
+ Type *AllocaType = Old->getType();
LLVMContext &Context = Old->getContext();
- if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
+ if (VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy);
uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType());
// Changing the whole vector with memset or with an access of a different
// vector type?
- if (ValSize == VecSize) {
- // If the two types have the same primitive size, use a bit cast.
- // Otherwise, it is two vectors with the same element type that has
- // the same allocation size but different number of elements so use
- // a shuffle vector.
- if (VTy->getPrimitiveSizeInBits() ==
- SV->getType()->getPrimitiveSizeInBits())
- return Builder.CreateBitCast(SV, AllocaType, "tmp");
- else
- return CreateShuffleVectorCast(SV, VTy, Builder);
- }
-
- if (isPowerOf2_64(VecSize / ValSize)) {
- assert(!(SV->getType()->isVectorTy() && Offset != 0) && "Can't insert a "
- "value of a smaller vector type at a nonzero offset.");
-
- const Type *CastElementTy = getScaledElementType(VTy, SV->getType(),
- ValSize);
- unsigned NumCastVectorElements = VecSize / ValSize;
-
- LLVMContext &Context = SV->getContext();
- const Type *OldCastTy = VectorType::get(CastElementTy,
- NumCastVectorElements);
- Value *OldCast = Builder.CreateBitCast(Old, OldCastTy, "tmp");
-
- Value *SVCast = Builder.CreateBitCast(SV, CastElementTy, "tmp");
-
- unsigned EltSize = TD.getTypeAllocSizeInBits(CastElementTy);
- unsigned Elt = Offset/EltSize;
- assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
- Value *Insert =
- Builder.CreateInsertElement(OldCast, SVCast, ConstantInt::get(
- Type::getInt32Ty(Context), Elt), "tmp");
- return Builder.CreateBitCast(Insert, AllocaType, "tmp");
- }
+ if (ValSize == VecSize)
+ return Builder.CreateBitCast(SV, AllocaType);
// Must be an element insertion.
assert(SV->getType() == VTy->getElementType());
uint64_t EltSize = TD.getTypeAllocSizeInBits(VTy->getElementType());
unsigned Elt = Offset/EltSize;
- return Builder.CreateInsertElement(Old, SV,
- ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt),
- "tmp");
+ return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt));
}
// If SV is a first-class aggregate value, insert each value recursively.
- if (const StructType *ST = dyn_cast<StructType>(SV->getType())) {
+ if (StructType *ST = dyn_cast<StructType>(SV->getType())) {
const StructLayout &Layout = *TD.getStructLayout(ST);
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
- Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+ Value *Elt = Builder.CreateExtractValue(SV, i);
Old = ConvertScalar_InsertValue(Elt, Old,
Offset+Layout.getElementOffsetInBits(i),
Builder);
@@ -993,10 +824,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
return Old;
}
- if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) {
uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType());
for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
- Value *Elt = Builder.CreateExtractValue(SV, i, "tmp");
+ Value *Elt = Builder.CreateExtractValue(SV, i);
Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder);
}
return Old;
@@ -1009,20 +840,19 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
unsigned SrcStoreWidth = TD.getTypeStoreSizeInBits(SV->getType());
unsigned DestStoreWidth = TD.getTypeStoreSizeInBits(AllocaType);
if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
- SV = Builder.CreateBitCast(SV,
- IntegerType::get(SV->getContext(),SrcWidth), "tmp");
+ SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
else if (SV->getType()->isPointerTy())
- SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()), "tmp");
+ SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()));
// Zero extend or truncate the value if needed.
if (SV->getType() != AllocaType) {
if (SV->getType()->getPrimitiveSizeInBits() <
AllocaType->getPrimitiveSizeInBits())
- SV = Builder.CreateZExt(SV, AllocaType, "tmp");
+ SV = Builder.CreateZExt(SV, AllocaType);
else {
// Truncation may be needed if storing more than the alloca can hold
// (undefined behavior).
- SV = Builder.CreateTrunc(SV, AllocaType, "tmp");
+ SV = Builder.CreateTrunc(SV, AllocaType);
SrcWidth = DestWidth;
SrcStoreWidth = DestStoreWidth;
}
@@ -1045,12 +875,10 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
// only some bits in the structure are set.
APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
- SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(),
- ShAmt), "tmp");
+ SV = Builder.CreateShl(SV, ConstantInt::get(SV->getType(), ShAmt));
Mask <<= ShAmt;
} else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
- SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(),
- -ShAmt), "tmp");
+ SV = Builder.CreateLShr(SV, ConstantInt::get(SV->getType(), -ShAmt));
Mask = Mask.lshr(-ShAmt);
}
@@ -1196,7 +1024,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end();
UI != UE; ++UI) {
LoadInst *LI = dyn_cast<LoadInst>(*UI);
- if (LI == 0 || LI->isVolatile()) return false;
+ if (LI == 0 || !LI->isSimple()) return false;
// Both operands to the select need to be dereferenceable, either absolutely
// (e.g. allocas) or at this point because we can see other accesses to it.
@@ -1237,7 +1065,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
UI != UE; ++UI) {
LoadInst *LI = dyn_cast<LoadInst>(*UI);
- if (LI == 0 || LI->isVolatile()) return false;
+ if (LI == 0 || !LI->isSimple()) return false;
// For now we only allow loads in the same block as the PHI. This is a
// common case that happens when instcombine merges two loads through a PHI.
@@ -1258,17 +1086,21 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
// trapping load in the predecessor if it is a critical edge.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *Pred = PN->getIncomingBlock(i);
+ Value *InVal = PN->getIncomingValue(i);
+
+ // If the terminator of the predecessor has side-effects (an invoke),
+ // there is no safe place to put a load in the predecessor.
+ if (Pred->getTerminator()->mayHaveSideEffects())
+ return false;
+
+ // If the value is produced by the terminator of the predecessor
+ // (an invoke), there is no valid place to put a load in the predecessor.
+ if (Pred->getTerminator() == InVal)
+ return false;
// If the predecessor has a single successor, then the edge isn't critical.
if (Pred->getTerminator()->getNumSuccessors() == 1)
continue;
-
- Value *InVal = PN->getIncomingValue(i);
-
- // If the InVal is an invoke in the pred, we can't put a load on the edge.
- if (InvokeInst *II = dyn_cast<InvokeInst>(InVal))
- if (II->getParent() == Pred)
- return false;
// If this pointer is always safe to load, or if we can prove that there is
// already a load in the block, then we can move the load to the pred block.
@@ -1295,13 +1127,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
UI != UE; ++UI) {
User *U = *UI;
if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (LI->isVolatile())
+ if (!LI->isSimple())
return false;
continue;
}
if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getOperand(0) == AI || SI->isVolatile())
+ if (SI->getOperand(0) == AI || !SI->isSimple())
return false; // Don't allow a store OF the AI, only INTO the AI.
continue;
}
@@ -1343,6 +1175,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
continue;
}
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ if (onlyUsedByLifetimeMarkers(BCI)) {
+ InstsToRewrite.insert(BCI);
+ continue;
+ }
+ }
+
return false;
}
@@ -1354,6 +1193,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
// If we have instructions that need to be rewritten for this to be promotable
// take care of it now.
for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) {
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(InstsToRewrite[i])) {
+ // This could only be a bitcast used by nothing but lifetime intrinsics.
+ for (BitCastInst::use_iterator I = BCI->use_begin(), E = BCI->use_end();
+ I != E;) {
+ Use &U = I.getUse();
+ ++I;
+ cast<Instruction>(U.getUser())->eraseFromParent();
+ }
+ BCI->eraseFromParent();
+ continue;
+ }
+
if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) {
// Selects in InstsToRewrite only have load uses. Rewrite each as two
// loads with a new select.
@@ -1393,7 +1244,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
continue;
}
- const Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
+ Type *LoadTy = cast<PointerType>(PN->getType())->getElementType();
PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(),
PN->getName()+".ld", PN);
@@ -1483,13 +1334,13 @@ bool SROA::performPromotion(Function &F) {
/// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for
/// SROA. It must be a struct or array type with a small number of elements.
static bool ShouldAttemptScalarRepl(AllocaInst *AI) {
- const Type *T = AI->getAllocatedType();
+ Type *T = AI->getAllocatedType();
// Do not promote any struct into more than 32 separate vars.
- if (const StructType *ST = dyn_cast<StructType>(T))
+ if (StructType *ST = dyn_cast<StructType>(T))
return ST->getNumElements() <= 32;
// Arrays are much less likely to be safe for SROA; only consider
// them if they are very small.
- if (const ArrayType *AT = dyn_cast<ArrayType>(T))
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
return AT->getNumElements() <= 8;
return false;
}
@@ -1594,7 +1445,7 @@ void SROA::DoScalarReplacement(AllocaInst *AI,
std::vector<AllocaInst*> &WorkList) {
DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n');
SmallVector<AllocaInst*, 32> ElementAllocas;
- if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+ if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
ElementAllocas.reserve(ST->getNumContainedTypes());
for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0,
@@ -1604,9 +1455,9 @@ void SROA::DoScalarReplacement(AllocaInst *AI,
WorkList.push_back(NA); // Add to worklist for recursive processing
}
} else {
- const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+ ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
ElementAllocas.reserve(AT->getNumElements());
- const Type *ElTy = AT->getElementType();
+ Type *ElTy = AT->getElementType();
for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
AI->getName() + "." + Twine(i), AI);
@@ -1670,22 +1521,26 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset,
UI.getOperandNo() == 0, Info, MI,
true /*AllowWholeAccess*/);
} else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- if (LI->isVolatile())
+ if (!LI->isSimple())
return MarkUnsafe(Info, User);
- const Type *LIType = LI->getType();
+ Type *LIType = LI->getType();
isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
LIType, false, Info, LI, true /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
} else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
// Store is ok if storing INTO the pointer, not storing the pointer
- if (SI->isVolatile() || SI->getOperand(0) == I)
+ if (!SI->isSimple() || SI->getOperand(0) == I)
return MarkUnsafe(Info, User);
- const Type *SIType = SI->getOperand(0)->getType();
+ Type *SIType = SI->getOperand(0)->getType();
isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
SIType, true, Info, SI, true /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return MarkUnsafe(Info, User);
} else if (isa<PHINode>(User) || isa<SelectInst>(User)) {
isSafePHISelectUseForScalarRepl(User, Offset, Info);
} else {
@@ -1725,19 +1580,19 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset,
return MarkUnsafe(Info, User);
isSafePHISelectUseForScalarRepl(GEPI, Offset, Info);
} else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- if (LI->isVolatile())
+ if (!LI->isSimple())
return MarkUnsafe(Info, User);
- const Type *LIType = LI->getType();
+ Type *LIType = LI->getType();
isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType),
LIType, false, Info, LI, false /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
} else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
// Store is ok if storing INTO the pointer, not storing the pointer
- if (SI->isVolatile() || SI->getOperand(0) == I)
+ if (!SI->isSimple() || SI->getOperand(0) == I)
return MarkUnsafe(Info, User);
- const Type *SIType = SI->getOperand(0)->getType();
+ Type *SIType = SI->getOperand(0)->getType();
isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType),
SIType, true, Info, SI, false /*AllowWholeAccess*/);
Info.hasALoadOrStore = true;
@@ -1776,8 +1631,7 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
// Compute the offset due to this GEP and check if the alloca has a
// component element at that offset.
SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
- Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
- &Indices[0], Indices.size());
+ Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0))
MarkUnsafe(Info, GEPI);
}
@@ -1786,14 +1640,14 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI,
/// elements of the same type (which is always true for arrays). If so,
/// return true with NumElts and EltTy set to the number of elements and the
/// element type, respectively.
-static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts,
- const Type *&EltTy) {
- if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
+static bool isHomogeneousAggregate(Type *T, unsigned &NumElts,
+ Type *&EltTy) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
NumElts = AT->getNumElements();
EltTy = (NumElts == 0 ? 0 : AT->getElementType());
return true;
}
- if (const StructType *ST = dyn_cast<StructType>(T)) {
+ if (StructType *ST = dyn_cast<StructType>(T)) {
NumElts = ST->getNumContainedTypes();
EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0));
for (unsigned n = 1; n < NumElts; ++n) {
@@ -1807,12 +1661,12 @@ static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts,
/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are
/// "homogeneous" aggregates with the same element type and number of elements.
-static bool isCompatibleAggregate(const Type *T1, const Type *T2) {
+static bool isCompatibleAggregate(Type *T1, Type *T2) {
if (T1 == T2)
return true;
unsigned NumElts1, NumElts2;
- const Type *EltTy1, *EltTy2;
+ Type *EltTy1, *EltTy2;
if (isHomogeneousAggregate(T1, NumElts1, EltTy1) &&
isHomogeneousAggregate(T2, NumElts2, EltTy2) &&
NumElts1 == NumElts2 &&
@@ -1830,7 +1684,7 @@ static bool isCompatibleAggregate(const Type *T1, const Type *T2) {
/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a
/// unit. If false, it only allows accesses known to be in a single element.
void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
- const Type *MemOpType, bool isStore,
+ Type *MemOpType, bool isStore,
AllocaInfo &Info, Instruction *TheAccess,
bool AllowWholeAccess) {
// Check if this is a load/store of the entire alloca.
@@ -1857,7 +1711,7 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
}
}
// Check if the offset/size correspond to a component within the alloca type.
- const Type *T = Info.AI->getAllocatedType();
+ Type *T = Info.AI->getAllocatedType();
if (TypeHasComponent(T, Offset, MemSize)) {
Info.hasSubelementAccess = true;
return;
@@ -1868,16 +1722,16 @@ void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize,
/// TypeHasComponent - Return true if T has a component type with the
/// specified offset and size. If Size is zero, do not check the size.
-bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) {
- const Type *EltTy;
+bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
+ Type *EltTy;
uint64_t EltSize;
- if (const StructType *ST = dyn_cast<StructType>(T)) {
+ if (StructType *ST = dyn_cast<StructType>(T)) {
const StructLayout *Layout = TD->getStructLayout(ST);
unsigned EltIdx = Layout->getElementContainingOffset(Offset);
EltTy = ST->getContainedType(EltIdx);
EltSize = TD->getTypeAllocSize(EltTy);
Offset -= Layout->getElementOffset(EltIdx);
- } else if (const ArrayType *AT = dyn_cast<ArrayType>(T)) {
+ } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) {
EltTy = AT->getElementType();
EltSize = TD->getTypeAllocSize(EltTy);
if (Offset >= AT->getNumElements() * EltSize)
@@ -1924,9 +1778,17 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
// address operand will be updated, so nothing else needs to be done.
continue;
}
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(User)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ RewriteLifetimeIntrinsic(II, AI, Offset, NewElts);
+ }
+ continue;
+ }
if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- const Type *LIType = LI->getType();
+ Type *LIType = LI->getType();
if (isCompatibleAggregate(LIType, AI->getAllocatedType())) {
// Replace:
@@ -1956,7 +1818,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
Value *Val = SI->getOperand(0);
- const Type *SIType = Val->getType();
+ Type *SIType = Val->getType();
if (isCompatibleAggregate(SIType, AI->getAllocatedType())) {
// Replace:
// store { i32, i32 } %val, { i32, i32 }* %alloc
@@ -2026,10 +1888,10 @@ void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
/// Sets T to the type of the element and Offset to the offset within that
/// element. IdxTy is set to the type of the index result to be used in a
/// GEP instruction.
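+/// For example (illustrative): for T = { i32, [4 x i8] } and Offset = 6 this
+/// returns 1 and sets T to [4 x i8], Offset to 2 and IdxTy to i32.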
-uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset,
- const Type *&IdxTy) {
+uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
+ Type *&IdxTy) {
uint64_t Idx = 0;
- if (const StructType *ST = dyn_cast<StructType>(T)) {
+ if (StructType *ST = dyn_cast<StructType>(T)) {
const StructLayout *Layout = TD->getStructLayout(ST);
Idx = Layout->getElementContainingOffset(Offset);
T = ST->getContainedType(Idx);
@@ -2037,7 +1899,7 @@ uint64_t SROA::FindElementAndOffset(const Type *&T, uint64_t &Offset,
IdxTy = Type::getInt32Ty(T->getContext());
return Idx;
}
- const ArrayType *AT = cast<ArrayType>(T);
+ ArrayType *AT = cast<ArrayType>(T);
T = AT->getElementType();
uint64_t EltSize = TD->getTypeAllocSize(T);
Idx = Offset / EltSize;
@@ -2053,13 +1915,12 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
SmallVector<AllocaInst*, 32> &NewElts) {
uint64_t OldOffset = Offset;
SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
- Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(),
- &Indices[0], Indices.size());
+ Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices);
RewriteForScalarRepl(GEPI, AI, Offset, NewElts);
- const Type *T = AI->getAllocatedType();
- const Type *IdxTy;
+ Type *T = AI->getAllocatedType();
+ Type *IdxTy;
uint64_t OldIdx = FindElementAndOffset(T, OldOffset, IdxTy);
if (GEPI->getOperand(0) == AI)
OldIdx = ~0ULL; // Force the GEP to be rewritten.
@@ -2073,7 +1934,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
if (Idx == OldIdx)
return;
- const Type *i32Ty = Type::getInt32Ty(AI->getContext());
+ Type *i32Ty = Type::getInt32Ty(AI->getContext());
SmallVector<Value*, 8> NewArgs;
NewArgs.push_back(Constant::getNullValue(i32Ty));
while (EltOffset != 0) {
@@ -2082,8 +1943,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
}
Instruction *Val = NewElts[Idx];
if (NewArgs.size() > 1) {
- Val = GetElementPtrInst::CreateInBounds(Val, NewArgs.begin(),
- NewArgs.end(), "", GEPI);
+ Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI);
Val->takeName(GEPI);
}
if (Val->getType() != GEPI->getType())
@@ -2092,6 +1952,62 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
DeadInsts.push_back(GEPI);
}
+/// RewriteLifetimeIntrinsic - II is a lifetime.start/lifetime.end. Rewrite it
+/// to mark the lifetime of the scalarized memory.
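+/// For example (illustrative): a lifetime.start of 8 bytes over an alloca of
+/// type { i32, i32 } that was split into two i32 allocas becomes a
+/// lifetime.start of 4 bytes on each of the element allocas.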
+void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
+ uint64_t Offset,
+ SmallVector<AllocaInst*, 32> &NewElts) {
+ ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0));
+ // Put matching lifetime markers on everything from Offset up to
+ // Offset+OldSize.
+ Type *AIType = AI->getAllocatedType();
+ uint64_t NewOffset = Offset;
+ Type *IdxTy;
+ uint64_t Idx = FindElementAndOffset(AIType, NewOffset, IdxTy);
+
+ IRBuilder<> Builder(II);
+ uint64_t Size = OldSize->getLimitedValue();
+
+ if (NewOffset) {
+ // Splice the first element and index 'NewOffset' bytes in. SROA will
+ // split the alloca again later.
+ Value *V = Builder.CreateBitCast(NewElts[Idx], Builder.getInt8PtrTy());
+ V = Builder.CreateGEP(V, Builder.getInt64(NewOffset));
+
+ IdxTy = NewElts[Idx]->getAllocatedType();
+ uint64_t EltSize = TD->getTypeAllocSize(IdxTy) - NewOffset;
+ if (EltSize > Size) {
+ EltSize = Size;
+ Size = 0;
+ } else {
+ Size -= EltSize;
+ }
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ Builder.CreateLifetimeStart(V, Builder.getInt64(EltSize));
+ else
+ Builder.CreateLifetimeEnd(V, Builder.getInt64(EltSize));
+ ++Idx;
+ }
+
+ for (; Idx != NewElts.size() && Size; ++Idx) {
+ IdxTy = NewElts[Idx]->getAllocatedType();
+ uint64_t EltSize = TD->getTypeAllocSize(IdxTy);
+ if (EltSize > Size) {
+ EltSize = Size;
+ Size = 0;
+ } else {
+ Size -= EltSize;
+ }
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ Builder.CreateLifetimeStart(NewElts[Idx],
+ Builder.getInt64(EltSize));
+ else
+ Builder.CreateLifetimeEnd(NewElts[Idx],
+ Builder.getInt64(EltSize));
+ }
+ DeadInsts.push_back(II);
+}
+
/// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
/// Rewrite it to copy or set the elements of the scalarized memory.
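+/// For example (illustrative): a memset of 8 zero bytes over a { i32, i32 }
+/// alloca split into two i32 allocas becomes a store of i32 0 to each one.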
void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
@@ -2139,7 +2055,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
// If the pointer is not the right type, insert a bitcast to the right
// type.
- const Type *NewTy =
+ Type *NewTy =
PointerType::get(AI->getType()->getElementType(), AddrSpace);
if (OtherPtr->getType() != NewTy)
@@ -2159,16 +2075,16 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
if (OtherPtr) {
Value *Idx[2] = { Zero,
ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) };
- OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx, Idx + 2,
+ OtherElt = GetElementPtrInst::CreateInBounds(OtherPtr, Idx,
OtherPtr->getName()+"."+Twine(i),
MI);
uint64_t EltOffset;
- const PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
- const Type *OtherTy = OtherPtrTy->getElementType();
- if (const StructType *ST = dyn_cast<StructType>(OtherTy)) {
+ PointerType *OtherPtrTy = cast<PointerType>(OtherPtr->getType());
+ Type *OtherTy = OtherPtrTy->getElementType();
+ if (StructType *ST = dyn_cast<StructType>(OtherTy)) {
EltOffset = TD->getStructLayout(ST)->getElementOffset(i);
} else {
- const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
+ Type *EltTy = cast<SequentialType>(OtherTy)->getElementType();
EltOffset = TD->getTypeAllocSize(EltTy)*i;
}
@@ -2181,7 +2097,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
}
Value *EltPtr = NewElts[i];
- const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
+ Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType();
// If we got down to a scalar, insert a load or store as appropriate.
if (EltTy->isSingleValueType()) {
@@ -2207,7 +2123,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
StoreVal = Constant::getNullValue(EltTy); // 0.0, null, 0, <0,0>
} else {
// If EltTy is a vector type, get the element type.
- const Type *ValTy = EltTy->getScalarType();
+ Type *ValTy = EltTy->getScalarType();
// Construct an integer with the right value.
unsigned EltSize = TD->getTypeSizeInBits(ValTy);
@@ -2228,8 +2144,8 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
assert(StoreVal->getType() == ValTy && "Type mismatch!");
// If the requested value was a vector constant, create it.
- if (EltTy != ValTy) {
- unsigned NumElts = cast<VectorType>(ValTy)->getNumElements();
+ if (EltTy->isVectorTy()) {
+ unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
SmallVector<Constant*, 16> Elts(NumElts, StoreVal);
StoreVal = ConstantVector::get(Elts);
}
@@ -2271,7 +2187,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
// Extract each element out of the integer according to its structure offset
// and store the element value to the individual alloca.
Value *SrcVal = SI->getOperand(0);
- const Type *AllocaEltTy = AI->getAllocatedType();
+ Type *AllocaEltTy = AI->getAllocatedType();
uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
IRBuilder<> Builder(SI);
@@ -2286,12 +2202,12 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
// There are two forms here: AI could be an array or struct. Both cases
// have different ways to compute the element offset.
- if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+ if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
const StructLayout *Layout = TD->getStructLayout(EltSTy);
for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
// Get the number of bits to shift SrcVal to get the value.
- const Type *FieldTy = EltSTy->getElementType(i);
+ Type *FieldTy = EltSTy->getElementType(i);
uint64_t Shift = Layout->getElementOffsetInBits(i);
if (TD->isBigEndian())
@@ -2327,8 +2243,8 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
}
} else {
- const ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
- const Type *ArrayEltTy = ATy->getElementType();
+ ArrayType *ATy = cast<ArrayType>(AllocaEltTy);
+ Type *ArrayEltTy = ATy->getElementType();
uint64_t ElementOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy);
@@ -2384,7 +2300,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
SmallVector<AllocaInst*, 32> &NewElts) {
// Extract each element out of the NewElts according to its structure offset
// and form the result value.
- const Type *AllocaEltTy = AI->getAllocatedType();
+ Type *AllocaEltTy = AI->getAllocatedType();
uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy);
DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI
@@ -2394,10 +2310,10 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
// have different ways to compute the element offset.
const StructLayout *Layout = 0;
uint64_t ArrayEltBitOffset = 0;
- if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
+ if (StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) {
Layout = TD->getStructLayout(EltSTy);
} else {
- const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
+ Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType();
ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy);
}
@@ -2408,14 +2324,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
// Load the value from the alloca. If the NewElt is an aggregate, cast
// the pointer to an integer of the same size before doing the load.
Value *SrcField = NewElts[i];
- const Type *FieldTy =
+ Type *FieldTy =
cast<PointerType>(SrcField->getType())->getElementType();
uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy);
// Ignore zero sized fields like {}, they obviously contain no data.
if (FieldSizeBits == 0) continue;
- const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
+ IntegerType *FieldIntTy = IntegerType::get(LI->getContext(),
FieldSizeBits);
if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() &&
!FieldTy->isVectorTy())
@@ -2468,14 +2384,14 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
/// HasPadding - Return true if the specified type has any structure or
/// alignment padding in between the elements that would be split apart
/// by SROA; return false otherwise.
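+/// For example (illustrative): { i8, i32 } normally has three bytes of
+/// padding after the i8 field, so HasPadding returns true for it.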
-static bool HasPadding(const Type *Ty, const TargetData &TD) {
- if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+static bool HasPadding(Type *Ty, const TargetData &TD) {
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Ty = ATy->getElementType();
return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
}
// SROA currently handles only Arrays and Structs.
- const StructType *STy = cast<StructType>(Ty);
+ StructType *STy = cast<StructType>(Ty);
const StructLayout *SL = TD.getStructLayout(STy);
unsigned PrevFieldBitOffset = 0;
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
@@ -2530,7 +2446,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
// and fusion code.
if (!Info.hasSubelementAccess && Info.hasALoadOrStore) {
// If the struct/array just has one element, use basic SRoA.
- if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+ if (StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
if (ST->getNumElements() > 1) return false;
} else {
if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1)
@@ -2576,7 +2492,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
// Simple (non-volatile, non-atomic) loads are always ok; ignore them.
- if (LI->isVolatile()) return false;
+ if (!LI->isSimple()) return false;
continue;
}
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 7c415e5..fbb9465 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -134,7 +134,7 @@ namespace {
struct StrCatOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Verify the "strcat" function prototype.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
FT->getReturnType() != B.getInt8PtrTy() ||
FT->getParamType(0) != FT->getReturnType() ||
@@ -184,7 +184,7 @@ struct StrCatOpt : public LibCallOptimization {
struct StrNCatOpt : public StrCatOpt {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Verify the "strncat" function prototype.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 3 ||
FT->getReturnType() != B.getInt8PtrTy() ||
FT->getParamType(0) != FT->getReturnType() ||
@@ -232,7 +232,7 @@ struct StrNCatOpt : public StrCatOpt {
struct StrChrOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Verify the "strchr" function prototype.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
FT->getReturnType() != B.getInt8PtrTy() ||
FT->getParamType(0) != FT->getReturnType() ||
@@ -282,7 +282,7 @@ struct StrChrOpt : public LibCallOptimization {
struct StrRChrOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Verify the "strrchr" function prototype.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
FT->getReturnType() != B.getInt8PtrTy() ||
FT->getParamType(0) != FT->getReturnType() ||
@@ -323,7 +323,7 @@ struct StrRChrOpt : public LibCallOptimization {
struct StrCmpOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Verify the "strcmp" function prototype.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
!FT->getReturnType()->isIntegerTy(32) ||
FT->getParamType(0) != FT->getParamType(1) ||
@@ -338,16 +338,17 @@ struct StrCmpOpt : public LibCallOptimization {
bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
- if (HasStr1 && Str1.empty()) // strcmp("", x) -> *x
- return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
-
- if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
- return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
-
// strcmp(x, y) -> cnst (if both x and y are constant strings)
if (HasStr1 && HasStr2)
return ConstantInt::get(CI->getType(),
- strcmp(Str1.c_str(),Str2.c_str()));
+ StringRef(Str1).compare(Str2));
+
+ if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+ return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+ CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+ return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
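+    // (The empty string contributes only '\0', so the comparison reduces to
+    // the first byte of the other string, negated when "" is the left
+    // operand.)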
// strcmp(P, "x") -> memcmp(P, "x", 2)
uint64_t Len1 = GetStringLength(Str1P);
@@ -371,7 +372,7 @@ struct StrCmpOpt : public LibCallOptimization {
struct StrNCmpOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Verify the "strncmp" function prototype.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 3 ||
!FT->getReturnType()->isIntegerTy(32) ||
FT->getParamType(0) != FT->getParamType(1) ||
@@ -400,16 +401,20 @@ struct StrNCmpOpt : public LibCallOptimization {
bool HasStr1 = GetConstantStringInfo(Str1P, Str1);
bool HasStr2 = GetConstantStringInfo(Str2P, Str2);
- if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> *x
- return B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"), CI->getType());
+ // strncmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2) {
+ StringRef SubStr1 = StringRef(Str1).substr(0, Length);
+ StringRef SubStr2 = StringRef(Str2).substr(0, Length);
+ return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+ }
+
+ if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
+ return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+ CI->getType()));
if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
- // strncmp(x, y) -> cnst (if both x and y are constant strings)
- if (HasStr1 && HasStr2)
- return ConstantInt::get(CI->getType(),
- strncmp(Str1.c_str(), Str2.c_str(), Length));
return 0;
}
};
@@ -426,7 +431,7 @@ struct StrCpyOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Verify the "strcpy" function prototype.
unsigned NumParams = OptChkCall ? 3 : 2;
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != NumParams ||
FT->getReturnType() != FT->getParamType(0) ||
FT->getParamType(0) != FT->getParamType(1) ||
@@ -462,7 +467,7 @@ struct StrCpyOpt : public LibCallOptimization {
struct StrNCpyOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
FT->getParamType(0) != FT->getParamType(1) ||
FT->getParamType(0) != B.getInt8PtrTy() ||
@@ -511,7 +516,7 @@ struct StrNCpyOpt : public LibCallOptimization {
struct StrLenOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 1 ||
FT->getParamType(0) != B.getInt8PtrTy() ||
!FT->getReturnType()->isIntegerTy())
@@ -537,7 +542,7 @@ struct StrLenOpt : public LibCallOptimization {
struct StrPBrkOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
FT->getParamType(0) != B.getInt8PtrTy() ||
FT->getParamType(1) != FT->getParamType(0) ||
@@ -575,7 +580,7 @@ struct StrPBrkOpt : public LibCallOptimization {
struct StrToOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
!FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy())
@@ -597,7 +602,7 @@ struct StrToOpt : public LibCallOptimization {
struct StrSpnOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
FT->getParamType(0) != B.getInt8PtrTy() ||
FT->getParamType(1) != FT->getParamType(0) ||
@@ -626,7 +631,7 @@ struct StrSpnOpt : public LibCallOptimization {
struct StrCSpnOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
FT->getParamType(0) != B.getInt8PtrTy() ||
FT->getParamType(1) != FT->getParamType(0) ||
@@ -658,7 +663,7 @@ struct StrCSpnOpt : public LibCallOptimization {
struct StrStrOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 ||
!FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy() ||
@@ -722,7 +727,7 @@ struct StrStrOpt : public LibCallOptimization {
struct MemCmpOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy() ||
!FT->getReturnType()->isIntegerTy(32))
@@ -773,7 +778,7 @@ struct MemCpyOpt : public LibCallOptimization {
// These optimizations require TargetData.
if (!TD) return 0;
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy() ||
@@ -795,7 +800,7 @@ struct MemMoveOpt : public LibCallOptimization {
// These optimizations require TargetData.
if (!TD) return 0;
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy() ||
@@ -817,7 +822,7 @@ struct MemSetOpt : public LibCallOptimization {
// These optimizations require TargetData.
if (!TD) return 0;
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isIntegerTy() ||
@@ -840,7 +845,7 @@ struct MemSetOpt : public LibCallOptimization {
struct PowOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 2 arguments of the same FP type, which match the
// result type.
if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
@@ -874,8 +879,8 @@ struct PowOpt : public LibCallOptimization {
Callee->getAttributes());
Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B,
Callee->getAttributes());
- Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf, "tmp");
- Value *Sel = B.CreateSelect(FCmp, Inf, FAbs, "tmp");
+ Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
+ Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
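+    // Per C99 Annex F, pow(-0.0, 0.5) is +0.0 and pow(-inf, 0.5) is +inf,
+    // while sqrt gives -0.0 and NaN respectively; the fabs and select above
+    // restore the pow results.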
return Sel;
}
@@ -895,7 +900,7 @@ struct PowOpt : public LibCallOptimization {
struct Exp2Opt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 1 argument of FP type, which matches the
// result type.
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
@@ -908,10 +913,10 @@ struct Exp2Opt : public LibCallOptimization {
Value *LdExpArg = 0;
if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
- LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty(), "tmp");
+ LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
} else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
- LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty(), "tmp");
+ LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
}
if (LdExpArg) {
@@ -946,7 +951,7 @@ struct Exp2Opt : public LibCallOptimization {
struct UnaryDoubleFPOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
!FT->getParamType(0)->isDoubleTy())
return 0;
@@ -973,7 +978,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization {
struct FFSOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
// Just make sure this has one argument of integer type (the ffs
// family always returns i32).
if (FT->getNumParams() != 1 ||
@@ -996,10 +1001,10 @@ struct FFSOpt : public LibCallOptimization {
Value *F = Intrinsic::getDeclaration(Callee->getParent(),
Intrinsic::cttz, ArgType);
Value *V = B.CreateCall(F, Op, "cttz");
- V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp");
- V = B.CreateIntCast(V, B.getInt32Ty(), false, "tmp");
+ V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+ V = B.CreateIntCast(V, B.getInt32Ty(), false);
- Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp");
+ Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
return B.CreateSelect(Cond, V, B.getInt32(0));
}
};
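
The dropped "tmp" names in the hunks above follow the 3.0 IRBuilder convention: omitting the Name argument leaves the value unnamed (printed as %1, %2, ...), saving a string allocation per instruction. A minimal sketch under that convention, with an illustrative helper name:

    #include "llvm/Constants.h"
    #include "llvm/Support/IRBuilder.h"

    using namespace llvm;

    // Emit (Op != 0) zero-extended to i32; both values stay unnamed.
    static Value *emitIsNonZero(IRBuilder<> &B, Value *Op) {
      Value *Cmp = B.CreateICmpNE(Op, Constant::getNullValue(Op->getType()));
      return B.CreateZExt(Cmp, B.getInt32Ty());
    }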
@@ -1009,7 +1014,7 @@ struct FFSOpt : public LibCallOptimization {
struct IsDigitOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
// We require integer(i32)
if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
!FT->getParamType(0)->isIntegerTy(32))
@@ -1028,7 +1033,7 @@ struct IsDigitOpt : public LibCallOptimization {
struct IsAsciiOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
// We require integer(i32)
if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
!FT->getParamType(0)->isIntegerTy(32))
@@ -1046,7 +1051,7 @@ struct IsAsciiOpt : public LibCallOptimization {
struct AbsOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
// We require integer(integer) where the types agree.
if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
FT->getParamType(0) != FT->getReturnType())
@@ -1067,7 +1072,7 @@ struct AbsOpt : public LibCallOptimization {
struct ToAsciiOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
// We require i32(i32)
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isIntegerTy(32))
@@ -1147,7 +1152,7 @@ struct PrintFOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Require one fixed pointer argument and an integer/void result.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
!(FT->getReturnType()->isIntegerTy() ||
FT->getReturnType()->isVoidTy()))
@@ -1241,7 +1246,7 @@ struct SPrintFOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Require two fixed pointer arguments and an integer result.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy() ||
!FT->getReturnType()->isIntegerTy())
@@ -1272,7 +1277,7 @@ struct SPrintFOpt : public LibCallOptimization {
struct FWriteOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Require a pointer, an integer, an integer, a pointer, returning integer.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isIntegerTy() ||
!FT->getParamType(2)->isIntegerTy() ||
@@ -1310,7 +1315,7 @@ struct FPutsOpt : public LibCallOptimization {
if (!TD) return 0;
// Require two pointers. Also, we can't optimize if return value is used.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy() ||
!CI->use_empty())
@@ -1379,7 +1384,7 @@ struct FPrintFOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Require two fixed parameters as pointers and an integer result.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
!FT->getParamType(1)->isPointerTy() ||
!FT->getReturnType()->isIntegerTy())
@@ -1410,7 +1415,7 @@ struct FPrintFOpt : public LibCallOptimization {
struct PutsOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
// Require one fixed pointer argument and an integer/void result.
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
!(FT->getReturnType()->isIntegerTy() ||
FT->getReturnType()->isVoidTy()))
@@ -1685,7 +1690,7 @@ void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) {
void SimplifyLibCalls::inferPrototypeAttributes(Function &F) {
- const FunctionType *FTy = F.getFunctionType();
+ FunctionType *FTy = F.getFunctionType();
StringRef Name = F.getName();
switch (Name[0]) {
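
The repeated const FunctionType -> FunctionType change in this file (and in AddrModeMatcher.cpp, BuildLibCalls.cpp, and CodeExtractor.cpp below) tracks the LLVM 3.0 type-system rewrite: Type became mutable so that named struct bodies can be filled in after creation, and the const qualifier was dropped from the IR APIs. A minimal sketch of the new convention, assuming the 3.0 headers:

    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"

    using namespace llvm;

    // Build the i32 (i8*, i8*) prototype a strcmp-style optimizer checks
    // for; note the unqualified Type* and FunctionType* throughout.
    static FunctionType *strcmpPrototype(LLVMContext &Ctx) {
      Type *Params[] = { Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx) };
      return FunctionType::get(Type::getInt32Ty(Ctx), Params,
                               /*isVarArg=*/false);
    }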
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 705f442..c83f56c 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -153,9 +153,13 @@ bool Sinking::ProcessBlock(BasicBlock &BB) {
static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
SmallPtrSet<Instruction *, 8> &Stores) {
- if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
- if (L->isVolatile()) return false;
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
+
+ if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
AliasAnalysis::Location Loc = AA->getLocation(L);
for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(),
E = Stores.end(); I != E; ++I)
@@ -163,11 +167,6 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA,
return false;
}
- if (Inst->mayWriteToMemory()) {
- Stores.insert(Inst);
- return false;
- }
-
if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst))
return false;
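
The reordering in this hunk is a correctness fix rather than a cleanup: LoadInst::mayWriteToMemory() returns true for volatile loads, so the hoisted check subsumes the old isVolatile() test and also guarantees every memory-writing instruction is recorded in Stores before any later load is tested against the set. A sketch of the fixed predicate, with illustrative names:

    #include "llvm/Instructions.h"
    #include "llvm/Analysis/AliasAnalysis.h"
    #include "llvm/ADT/SmallPtrSet.h"

    using namespace llvm;

    // Record writers first; only then may a load be checked for clobbers.
    static bool safeToSink(Instruction *I, AliasAnalysis *AA,
                           SmallPtrSet<Instruction *, 8> &Writers) {
      if (I->mayWriteToMemory()) {          // also catches volatile loads
        Writers.insert(I);
        return false;
      }
      if (LoadInst *L = dyn_cast<LoadInst>(I)) {
        AliasAnalysis::Location Loc = AA->getLocation(L);
        for (SmallPtrSet<Instruction *, 8>::iterator W = Writers.begin(),
             E = Writers.end(); W != E; ++W)
          if (AA->getModRefInfo(*W, Loc) & AliasAnalysis::Mod)
            return false;
      }
      return !isa<TerminatorInst>(I) && !isa<PHINode>(I);
    }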
diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp
deleted file mode 100644
index 9dd83c0..0000000
--- a/lib/Transforms/Scalar/TailDuplication.cpp
+++ /dev/null
@@ -1,373 +0,0 @@
-//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs a limited form of tail duplication, intended to simplify
-// CFGs by removing some unconditional branches. This pass is necessary to
-// straighten out loops created by the C front-end, but also is capable of
-// making other code nicer. After this pass is run, the CFG simplify pass
-// should be run to clean up the mess.
-//
-// This pass could be enhanced in the future to use profile information to be
-// more aggressive.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "tailduplicate"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Constant.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <map>
-using namespace llvm;
-
-STATISTIC(NumEliminated, "Number of unconditional branches eliminated");
-
-static cl::opt<unsigned>
-TailDupThreshold("taildup-threshold",
- cl::desc("Max block size to tail duplicate"),
- cl::init(1), cl::Hidden);
-
-namespace {
- class TailDup : public FunctionPass {
- bool runOnFunction(Function &F);
- public:
- static char ID; // Pass identification, replacement for typeid
- TailDup() : FunctionPass(ID) {
- initializeTailDupPass(*PassRegistry::getPassRegistry());
- }
-
- private:
- inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned);
- inline void eliminateUnconditionalBranch(BranchInst *BI);
- SmallPtrSet<BasicBlock*, 4> CycleDetector;
- };
-}
-
-char TailDup::ID = 0;
-INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false)
-
-// Public interface to the Tail Duplication pass
-FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); }
-
-/// runOnFunction - Top level algorithm - Loop over each unconditional branch in
-/// the function, eliminating it if it looks attractive enough. CycleDetector
-/// prevents infinite loops by checking that we aren't redirecting a branch to
-/// a place it already pointed to earlier; see PR 2323.
-bool TailDup::runOnFunction(Function &F) {
- bool Changed = false;
- CycleDetector.clear();
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
- if (shouldEliminateUnconditionalBranch(I->getTerminator(),
- TailDupThreshold)) {
- eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator()));
- Changed = true;
- } else {
- ++I;
- CycleDetector.clear();
- }
- }
- return Changed;
-}
-
-/// shouldEliminateUnconditionalBranch - Return true if this branch looks
-/// attractive to eliminate. We eliminate the branch if the destination basic
-/// block has <= 5 instructions in it, not counting PHI nodes. In practice,
-/// since one of these is a terminator instruction, this means that we will add
-/// up to 4 instructions to the new block.
-///
-/// We don't count PHI nodes in the count since they will be removed when the
-/// contents of the block are copied over.
-///
-bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI,
- unsigned Threshold) {
- BranchInst *BI = dyn_cast<BranchInst>(TI);
- if (!BI || !BI->isUnconditional()) return false; // Not an uncond branch!
-
- BasicBlock *Dest = BI->getSuccessor(0);
- if (Dest == BI->getParent()) return false; // Do not loop infinitely!
-
- // Do not inline a block if we will just get another branch to the same block!
- TerminatorInst *DTI = Dest->getTerminator();
- if (BranchInst *DBI = dyn_cast<BranchInst>(DTI))
- if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest)
- return false; // Do not loop infinitely!
-
- // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack,
- // because doing so would require breaking critical edges. This should be
- // fixed eventually.
- if (!DTI->use_empty())
- return false;
-
- // Do not bother with blocks with only a single predecessor: simplify
- // CFG will fold these two blocks together!
- pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest);
- ++PI;
- if (PI == PE) return false; // Exactly one predecessor!
-
- BasicBlock::iterator I = Dest->getFirstNonPHI();
-
- for (unsigned Size = 0; I != Dest->end(); ++I) {
- if (Size == Threshold) return false; // The block is too large.
-
- // Don't tail duplicate call instructions. They are very large compared to
- // other instructions.
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) return false;
-
- // Also alloca and malloc.
- if (isa<AllocaInst>(I)) return false;
-
- // Some vector instructions can expand into a number of instructions.
- if (isa<ShuffleVectorInst>(I) || isa<ExtractElementInst>(I) ||
- isa<InsertElementInst>(I)) return false;
-
- // Only count instructions that are not debugger intrinsics.
- if (!isa<DbgInfoIntrinsic>(I)) ++Size;
- }
-
- // Do not tail duplicate a block that has thousands of successors into a block
- // with a single successor if the block has many other predecessors. This can
- // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in
- // cases that have a large number of indirect gotos.
- unsigned NumSuccs = DTI->getNumSuccessors();
- if (NumSuccs > 8) {
- unsigned TooMany = 128;
- if (NumSuccs >= TooMany) return false;
- TooMany = TooMany/NumSuccs;
- for (; PI != PE; ++PI)
- if (TooMany-- == 0) return false;
- }
-
- // If this unconditional branch is a fall-through, be careful about
- // tail duplicating it. In particular, we don't want to taildup it if the
- // original block will still be there after taildup is completed: doing so
- // would eliminate the fall-through, requiring unconditional branches.
- Function::iterator DestI = Dest;
- if (&*--DestI == BI->getParent()) {
- // The uncond branch is a fall-through. Tail duplication of the block
- // will eliminate the fall-through-ness and end up cloning the terminator
- // at the end of the Dest block. Since the original Dest block will
- // continue to exist, this means that one or the other will not be able to
- // fall through. One typical example that this helps with is code like:
- // if (a)
- // foo();
- // if (b)
- // foo();
- // Cloning the 'if b' block into the end of the first foo block is messy.
-
- // The messy case is when the fall-through block falls through to other
- // blocks. This is what we would be preventing if we cloned the block.
- DestI = Dest;
- if (++DestI != Dest->getParent()->end()) {
- BasicBlock *DestSucc = DestI;
- // If any of Dest's successors are fall-throughs, don't do this xform.
- for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest);
- SI != SE; ++SI)
- if (*SI == DestSucc)
- return false;
- }
- }
-
- // Finally, check that we haven't redirected to this target block earlier;
- // there are cases where we loop forever if we don't check this (PR 2323).
- if (!CycleDetector.insert(Dest))
- return false;
-
- return true;
-}
-
-/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to
-/// DestBlock, and that SrcBlock is not the only predecessor of DstBlock. If we
-/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and
-/// DstBlock, return it.
-static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock,
- BasicBlock *DstBlock) {
- // SrcBlock must have a single predecessor.
- pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock);
- if (PI == PE || ++PI != PE) return 0;
-
- BasicBlock *SrcPred = *pred_begin(SrcBlock);
-
- // Look at the predecessors of DstBlock. One of them will be SrcBlock. If
- // there is only one other pred, get it, otherwise we can't handle it.
- PI = pred_begin(DstBlock); PE = pred_end(DstBlock);
- BasicBlock *DstOtherPred = 0;
- BasicBlock *P = *PI;
- if (P == SrcBlock) {
- if (++PI == PE) return 0;
- DstOtherPred = *PI;
- if (++PI != PE) return 0;
- } else {
- DstOtherPred = P;
- if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0;
- }
-
- // We can handle two situations here: "if then" and "if then else" blocks. An
- // 'if then' situation is just where DstOtherPred == SrcPred.
- if (DstOtherPred == SrcPred)
- return SrcPred;
-
- // Check to see if we have an "if then else" situation, which means that
- // DstOtherPred will have a single predecessor and it will be SrcPred.
- PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred);
- if (PI != PE && *PI == SrcPred) {
- if (++PI != PE) return 0; // Not a single pred.
- return SrcPred; // Otherwise, it's an "if then" situation. Return the if.
- }
-
- // Otherwise, this is something we can't handle.
- return 0;
-}
-
-
-/// eliminateUnconditionalBranch - Clone the instructions from the destination
-/// block into the source block, eliminating the specified unconditional branch.
-/// If the destination block defines values used by successors of the dest
-/// block, we may need to insert PHI nodes.
-///
-void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) {
- BasicBlock *SourceBlock = Branch->getParent();
- BasicBlock *DestBlock = Branch->getSuccessor(0);
- assert(SourceBlock != DestBlock && "Our predicate is broken!");
-
- DEBUG(dbgs() << "TailDuplication[" << SourceBlock->getParent()->getName()
- << "]: Eliminating branch: " << *Branch);
-
- // See if we can avoid duplicating code by moving it up to a dominator of both
- // blocks.
- if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) {
- DEBUG(dbgs() << "Found shared dominator: " << DomBlock->getName() << "\n");
-
- // If there are non-phi instructions in DestBlock that have no operands
- // defined in DestBlock, and if the instruction has no side effects, we can
- // move the instruction to DomBlock instead of duplicating it.
- BasicBlock::iterator BBI = DestBlock->getFirstNonPHI();
- while (!isa<TerminatorInst>(BBI)) {
- Instruction *I = BBI++;
-
- bool CanHoist = I->isSafeToSpeculativelyExecute() &&
- !I->mayReadFromMemory();
- if (CanHoist) {
- for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
- if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op)))
- if (OpI->getParent() == DestBlock ||
- (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) {
- CanHoist = false;
- break;
- }
- if (CanHoist) {
- // Remove from DestBlock, move right before the term in DomBlock.
- DestBlock->getInstList().remove(I);
- DomBlock->getInstList().insert(DomBlock->getTerminator(), I);
- DEBUG(dbgs() << "Hoisted: " << *I);
- }
- }
- }
- }
-
- // Tail duplication can not update SSA properties correctly if the values
- // defined in the duplicated tail are used outside of the tail itself. For
- // this reason, we spill all values that are used outside of the tail to the
- // stack.
- for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I)
- if (I->isUsedOutsideOfBlock(DestBlock)) {
- // We found a use outside of the tail. Create a new stack slot to
- // break this inter-block usage pattern.
- DemoteRegToStack(*I);
- }
-
- // We are going to have to map operands from the original block B to the new
- // copy of the block B'. If there are PHI nodes in the DestBlock, these PHI
- // nodes also define part of this mapping. Loop over these PHI nodes, adding
- // them to our mapping.
- //
- std::map<Value*, Value*> ValueMapping;
-
- BasicBlock::iterator BI = DestBlock->begin();
- bool HadPHINodes = isa<PHINode>(BI);
- for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock);
-
- // Clone the non-phi instructions of the dest block into the source block,
- // keeping track of the mapping...
- //
- for (; BI != DestBlock->end(); ++BI) {
- Instruction *New = BI->clone();
- New->setName(BI->getName());
- SourceBlock->getInstList().push_back(New);
- ValueMapping[BI] = New;
- }
-
- // Now that we have built the mapping information and cloned all of the
- // instructions (giving us a new terminator, among other things), walk the new
- // instructions, rewriting references of old instructions to use new
- // instructions.
- //
- BI = Branch; ++BI; // Get an iterator to the first new instruction
- for (; BI != SourceBlock->end(); ++BI)
- for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
- std::map<Value*, Value*>::const_iterator I =
- ValueMapping.find(BI->getOperand(i));
- if (I != ValueMapping.end())
- BI->setOperand(i, I->second);
- }
-
- // Next we check to see if any of the successors of DestBlock had PHI nodes.
- // If so, we need to add entries to the PHI nodes for SourceBlock now.
- for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock);
- SI != SE; ++SI) {
- BasicBlock *Succ = *SI;
- for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) {
- PHINode *PN = cast<PHINode>(PNI);
- // Ok, we have a PHI node. Figure out what the incoming value was for the
- // DestBlock.
- Value *IV = PN->getIncomingValueForBlock(DestBlock);
-
- // Remap the value if necessary...
- std::map<Value*, Value*>::const_iterator I = ValueMapping.find(IV);
- if (I != ValueMapping.end())
- IV = I->second;
- PN->addIncoming(IV, SourceBlock);
- }
- }
-
- // Next, remove the old branch instruction, and any PHI node entries that we
- // had.
- BI = Branch; ++BI; // Get an iterator to the first new instruction
- DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes...
- SourceBlock->getInstList().erase(Branch); // Destroy the uncond branch...
-
- // Final step: now that we have finished everything up, walk the cloned
- // instructions one last time, constant propagating and DCE'ing them, because
- // they may not be needed anymore.
- //
- if (HadPHINodes) {
- while (BI != SourceBlock->end()) {
- Instruction *Inst = BI++;
- if (isInstructionTriviallyDead(Inst))
- Inst->eraseFromParent();
- else if (Value *V = SimplifyInstruction(Inst)) {
- Inst->replaceAllUsesWith(V);
- Inst->eraseFromParent();
- }
- }
- }
-
- ++NumEliminated; // We just killed a branch!
-}
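
With that, the IR-level tail duplication pass is gone from the 3.0 tree (machine-level tail duplication in lib/CodeGen is unaffected). For reference, a minimal reconstruction of the size heuristic the deleted pass applied, assuming the 3.0 headers; the helper name is illustrative and the vector-instruction exclusions are omitted:

    #include "llvm/BasicBlock.h"
    #include "llvm/Instructions.h"
    #include "llvm/IntrinsicInst.h"

    using namespace llvm;

    // Count non-PHI, non-debug instructions; refuse blocks that are too
    // large or contain calls, invokes, or allocas (expensive to clone).
    static bool smallEnoughToDuplicate(BasicBlock *Dest, unsigned Threshold) {
      unsigned Size = 0;
      for (BasicBlock::iterator I = Dest->getFirstNonPHI(), E = Dest->end();
           I != E; ++I) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I) || isa<AllocaInst>(I))
          return false;
        if (!isa<DbgInfoIntrinsic>(I) && ++Size > Threshold)
          return false;
      }
      return true;
    }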
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
index be7bed1..8e5a1eb 100644
--- a/lib/Transforms/Utils/AddrModeMatcher.cpp
+++ b/lib/Transforms/Utils/AddrModeMatcher.cpp
@@ -222,7 +222,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
const TargetData *TD = TLI.getTargetData();
gep_type_iterator GTI = gep_type_begin(AddrInst);
for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
- if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
const StructLayout *SL = TD->getStructLayout(STy);
unsigned Idx =
cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
@@ -557,7 +557,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
Value *Address = User->getOperand(OpNo);
if (!Address->getType()->isPointerTy())
return false;
- const Type *AddressAccessTy =
+ Type *AddressAccessTy =
cast<PointerType>(Address->getType())->getElementType();
// Do a match against the root of this address, ignoring profitability. This
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index b4f74f9..a7f9efd 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -287,7 +287,7 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
///
BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
BasicBlock::iterator SplitIt = SplitPt;
- while (isa<PHINode>(SplitIt))
+ while (isa<PHINode>(SplitIt) || isa<LandingPadInst>(SplitIt))
++SplitIt;
BasicBlock *New = Old->splitBasicBlock(SplitIt, Old->getName()+".split");
@@ -299,138 +299,114 @@ BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt, Pass *P) {
if (DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>()) {
// Old dominates New. New node dominates all other nodes dominated by Old.
- DomTreeNode *OldNode = DT->getNode(Old);
- std::vector<DomTreeNode *> Children;
- for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
- I != E; ++I)
- Children.push_back(*I);
+ if (DomTreeNode *OldNode = DT->getNode(Old)) {
+ std::vector<DomTreeNode *> Children;
+ for (DomTreeNode::iterator I = OldNode->begin(), E = OldNode->end();
+ I != E; ++I)
+ Children.push_back(*I);
DomTreeNode *NewNode = DT->addNewBlock(New,Old);
for (std::vector<DomTreeNode *>::iterator I = Children.begin(),
E = Children.end(); I != E; ++I)
DT->changeImmediateDominator(*I, NewNode);
+ }
}
return New;
}
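
Two independent fixes land in the SplitBlock hunks above: the split point is now advanced past a LandingPadInst as well as PHIs (an EH pad must keep its landingpad at the top of the block), and the dominator-tree update tolerates Old having no DomTree node, as happens for unreachable blocks. A sketch of the split-point rule, assuming the 3.0 headers:

    #include "llvm/BasicBlock.h"
    #include "llvm/Instructions.h"

    using namespace llvm;

    // The earliest point a block may be split at: after all PHIs and
    // after the landingpad, if any.
    static BasicBlock::iterator firstSplittablePoint(BasicBlock *BB) {
      BasicBlock::iterator It = BB->begin();
      while (isa<PHINode>(It) || isa<LandingPadInst>(It))
        ++It;
      return It;
    }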
+/// UpdateAnalysisInformation - Update DominatorTree, LoopInfo, and LCSSA
+/// analysis information.
+static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock *> Preds,
+ Pass *P, bool &HasLoopExit) {
+ if (!P) return;
-/// SplitBlockPredecessors - This method transforms BB by introducing a new
-/// basic block into the function, and moving some of the predecessors of BB to
-/// be predecessors of the new block. The new predecessors are indicated by the
-/// Preds array, which has NumPreds elements in it. The new block is given a
-/// suffix of 'Suffix'.
-///
-/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
-/// LoopInfo, and LCCSA but no other analyses. In particular, it does not
-/// preserve LoopSimplify (because it's complicated to handle the case where one
-/// of the edges being split is an exit of a loop with other exits).
-///
-BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
- BasicBlock *const *Preds,
- unsigned NumPreds, const char *Suffix,
- Pass *P) {
- // Create new basic block, insert right before the original block.
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix,
- BB->getParent(), BB);
-
- // The new block unconditionally branches to the old block.
- BranchInst *BI = BranchInst::Create(BB, NewBB);
-
- LoopInfo *LI = P ? P->getAnalysisIfAvailable<LoopInfo>() : 0;
- Loop *L = LI ? LI->getLoopFor(BB) : 0;
- bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID);
+ LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
+ Loop *L = LI ? LI->getLoopFor(OldBB) : 0;
- // Move the edges from Preds to point to NewBB instead of BB.
- // While here, if we need to preserve loop analyses, collect
- // some information about how this split will affect loops.
- bool HasLoopExit = false;
+ // If we need to preserve loop analyses, collect some information about how
+ // this split will affect loops.
bool IsLoopEntry = !!L;
bool SplitMakesNewLoopHeader = false;
- for (unsigned i = 0; i != NumPreds; ++i) {
- // This is slightly more strict than necessary; the minimum requirement
- // is that there be no more than one indirectbr branching to BB. And
- // all BlockAddress uses would need to be updated.
- assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
-
- Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
-
- if (LI) {
- // If we need to preserve LCSSA, determine if any of
- // the preds is a loop exit.
+ if (LI) {
+ bool PreserveLCSSA = P->mustPreserveAnalysisID(LCSSAID);
+ for (ArrayRef<BasicBlock*>::iterator
+ i = Preds.begin(), e = Preds.end(); i != e; ++i) {
+ BasicBlock *Pred = *i;
+
+ // If we need to preserve LCSSA, determine if any of the preds is a loop
+ // exit.
if (PreserveLCSSA)
- if (Loop *PL = LI->getLoopFor(Preds[i]))
- if (!PL->contains(BB))
+ if (Loop *PL = LI->getLoopFor(Pred))
+ if (!PL->contains(OldBB))
HasLoopExit = true;
- // If we need to preserve LoopInfo, note whether any of the
- // preds crosses an interesting loop boundary.
- if (L) {
- if (L->contains(Preds[i]))
- IsLoopEntry = false;
- else
- SplitMakesNewLoopHeader = true;
- }
+
+ // If we need to preserve LoopInfo, note whether any of the preds crosses
+ // an interesting loop boundary.
+ if (!L) continue;
+ if (L->contains(Pred))
+ IsLoopEntry = false;
+ else
+ SplitMakesNewLoopHeader = true;
}
}
// Update dominator tree if available.
- DominatorTree *DT = P ? P->getAnalysisIfAvailable<DominatorTree>() : 0;
+ DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
if (DT)
DT->splitBlock(NewBB);
- // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
- // node becomes an incoming value for BB's phi node. However, if the Preds
- // list is empty, we need to insert dummy entries into the PHI nodes in BB to
- // account for the newly created predecessor.
- if (NumPreds == 0) {
- // Insert dummy values as the incoming value.
- for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
- cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
- return NewBB;
+ if (!L) return;
+
+ if (IsLoopEntry) {
+ // Add the new block to the nearest enclosing loop (and not an adjacent
+ // loop). To find this, examine each of the predecessors and determine which
+ // loops enclose them, and select the most-nested loop which contains the
+ // loop containing the block being split.
+ Loop *InnermostPredLoop = 0;
+ for (ArrayRef<BasicBlock*>::iterator
+ i = Preds.begin(), e = Preds.end(); i != e; ++i) {
+ BasicBlock *Pred = *i;
+ if (Loop *PredLoop = LI->getLoopFor(Pred)) {
+ // Seek a loop which actually contains the block being split (to avoid
+ // adjacent loops).
+ while (PredLoop && !PredLoop->contains(OldBB))
+ PredLoop = PredLoop->getParentLoop();
+
+ // Select the most-nested of these loops which contains the block.
+ if (PredLoop && PredLoop->contains(OldBB) &&
+ (!InnermostPredLoop ||
+ InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
+ InnermostPredLoop = PredLoop;
+ }
+ }
+
+ if (InnermostPredLoop)
+ InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase());
+ } else {
+ L->addBasicBlockToLoop(NewBB, LI->getBase());
+ if (SplitMakesNewLoopHeader)
+ L->moveToHeader(NewBB);
}
+}
+/// UpdatePHINodes - Update the PHI nodes in OrigBB to include the values coming
+/// from NewBB. This also updates AliasAnalysis, if available.
+static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock*> Preds, BranchInst *BI,
+ Pass *P, bool HasLoopExit) {
+ // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
AliasAnalysis *AA = P ? P->getAnalysisIfAvailable<AliasAnalysis>() : 0;
-
- if (L) {
- if (IsLoopEntry) {
- // Add the new block to the nearest enclosing loop (and not an
- // adjacent loop). To find this, examine each of the predecessors and
- // determine which loops enclose them, and select the most-nested loop
- // which contains the loop containing the block being split.
- Loop *InnermostPredLoop = 0;
- for (unsigned i = 0; i != NumPreds; ++i)
- if (Loop *PredLoop = LI->getLoopFor(Preds[i])) {
- // Seek a loop which actually contains the block being split (to
- // avoid adjacent loops).
- while (PredLoop && !PredLoop->contains(BB))
- PredLoop = PredLoop->getParentLoop();
- // Select the most-nested of these loops which contains the block.
- if (PredLoop &&
- PredLoop->contains(BB) &&
- (!InnermostPredLoop ||
- InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
- InnermostPredLoop = PredLoop;
- }
- if (InnermostPredLoop)
- InnermostPredLoop->addBasicBlockToLoop(NewBB, LI->getBase());
- } else {
- L->addBasicBlockToLoop(NewBB, LI->getBase());
- if (SplitMakesNewLoopHeader)
- L->moveToHeader(NewBB);
- }
- }
-
- // Otherwise, create a new PHI node in NewBB for each PHI node in BB.
- for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ) {
+ for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
PHINode *PN = cast<PHINode>(I++);
-
+
// Check to see if all of the values coming in are the same. If so, we
// don't need to create a new PHI node, unless it's needed for LCSSA.
Value *InVal = 0;
if (!HasLoopExit) {
InVal = PN->getIncomingValueForBlock(Preds[0]);
- for (unsigned i = 1; i != NumPreds; ++i)
+ for (unsigned i = 1, e = Preds.size(); i != e; ++i)
if (InVal != PN->getIncomingValueForBlock(Preds[i])) {
InVal = 0;
break;
@@ -441,31 +417,191 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
// If all incoming values for the new PHI would be the same, just don't
// make a new PHI. Instead, just remove the incoming values from the old
// PHI.
- for (unsigned i = 0; i != NumPreds; ++i)
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i)
PN->removeIncomingValue(Preds[i], false);
} else {
// If the values coming into the block are not the same, we need a PHI.
// Create the new PHI node, insert it into NewBB at the end of the block
PHINode *NewPHI =
- PHINode::Create(PN->getType(), NumPreds, PN->getName()+".ph", BI);
+ PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
if (AA) AA->copyValue(PN, NewPHI);
// Move all of the PHI values for 'Preds' to the new PHI.
- for (unsigned i = 0; i != NumPreds; ++i) {
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
Value *V = PN->removeIncomingValue(Preds[i], false);
NewPHI->addIncoming(V, Preds[i]);
}
+
InVal = NewPHI;
}
-
+
// Add an incoming value to the PHI node in the loop for the preheader
// edge.
PN->addIncoming(InVal, NewBB);
}
+}
+
+/// SplitBlockPredecessors - This method transforms BB by introducing a new
+/// basic block into the function, and moving some of the predecessors of BB to
+/// be predecessors of the new block. The new predecessors are indicated by the
+/// Preds array, which has NumPreds elements in it. The new block is given a
+/// suffix of 'Suffix'.
+///
+/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
+/// LoopInfo, and LCSSA but no other analyses. In particular, it does not
+/// preserve LoopSimplify (because it's complicated to handle the case where one
+/// of the edges being split is an exit of a loop with other exits).
+///
+BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
+ BasicBlock *const *Preds,
+ unsigned NumPreds, const char *Suffix,
+ Pass *P) {
+ // Create new basic block, insert right before the original block.
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), BB->getName()+Suffix,
+ BB->getParent(), BB);
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI = BranchInst::Create(BB, NewBB);
+
+ // Move the edges from Preds to point to NewBB instead of BB.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+ }
+
+ // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
+ // node becomes an incoming value for BB's phi node. However, if the Preds
+ // list is empty, we need to insert dummy entries into the PHI nodes in BB to
+ // account for the newly created predecessor.
+ if (NumPreds == 0) {
+ // Insert dummy values as the incoming value.
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
+ cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+ return NewBB;
+ }
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ bool HasLoopExit = false;
+ UpdateAnalysisInformation(BB, NewBB, ArrayRef<BasicBlock*>(Preds, NumPreds),
+ P, HasLoopExit);
+
+ // Update the PHI nodes in BB with the values coming from NewBB.
+ UpdatePHINodes(BB, NewBB, ArrayRef<BasicBlock*>(Preds, NumPreds), BI,
+ P, HasLoopExit);
return NewBB;
}
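
For reference, a usage sketch of the refactored entry point, which now delegates analysis and PHI bookkeeping to the two helpers above (names here are illustrative):

    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    // Peel two predecessors of BB into a fresh ".split" block; the call
    // updates DominatorTree/LoopInfo/LCSSA through the Pass, if provided.
    static BasicBlock *peelTwoPreds(BasicBlock *BB, BasicBlock *P0,
                                    BasicBlock *P1, Pass *P) {
      BasicBlock *Preds[] = { P0, P1 };
      return SplitBlockPredecessors(BB, Preds, 2, ".split", P);
    }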
+/// SplitLandingPadPredecessors - This method transforms the landing pad,
+/// OrigBB, by introducing two new basic blocks into the function. One of those
+/// new basic blocks gets the predecessors listed in Preds. The other basic
+/// block gets the remaining predecessors of OrigBB. The landingpad instruction
+/// in OrigBB is cloned into both of the new basic blocks. The new blocks are given
+/// the suffixes 'Suffix1' and 'Suffix2', and are returned in the NewBBs vector.
+///
+/// This currently updates the LLVM IR, AliasAnalysis, DominatorTree,
+/// DominanceFrontier, LoopInfo, and LCSSA but no other analyses. In particular,
+/// it does not preserve LoopSimplify (because it's complicated to handle the
+/// case where one of the edges being split is an exit of a loop with other
+/// exits).
+///
+void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
+ ArrayRef<BasicBlock*> Preds,
+ const char *Suffix1, const char *Suffix2,
+ Pass *P,
+ SmallVectorImpl<BasicBlock*> &NewBBs) {
+ assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
+
+ // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
+ // it right before the original block.
+ BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix1,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB1);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
+
+ // Move the edges from Preds to point to NewBB1 instead of OrigBB.
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
+ }
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ bool HasLoopExit = false;
+ UpdateAnalysisInformation(OrigBB, NewBB1, Preds, P, HasLoopExit);
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB1.
+ UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, P, HasLoopExit);
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ SmallVector<BasicBlock*, 8> NewBB2Preds;
+ for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB);
+ i != e; ) {
+ BasicBlock *Pred = *i++;
+ if (Pred == NewBB1) continue;
+ assert(!isa<IndirectBrInst>(Pred->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ NewBB2Preds.push_back(Pred);
+ e = pred_end(OrigBB);
+ }
+
+ BasicBlock *NewBB2 = 0;
+ if (!NewBB2Preds.empty()) {
+ // Create another basic block for the rest of OrigBB's predecessors.
+ NewBB2 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix2,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB2);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ for (SmallVectorImpl<BasicBlock*>::iterator
+ i = NewBB2Preds.begin(), e = NewBB2Preds.end(); i != e; ++i)
+ (*i)->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ HasLoopExit = false;
+ UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, P, HasLoopExit);
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB2.
+ UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, P, HasLoopExit);
+ }
+
+ LandingPadInst *LPad = OrigBB->getLandingPadInst();
+ Instruction *Clone1 = LPad->clone();
+ Clone1->setName(Twine("lpad") + Suffix1);
+ NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1);
+
+ if (NewBB2) {
+ Instruction *Clone2 = LPad->clone();
+ Clone2->setName(Twine("lpad") + Suffix2);
+ NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2);
+
+ // Create a PHI node for the two cloned landingpad instructions.
+ PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
+ PN->addIncoming(Clone1, NewBB1);
+ PN->addIncoming(Clone2, NewBB2);
+ LPad->replaceAllUsesWith(PN);
+ LPad->eraseFromParent();
+ } else {
+ // There is no second clone. Just replace the landing pad with the first
+ // clone.
+ LPad->replaceAllUsesWith(Clone1);
+ LPad->eraseFromParent();
+ }
+}
+
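
A usage sketch of the new function: isolating a single invoke's unwind edge into its own clone of the landing pad (block names are illustrative; the lone predecessor converts implicitly to an ArrayRef):

    #include "llvm/Transforms/Utils/BasicBlockUtils.h"
    #include "llvm/ADT/SmallVector.h"

    using namespace llvm;

    static void isolateUnwindEdge(BasicBlock *LPadBB, BasicBlock *InvokeBB,
                                  Pass *P) {
      // NewBBs receives one block, or two if LPadBB had other preds.
      SmallVector<BasicBlock*, 2> NewBBs;
      SplitLandingPadPredecessors(LPadBB, InvokeBB, ".split1", ".split2",
                                  P, NewBBs);
    }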
/// FindFunctionBackedges - Analyze the specified function to find all of the
/// loop backedges in the function and return them. This is a relatively cheap
/// (compared to computing dominators and loop info) analysis.
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 92ce500..c052910 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -102,7 +102,7 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
++I; // Skip one edge due to the incoming arc from TI.
if (!AllowIdenticalEdges)
return I != E;
-
+
// If AllowIdenticalEdges is true, then we allow this edge to be considered
// non-critical iff all preds come from TI's block.
while (I != E) {
@@ -155,10 +155,10 @@ static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds,
/// This returns the new block if the edge was split, null otherwise.
///
/// If MergeIdenticalEdges is true (not the default), *all* edges from TI to the
-/// specified successor will be merged into the same critical edge block.
-/// This is most commonly interesting with switch instructions, which may
+/// specified successor will be merged into the same critical edge block.
+/// This is most commonly interesting with switch instructions, which may
/// have many edges to any one destination. This ensures that all edges to that
-/// dest go to one block instead of each going to a different block, but isn't
+/// dest go to one block instead of each going to a different block, but isn't
/// the standard definition of a "critical edge".
///
/// It is invalid to call this function on a critical edge that starts at an
@@ -167,15 +167,20 @@ static void CreatePHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds,
/// to.
///
BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
- Pass *P, bool MergeIdenticalEdges) {
+ Pass *P, bool MergeIdenticalEdges,
+ bool DontDeleteUselessPhis) {
if (!isCriticalEdge(TI, SuccNum, MergeIdenticalEdges)) return 0;
-
+
assert(!isa<IndirectBrInst>(TI) &&
"Cannot split critical edge from IndirectBrInst");
-
+
BasicBlock *TIBB = TI->getParent();
BasicBlock *DestBB = TI->getSuccessor(SuccNum);
+ // Splitting the critical edge to a landing pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (DestBB->isLandingPad()) return 0;
+
// Create a new basic block, linking it into the CFG.
BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
TIBB->getName() + "." + DestBB->getName() + "_crit_edge");
@@ -190,7 +195,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
Function &F = *TIBB->getParent();
Function::iterator FBBI = TIBB;
F.getBasicBlockList().insert(++FBBI, NewBB);
-
+
// If there are any PHI nodes in DestBB, we need to update them so that they
// merge incoming values from NewBB instead of from TIBB.
{
@@ -207,35 +212,35 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
// happens because the BB list of PHI nodes are usually in the same
// order.
if (PN->getIncomingBlock(BBIdx) != TIBB)
- BBIdx = PN->getBasicBlockIndex(TIBB);
+ BBIdx = PN->getBasicBlockIndex(TIBB);
PN->setIncomingBlock(BBIdx, NewBB);
}
}
-
+
// If there are any other edges from TIBB to DestBB, update those to go
// through the split block, making those edges non-critical as well (and
// reducing the number of phi entries in the DestBB if relevant).
if (MergeIdenticalEdges) {
for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
if (TI->getSuccessor(i) != DestBB) continue;
-
+
// Remove an entry for TIBB from DestBB phi nodes.
- DestBB->removePredecessor(TIBB);
-
+ DestBB->removePredecessor(TIBB, DontDeleteUselessPhis);
+
// We found another edge to DestBB, go to NewBB instead.
TI->setSuccessor(i, NewBB);
}
}
-
-
+
+
// If we don't have a pass object, we can't update anything...
if (P == 0) return NewBB;
-
+
DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
-
+
// If we have nothing to update, just return.
if (DT == 0 && LI == 0 && PI == 0)
return NewBB;
@@ -263,7 +268,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
}
bool NewBBDominatesDestBB = true;
-
+
// Should we update DominatorTree information?
if (DT) {
DomTreeNode *TINode = DT->getNode(TIBB);
@@ -274,7 +279,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
if (TINode) { // Don't break unreachable code!
DomTreeNode *NewBBNode = DT->addNewBlock(NewBB, TIBB);
DomTreeNode *DestBBNode = 0;
-
+
// If NewBBDominatesDestBB hasn't been computed yet, do so with DT.
if (!OtherPreds.empty()) {
DestBBNode = DT->getNode(DestBB);
@@ -285,7 +290,7 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
}
OtherPreds.clear();
}
-
+
// If NewBBDominatesDestBB, then NewBB dominates DestBB, otherwise it
// doesn't dominate anything.
if (NewBBDominatesDestBB) {
@@ -337,6 +342,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
}
// For each unique exit block...
+ // FIXME: This code is functionally equivalent to the corresponding
+ // loop in LoopSimplify.
SmallVector<BasicBlock *, 4> ExitBlocks;
TIL->getExitBlocks(ExitBlocks);
for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
@@ -348,10 +355,15 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit);
I != E; ++I) {
BasicBlock *P = *I;
- if (TIL->contains(P))
+ if (TIL->contains(P)) {
+ if (isa<IndirectBrInst>(P->getTerminator())) {
+ Preds.clear();
+ break;
+ }
Preds.push_back(P);
- else
+ } else {
HasPredOutsideOfLoop = true;
+ }
}
// If there are any preds not in the loop, we'll need to split
// the edges. The Preds.empty() check is needed because a block
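
Two behavioral changes ride along with the whitespace cleanup in this file: SplitCriticalEdge now refuses edges into landing pads (returning 0, as in the not-critical case), and the new DontDeleteUselessPhis flag lets callers keep the single-entry PHIs that LCSSA form relies on. A usage sketch of the extended signature:

    #include "llvm/Transforms/Utils/BasicBlockUtils.h"

    using namespace llvm;

    static BasicBlock *splitKeepingPhis(TerminatorInst *TI, unsigned SuccNum,
                                        Pass *P) {
      // Returns 0 if the edge is not critical or targets a landing pad.
      return SplitCriticalEdge(TI, SuccNum, P, /*MergeIdenticalEdges=*/false,
                               /*DontDeleteUselessPhis=*/true);
    }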
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 14bb17f..4b5f45b 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -58,8 +58,8 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
AttributeWithIndex AWI =
AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
- const Type *I8Ptr = B.getInt8PtrTy();
- const Type *I32Ty = B.getInt32Ty();
+ Type *I8Ptr = B.getInt8PtrTy();
+ Type *I32Ty = B.getInt32Ty();
Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1),
I8Ptr, I8Ptr, I32Ty, NULL);
CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
@@ -102,7 +102,7 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
AttributeWithIndex AWI[2];
AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
- const Type *I8Ptr = B.getInt8PtrTy();
+ Type *I8Ptr = B.getInt8PtrTy();
Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2),
I8Ptr, I8Ptr, I8Ptr, NULL);
CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
@@ -120,7 +120,7 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
AttributeWithIndex AWI[2];
AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
- const Type *I8Ptr = B.getInt8PtrTy();
+ Type *I8Ptr = B.getInt8PtrTy();
Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2),
I8Ptr, I8Ptr, I8Ptr,
Len->getType(), NULL);
@@ -361,7 +361,7 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) {
this->CI = CI;
Function *Callee = CI->getCalledFunction();
StringRef Name = Callee->getName();
- const FunctionType *FT = Callee->getFunctionType();
+ FunctionType *FT = Callee->getFunctionType();
LLVMContext &Context = CI->getParent()->getContext();
IRBuilder<> B(CI);
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 204c2c6..7adc5f1 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -21,9 +21,17 @@ add_llvm_library(LLVMTransformUtils
PromoteMemoryToRegister.cpp
SSAUpdater.cpp
SimplifyCFG.cpp
+ SimplifyIndVar.cpp
SimplifyInstructions.cpp
UnifyFunctionExitNodes.cpp
Utils.cpp
ValueMapper.cpp
)
+add_llvm_library_dependencies(LLVMTransformUtils
+ LLVMAnalysis
+ LLVMCore
+ LLVMSupport
+ LLVMTarget
+ LLVMipa
+ )
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 6ea831f..cf21f1e 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -331,15 +331,10 @@ ConstantFoldMappedInstruction(const Instruction *I) {
TD);
if (const LoadInst *LI = dyn_cast<LoadInst>(I))
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
- if (!LI->isVolatile() && CE->getOpcode() == Instruction::GetElementPtr)
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer())
- return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(),
- CE);
-
- return ConstantFoldInstOperands(I->getOpcode(), I->getType(), &Ops[0],
- Ops.size(), TD);
+ if (!LI->isVolatile())
+ return ConstantFoldLoadFromConstPtr(Ops[0], TD);
+
+ return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD);
}
/// CloneAndPruneFunctionInto - This works exactly like CloneFunctionInto,
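
The fold above replaces a hand-rolled load-through-constant-GEP pattern with the general ConstantFoldLoadFromConstPtr helper, which covers the same global-initializer case and more. A sketch of the call, assuming 3.0's Analysis/ConstantFolding.h:

    #include "llvm/Analysis/ConstantFolding.h"
    #include "llvm/Constants.h"
    #include "llvm/Target/TargetData.h"

    using namespace llvm;

    // Returns the folded value, or 0 when the pointee is not provably
    // constant; TD may be 0.
    static Constant *tryFoldLoad(Constant *Ptr, const TargetData *TD) {
      return ConstantFoldLoadFromConstPtr(Ptr, TD);
    }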
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index a08fa35..a0e027b 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -50,10 +50,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
I != E; ++I) {
GlobalVariable *GV = new GlobalVariable(*New,
I->getType()->getElementType(),
- false,
- GlobalValue::ExternalLinkage, 0,
- I->getName());
- GV->setAlignment(I->getAlignment());
+ I->isConstant(), I->getLinkage(),
+ (Constant*) 0, I->getName(),
+ (GlobalVariable*) 0,
+ I->isThreadLocal(),
+ I->getType()->getAddressSpace());
+ GV->copyAttributesFrom(I);
VMap[I] = GV;
}
@@ -61,16 +63,19 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I) {
Function *NF =
Function::Create(cast<FunctionType>(I->getType()->getElementType()),
- GlobalValue::ExternalLinkage, I->getName(), New);
+ I->getLinkage(), I->getName(), New);
NF->copyAttributesFrom(I);
VMap[I] = NF;
}
// Loop over the aliases in the module
for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
- I != E; ++I)
- VMap[I] = new GlobalAlias(I->getType(), GlobalAlias::ExternalLinkage,
- I->getName(), NULL, New);
+ I != E; ++I) {
+ GlobalAlias *GA = new GlobalAlias(I->getType(), I->getLinkage(),
+ I->getName(), NULL, New);
+ GA->copyAttributesFrom(I);
+ VMap[I] = GA;
+ }
// Now that all of the things that global variable initializer can refer to
// have been created, loop through and copy the global variable referrers
@@ -81,9 +86,6 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
GlobalVariable *GV = cast<GlobalVariable>(VMap[I]);
if (I->hasInitializer())
GV->setInitializer(MapValue(I->getInitializer(), VMap));
- GV->setLinkage(I->getLinkage());
- GV->setThreadLocal(I->isThreadLocal());
- GV->setConstant(I->isConstant());
}
// Similarly, copy over function bodies now...
@@ -101,15 +103,12 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
CloneFunctionInto(F, I, VMap, /*ModuleLevelChanges=*/true, Returns);
}
-
- F->setLinkage(I->getLinkage());
}
// And aliases
for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
I != E; ++I) {
GlobalAlias *GA = cast<GlobalAlias>(VMap[I]);
- GA->setLinkage(I->getLinkage());
if (const Constant *C = I->getAliasee())
GA->setAliasee(MapValue(C, VMap));
}
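
The CloneModule changes above stop cloned globals from silently losing their properties: constness, linkage, thread-locality, and address space are set at creation, and copyAttributesFrom carries section, alignment, and visibility, making the later fix-up stores redundant (hence their deletion). A sketch of the convention, with an illustrative helper:

    #include "llvm/GlobalVariable.h"
    #include "llvm/Module.h"

    using namespace llvm;

    static GlobalVariable *cloneGlobalShell(Module &New, GlobalVariable *I) {
      GlobalVariable *GV = new GlobalVariable(
          New, I->getType()->getElementType(), I->isConstant(),
          I->getLinkage(), /*Initializer=*/0, I->getName(),
          /*InsertBefore=*/0, I->isThreadLocal(),
          I->getType()->getAddressSpace());
      GV->copyAttributesFrom(I);   // section, alignment, visibility, ...
      return GV;
    }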
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 0813523..5f47ebb 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -50,14 +50,14 @@ namespace {
DominatorTree* DT;
bool AggregateArgs;
unsigned NumExitBlocks;
- const Type *RetTy;
+ Type *RetTy;
public:
CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false)
: DT(dt), AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {}
- Function *ExtractCodeRegion(const std::vector<BasicBlock*> &code);
+ Function *ExtractCodeRegion(ArrayRef<BasicBlock*> code);
- bool isEligible(const std::vector<BasicBlock*> &code);
+ bool isEligible(ArrayRef<BasicBlock*> code);
private:
/// definedInRegion - Return true if the specified value is defined in the
@@ -290,7 +290,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs,
paramTy.clear();
paramTy.push_back(StructPtr);
}
- const FunctionType *funcType =
+ FunctionType *funcType =
FunctionType::get(RetTy, paramTy, false);
// Create the new function
@@ -317,8 +317,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs,
Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
TerminatorInst *TI = newFunction->begin()->getTerminator();
GetElementPtrInst *GEP =
- GetElementPtrInst::Create(AI, Idx, Idx+2,
- "gep_" + inputs[i]->getName(), TI);
+ GetElementPtrInst::Create(AI, Idx, "gep_" + inputs[i]->getName(), TI);
RewriteVal = new LoadInst(GEP, "loadgep_" + inputs[i]->getName(), TI);
} else
RewriteVal = AI++;
@@ -420,7 +419,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
GetElementPtrInst *GEP =
- GetElementPtrInst::Create(Struct, Idx, Idx + 2,
+ GetElementPtrInst::Create(Struct, Idx,
"gep_" + StructValues[i]->getName());
codeReplacer->getInstList().push_back(GEP);
StoreInst *SI = new StoreInst(StructValues[i], GEP);
@@ -446,7 +445,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
GetElementPtrInst *GEP
- = GetElementPtrInst::Create(Struct, Idx, Idx + 2,
+ = GetElementPtrInst::Create(Struct, Idx,
"gep_reload_" + outputs[i]->getName());
codeReplacer->getInstList().push_back(GEP);
Output = GEP;
@@ -561,7 +560,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
Idx[1] = ConstantInt::get(Type::getInt32Ty(Context),
FirstOut+out);
GetElementPtrInst *GEP =
- GetElementPtrInst::Create(OAI, Idx, Idx + 2,
+ GetElementPtrInst::Create(OAI, Idx,
"gep_" + outputs[out]->getName(),
NTRet);
new StoreInst(outputs[out], GEP, NTRet);
@@ -580,7 +579,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
}
// Now that we've done the deed, simplify the switch instruction.
- const Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
+ Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
switch (NumExitBlocks) {
case 0:
// There are no successors (the block containing the switch itself), which
@@ -655,7 +654,7 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) {
/// computed result back into memory.
///
Function *CodeExtractor::
-ExtractCodeRegion(const std::vector<BasicBlock*> &code) {
+ExtractCodeRegion(ArrayRef<BasicBlock*> code) {
if (!isEligible(code))
return 0;
@@ -755,9 +754,13 @@ ExtractCodeRegion(const std::vector<BasicBlock*> &code) {
return newFunction;
}
-bool CodeExtractor::isEligible(const std::vector<BasicBlock*> &code) {
+bool CodeExtractor::isEligible(ArrayRef<BasicBlock*> code) {
+ // Deny a single basic block that's a landing pad block.
+ if (code.size() == 1 && code[0]->isLandingPad())
+ return false;
+
// Deny code region if it contains allocas or vastarts.
- for (std::vector<BasicBlock*>::const_iterator BB = code.begin(), e=code.end();
+ for (ArrayRef<BasicBlock*>::iterator BB = code.begin(), e=code.end();
BB != e; ++BB)
for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end();
I != Ie; ++I)
@@ -771,25 +774,23 @@ bool CodeExtractor::isEligible(const std::vector<BasicBlock*> &code) {
}
-/// ExtractCodeRegion - slurp a sequence of basic blocks into a brand new
-/// function
+/// ExtractCodeRegion - Slurp a sequence of basic blocks into a brand new
+/// function.
///
Function* llvm::ExtractCodeRegion(DominatorTree &DT,
- const std::vector<BasicBlock*> &code,
+ ArrayRef<BasicBlock*> code,
bool AggregateArgs) {
return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code);
}
-/// ExtractBasicBlock - slurp a natural loop into a brand new function
+/// ExtractLoop - Slurp a natural loop into a brand new function.
///
Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) {
return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks());
}
-/// ExtractBasicBlock - slurp a basic block into a brand new function
+/// ExtractBasicBlock - Slurp a basic block into a brand new function.
///
-Function* llvm::ExtractBasicBlock(BasicBlock *BB, bool AggregateArgs) {
- std::vector<BasicBlock*> Blocks;
- Blocks.push_back(BB);
- return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(Blocks);
+Function* llvm::ExtractBasicBlock(ArrayRef<BasicBlock*> BBs, bool AggregateArgs){
+ return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(BBs);
}
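The ArrayRef change above is an API cleanup: callers can now pass a single BasicBlock* or a whole container without building a temporary std::vector. A toy stand-in for the idea (illustrative C++ only, not LLVM's actual ArrayRef class):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Non-owning view over contiguous elements; the implicit single-element
    // constructor is what lets the new ExtractBasicBlock take one block
    // directly.
    template <typename T>
    struct ViewRef {
      const T *Data;
      size_t Len;
      ViewRef(const T &OneElt) : Data(&OneElt), Len(1) {}
      ViewRef(const std::vector<T> &V)
          : Data(V.empty() ? 0 : &V[0]), Len(V.size()) {}
    };

    static int sum(ViewRef<int> Code) {
      int S = 0;
      for (size_t i = 0; i != Code.Len; ++i)
        S += Code.Data[i];
      return S;
    }

    int main() {
      int One = 7;
      std::vector<int> Many;
      Many.push_back(1);
      Many.push_back(2);
      printf("%d %d\n", sum(One), sum(Many)); // both call forms work unchanged
      return 0;
    }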
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index d5b382e..5464dbc 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -45,6 +45,9 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI) {
return InlineFunction(CallSite(II), IFI);
}
+// FIXME: New EH - Remove the functions marked [LIBUNWIND] when new EH is
+// turned on.
+
/// [LIBUNWIND] Look for an llvm.eh.exception call in the given block.
static EHExceptionInst *findExceptionInBlock(BasicBlock *bb) {
for (BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; i++) {
@@ -250,20 +253,32 @@ namespace {
PHINode *InnerSelectorPHI;
SmallVector<Value*, 8> UnwindDestPHIValues;
+ // FIXME: New EH - These will replace the analogous ones above.
+ BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind.
+ BasicBlock *InnerResumeDest; ///< Destination for the callee's resume.
+ LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke.
+ PHINode *InnerEHValuesPHI; ///< PHI for EH values from landingpad insts.
+
public:
- InvokeInliningInfo(InvokeInst *II) :
- OuterUnwindDest(II->getUnwindDest()), OuterSelector(0),
- InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0) {
-
- // If there are PHI nodes in the unwind destination block, we
- // need to keep track of which values came into them from the
- // invoke before removing the edge from this block.
- llvm::BasicBlock *invokeBB = II->getParent();
- for (BasicBlock::iterator I = OuterUnwindDest->begin();
- isa<PHINode>(I); ++I) {
+ InvokeInliningInfo(InvokeInst *II)
+ : OuterUnwindDest(II->getUnwindDest()), OuterSelector(0),
+ InnerUnwindDest(0), InnerExceptionPHI(0), InnerSelectorPHI(0),
+ OuterResumeDest(II->getUnwindDest()), InnerResumeDest(0),
+ CallerLPad(0), InnerEHValuesPHI(0) {
+ // If there are PHI nodes in the unwind destination block, we need to keep
+ // track of which values came into them from the invoke before removing
+ // the edge from this block.
+ llvm::BasicBlock *InvokeBB = II->getParent();
+ BasicBlock::iterator I = OuterUnwindDest->begin();
+ for (; isa<PHINode>(I); ++I) {
// Save the value to use for this edge.
- PHINode *phi = cast<PHINode>(I);
- UnwindDestPHIValues.push_back(phi->getIncomingValueForBlock(invokeBB));
+ PHINode *PHI = cast<PHINode>(I);
+ UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+ }
+
+ // FIXME: With the new EH, this if/dyn_cast should be a 'cast'.
+ if (LandingPadInst *LPI = dyn_cast<LandingPadInst>(I)) {
+ CallerLPad = LPI;
}
}
@@ -281,11 +296,23 @@ namespace {
BasicBlock *getInnerUnwindDest();
+ // FIXME: New EH - Rename when new EH is turned on.
+ BasicBlock *getInnerUnwindDestNewEH();
+
+ LandingPadInst *getLandingPadInst() const { return CallerLPad; }
+
bool forwardEHResume(CallInst *call, BasicBlock *src);
- /// Add incoming-PHI values to the unwind destination block for
- /// the given basic block, using the values for the original
- /// invoke's source block.
+ /// forwardResume - Forward the 'resume' instruction to the caller's landing
+ /// pad block. When the landing pad block has only one predecessor, this is
+ /// a simple branch. When there is more than one predecessor, we need to
+ /// split the landing pad block after the landingpad instruction and jump
+ /// to there.
+ void forwardResume(ResumeInst *RI);
+
+ /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind
+ /// destination block for the given basic block, using the values for the
+ /// original invoke's source block.
void addIncomingPHIValuesFor(BasicBlock *BB) const {
addIncomingPHIValuesForInto(BB, OuterUnwindDest);
}
@@ -300,7 +327,7 @@ namespace {
};
}
-/// Get or create a target for the branch out of rewritten calls to
+/// [LIBUNWIND] Get or create a target for the branch out of rewritten calls to
/// llvm.eh.resume.
BasicBlock *InvokeInliningInfo::getInnerUnwindDest() {
if (InnerUnwindDest) return InnerUnwindDest;
@@ -404,6 +431,60 @@ bool InvokeInliningInfo::forwardEHResume(CallInst *call, BasicBlock *src) {
return true;
}
+/// Get or create a target for the branch from ResumeInsts.
+BasicBlock *InvokeInliningInfo::getInnerUnwindDestNewEH() {
+ // FIXME: New EH - rename this function when new EH is turned on.
+ if (InnerResumeDest) return InnerResumeDest;
+
+ // Split the landing pad.
+ BasicBlock::iterator SplitPoint = CallerLPad; ++SplitPoint;
+ InnerResumeDest =
+ OuterResumeDest->splitBasicBlock(SplitPoint,
+ OuterResumeDest->getName() + ".body");
+
+ // The number of incoming edges we expect to the inner landing pad.
+ const unsigned PHICapacity = 2;
+
+ // Create corresponding new PHIs for all the PHIs in the outer landing pad.
+ BasicBlock::iterator InsertPoint = InnerResumeDest->begin();
+ BasicBlock::iterator I = OuterResumeDest->begin();
+ for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+ PHINode *OuterPHI = cast<PHINode>(I);
+ PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity,
+ OuterPHI->getName() + ".lpad-body",
+ InsertPoint);
+ OuterPHI->replaceAllUsesWith(InnerPHI);
+ InnerPHI->addIncoming(OuterPHI, OuterResumeDest);
+ }
+
+ // Create a PHI for the exception values.
+ InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity,
+ "eh.lpad-body", InsertPoint);
+ CallerLPad->replaceAllUsesWith(InnerEHValuesPHI);
+ InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest);
+
+ // All done.
+ return InnerResumeDest;
+}
+
+/// forwardResume - Forward the 'resume' instruction to the caller's landing pad
+/// block. When the landing pad block has only one predecessor, this is a simple
+/// branch. When there is more than one predecessor, we need to split the
+/// landing pad block after the landingpad instruction and jump to there.
+void InvokeInliningInfo::forwardResume(ResumeInst *RI) {
+ BasicBlock *Dest = getInnerUnwindDestNewEH();
+ BasicBlock *Src = RI->getParent();
+
+ BranchInst::Create(Dest, Src);
+
+ // Update the PHIs in the destination. They were inserted in an order which
+ // makes this work.
+ addIncomingPHIValuesForInto(Src, Dest);
+
+ InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
+ RI->eraseFromParent();
+}
+
/// [LIBUNWIND] Check whether this selector is "only cleanups":
/// call i32 @llvm.eh.selector(blah, blah, i32 0)
static bool isCleanupOnlySelector(EHSelectorInst *selector) {
@@ -421,9 +502,19 @@ static bool isCleanupOnlySelector(EHSelectorInst *selector) {
/// Returns true to indicate that the next block should be skipped.
static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
InvokeInliningInfo &Invoke) {
+ LandingPadInst *LPI = Invoke.getLandingPadInst();
+
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
Instruction *I = BBI++;
-
+
+ if (LPI) // FIXME: New EH - This won't be NULL in the new EH.
+ if (LandingPadInst *L = dyn_cast<LandingPadInst>(I)) {
+ unsigned NumClauses = LPI->getNumClauses();
+ L->reserveClauses(NumClauses);
+ for (unsigned i = 0; i != NumClauses; ++i)
+ L->addClause(LPI->getClause(i));
+ }
+
// We only need to check for function calls: inlined invoke
// instructions require no special handling.
CallInst *CI = dyn_cast<CallInst>(I);
@@ -557,6 +648,10 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock,
// there is now a new entry in them.
Invoke.addIncomingPHIValuesFor(BB);
}
+
+ if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) {
+ Invoke.forwardResume(RI);
+ }
}
// Now that everything is happy, we have one final detail. The PHI nodes in
@@ -636,7 +731,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
const Function *CalledFunc,
InlineFunctionInfo &IFI,
unsigned ByValAlignment) {
- const Type *AggTy = cast<PointerType>(Arg->getType())->getElementType();
+ Type *AggTy = cast<PointerType>(Arg->getType())->getElementType();
// If the called function is readonly, then it could not mutate the caller's
// copy of the byval'd memory. In this case, it is safe to elide the copy and
@@ -726,7 +821,7 @@ static bool isUsedByLifetimeMarker(Value *V) {
// hasLifetimeMarkers - Check whether the given alloca already has
// lifetime.start or lifetime.end intrinsics.
static bool hasLifetimeMarkers(AllocaInst *AI) {
- const Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext());
+ Type *Int8PtrTy = Type::getInt8PtrTy(AI->getType()->getContext());
if (AI->getType() == Int8PtrTy)
return isUsedByLifetimeMarker(AI);
@@ -770,8 +865,15 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
BI != BE; ++BI) {
DebugLoc DL = BI->getDebugLoc();
- if (!DL.isUnknown())
+ if (!DL.isUnknown()) {
BI->setDebugLoc(updateInlinedAtInfo(DL, TheCallDL, BI->getContext()));
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(BI)) {
+ LLVMContext &Ctx = BI->getContext();
+ MDNode *InlinedAt = BI->getDebugLoc().getInlinedAt(Ctx);
+ DVI->setOperand(2, createInlinedVariable(DVI->getVariable(),
+ InlinedAt, Ctx));
+ }
+ }
}
}
}
@@ -822,6 +924,40 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
return false;
}
+ // Find the personality function used by the landing pads of the caller. If it
+ // exists, then check to see that it matches the personality function used in
+ // the callee.
+ for (Function::const_iterator
+ I = Caller->begin(), E = Caller->end(); I != E; ++I)
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) {
+ const BasicBlock *BB = II->getUnwindDest();
+ // FIXME: This 'isa' here should go away once the new EH system is
+ // in place.
+ if (!isa<LandingPadInst>(BB->getFirstNonPHI()))
+ continue;
+ const LandingPadInst *LP = cast<LandingPadInst>(BB->getFirstNonPHI());
+ const Value *CallerPersFn = LP->getPersonalityFn();
+
+ // If the personality functions match, then we can perform the
+ // inlining. Otherwise, we can't inline.
+ // TODO: This isn't 100% true. Some personality functions are proper
+ // supersets of others and can be used in place of the others.
+ for (Function::const_iterator
+ I = CalledFunc->begin(), E = CalledFunc->end(); I != E; ++I)
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) {
+ const BasicBlock *BB = II->getUnwindDest();
+ // FIXME: This 'if/dyn_cast' here should become a normal 'cast' once
+ // the new EH system is in place.
+ if (const LandingPadInst *LP =
+ dyn_cast<LandingPadInst>(BB->getFirstNonPHI()))
+ if (CallerPersFn != LP->getPersonalityFn())
+ return false;
+ break;
+ }
+
+ break;
+ }
+
// Get an iterator to the last basic block in the function, which will have
// the new function inlined after it.
//
@@ -1090,7 +1226,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) {
// Handle all of the return instructions that we just cloned in, and eliminate
// any users of the original call/invoke instruction.
- const Type *RTy = CalledFunc->getReturnType();
+ Type *RTy = CalledFunc->getReturnType();
PHINode *PHI = 0;
if (Returns.size() > 1) {
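The personality-function guard added above boils down to one rule: refuse to inline when the callee's landing pads name a different personality function than the caller's. A self-contained model of that decision (hypothetical types and names, not LLVM's API):

    #include <cstdio>

    struct Pad { const void *PersonalityFn; };

    // Inlining is allowed only when every callee landing pad uses the same
    // personality function as the caller (a caller with no landing pads
    // imposes no constraint).
    static bool canInline(const void *CallerPersFn, const Pad *CalleePads,
                          unsigned N) {
      if (!CallerPersFn)
        return true;
      for (unsigned i = 0; i != N; ++i)
        if (CalleePads[i].PersonalityFn != CallerPersFn)
          return false; // mismatched personalities block inlining
      return true;
    }

    int main() {
      int gxx = 0, sjlj = 0; // stand-ins for two personality symbols
      Pad Pads[] = { { &gxx }, { &gxx } };
      printf("%d\n", canInline(&gxx, Pads, 2));  // 1: personalities match
      printf("%d\n", canInline(&sjlj, Pads, 2)); // 0: mismatch
      return 0;
    }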
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 0f6d9ae..7034feb 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -27,7 +27,6 @@
#include "llvm/Analysis/DebugInfo.h"
#include "llvm/Analysis/DIBuilder.h"
#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -227,13 +226,17 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
bool llvm::isInstructionTriviallyDead(Instruction *I) {
if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+ // We don't want the landingpad instruction removed by anything this general.
+ if (isa<LandingPadInst>(I))
+ return false;
+
// We don't want debug info removed by anything this general, unless
// debug info is empty.
if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
- if (DDI->getAddress())
+ if (DDI->getAddress())
return false;
return true;
- }
+ }
if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
if (DVI->getValue())
return false;
@@ -244,10 +247,16 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) {
// Special case intrinsics that "may have side effects" but can be deleted
// when dead.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
// Safe to delete llvm.stacksave if dead.
if (II->getIntrinsicID() == Intrinsic::stacksave)
return true;
+
+ // Lifetime intrinsics are dead when their right-hand operand is undef.
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end)
+ return isa<UndefValue>(II->getArgOperand(1));
+ }
return false;
}
@@ -712,10 +721,14 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
/// their preferred alignment from the beginning.
///
static unsigned enforceKnownAlignment(Value *V, unsigned Align,
- unsigned PrefAlign) {
+ unsigned PrefAlign, const TargetData *TD) {
V = V->stripPointerCasts();
if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ // If the preferred alignment is greater than the natural stack alignment
+ // then don't round up. This avoids dynamic stack realignment.
+ if (TD && TD->exceedsNaturalStackAlignment(PrefAlign))
+ return Align;
// If there is a requested alignment and if this is an alloca, round up.
if (AI->getAlignment() >= PrefAlign)
return AI->getAlignment();
@@ -766,7 +779,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
Align = std::min(Align, +Value::MaximumAlignment);
if (PrefAlign > Align)
- Align = enforceKnownAlignment(V, Align, PrefAlign);
+ Align = enforceKnownAlignment(V, Align, PrefAlign, TD);
// We don't need to make any adjustment.
return Align;
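The TD parameter threaded into enforceKnownAlignment above exists for one comparison: never round an alloca up past the target's natural stack alignment, since that would force dynamic stack realignment. The arithmetic, as a standalone model (exceedsNaturalStackAlignment approximated by a plain compare):

    #include <algorithm>
    #include <cstdio>

    static unsigned enforceKnownAlignment(unsigned CurAlign, unsigned PrefAlign,
                                          unsigned NaturalStackAlign) {
      if (PrefAlign > NaturalStackAlign)
        return CurAlign; // would require dynamic realignment; leave it alone
      return std::max(CurAlign, PrefAlign); // otherwise round the alloca up
    }

    int main() {
      printf("%u\n", enforceKnownAlignment(4, 16, 16)); // 16: rounded up
      printf("%u\n", enforceKnownAlignment(4, 32, 16)); // 4: left as-is
      return 0;
    }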
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index e79fb5a..cbd54a8 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -213,7 +213,7 @@ ReprocessLoop:
// predecessors from outside of the loop, split the edge now.
SmallVector<BasicBlock*, 8> ExitBlocks;
L->getExitBlocks(ExitBlocks);
-
+
SmallSetVector<BasicBlock *, 8> ExitBlockSet(ExitBlocks.begin(),
ExitBlocks.end());
for (SmallSetVector<BasicBlock *, 8>::iterator I = ExitBlockSet.begin(),
@@ -325,6 +325,14 @@ ReprocessLoop:
DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
<< ExitingBlock->getName() << "\n");
+ // If any reachable control flow within this loop has changed, notify
+ // ScalarEvolution. Currently assume the parent loop doesn't change
+ // (splitting edges doesn't count). If blocks, CFG edges, or other values
+ // in the parent loop change, then we need to call forgetLoop() on the
+ // parent instead.
+ if (SE)
+ SE->forgetLoop(L);
+
assert(pred_begin(ExitingBlock) == pred_end(ExitingBlock));
Changed = true;
LI->removeBlock(ExitingBlock);
@@ -402,13 +410,24 @@ BasicBlock *LoopSimplify::RewriteLoopExitBlock(Loop *L, BasicBlock *Exit) {
}
assert(!LoopBlocks.empty() && "No edges coming in from outside the loop?");
- BasicBlock *NewBB = SplitBlockPredecessors(Exit, &LoopBlocks[0],
- LoopBlocks.size(), ".loopexit",
- this);
+ BasicBlock *NewExitBB = 0;
+
+ if (Exit->isLandingPad()) {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Exit, ArrayRef<BasicBlock*>(&LoopBlocks[0],
+ LoopBlocks.size()),
+ ".loopexit", ".nonloopexit",
+ this, NewBBs);
+ NewExitBB = NewBBs[0];
+ } else {
+ NewExitBB = SplitBlockPredecessors(Exit, &LoopBlocks[0],
+ LoopBlocks.size(), ".loopexit",
+ this);
+ }
DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
- << NewBB->getName() << "\n");
- return NewBB;
+ << NewExitBB->getName() << "\n");
+ return NewExitBB;
}
/// AddBlockAndPredsToSet - Add the specified block, and all of its
@@ -467,23 +486,23 @@ void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB,
if (&*BBI == SplitPreds[i])
return;
}
-
+
// If it isn't already after an outside block, move it after one. This is
// always good as it makes the uncond branch from the outside block into a
// fall-through.
-
+
// Figure out *which* outside block to put this after. Prefer an outside
// block that neighbors a BB actually in the loop.
BasicBlock *FoundBB = 0;
for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
Function::iterator BBI = SplitPreds[i];
- if (++BBI != NewBB->getParent()->end() &&
+ if (++BBI != NewBB->getParent()->end() &&
L->contains(BBI)) {
FoundBB = SplitPreds[i];
break;
}
}
-
+
// If our heuristic for a *good* bb to place this after doesn't find
// anything, just pick something. It's likely better than leaving it within
// the loop.
@@ -544,7 +563,7 @@ Loop *LoopSimplify::SeparateNestedLoop(Loop *L, LPPassManager &LPM) {
// Make sure that NewBB is put someplace intelligent, which doesn't mess up
// code layout too horribly.
PlaceSplitBlockCarefully(NewBB, OuterLoopPreds, L);
-
+
// Create the new outer loop.
Loop *NewOuter = new Loop();
@@ -735,6 +754,7 @@ void LoopSimplify::verifyAnalysis() const {
}
assert(HasIndBrPred &&
"LoopSimplify has no excuse for missing loop header info!");
+ (void)HasIndBrPred;
}
// Indirectbr can interfere with exit block canonicalization.
@@ -742,12 +762,15 @@ void LoopSimplify::verifyAnalysis() const {
bool HasIndBrExiting = false;
SmallVector<BasicBlock*, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i)
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
HasIndBrExiting = true;
break;
}
+ }
+
assert(HasIndBrExiting &&
"LoopSimplify has no excuse for missing exit block info!");
+ (void)HasIndBrExiting;
}
}
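The (void)HasIndBrPred and (void)HasIndBrExiting lines added above are the standard idiom for variables that are only read inside assert(); a minimal illustration:

    #include <cassert>

    int main() {
      bool HasExit = true;
      assert(HasExit && "loop must have an exit");
      (void)HasExit; // still "used" when -DNDEBUG compiles the assert away,
                     // so no unused-variable warning in release builds
      return 0;
    }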
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 6772511..62e4fa2 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -11,9 +11,6 @@
// actual pass or policy, but provides a single function to perform loop
// unrolling.
//
-// It works best when loops have been canonicalized by the -indvars pass,
-// allowing it to determine the trip counts of loops easily.
-//
// The process of unrolling can produce extraneous basic blocks linked with
// unconditional branches. This will be corrected in the future.
//
@@ -24,6 +21,7 @@
#include "llvm/BasicBlock.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Support/Debug.h"
@@ -31,6 +29,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
using namespace llvm;
// TODO: Should these be here or in LoopUnroll?
@@ -61,7 +60,8 @@ static inline void RemapInstruction(Instruction *I,
/// only has one predecessor, and that predecessor only has one successor.
/// The LoopInfo Analysis that is passed will be kept consistent.
/// Returns the new combined block.
-static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) {
+static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
+ LPPassManager *LPM) {
// Merge basic blocks into their predecessor if there is only one distinct
// pred, and if there is only one distinct successor of the predecessor, and
// if there are no PHI nodes.
@@ -93,6 +93,12 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) {
std::string OldName = BB->getName();
// Erase basic block from the function...
+
+ // ScalarEvolution holds references to loop exit blocks.
+ if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) {
+ if (Loop *L = LI->getLoopFor(BB))
+ SE->forgetLoop(L);
+ }
LI->removeBlock(BB);
BB->eraseFromParent();
@@ -109,12 +115,27 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI) {
/// branch instruction. However, if the trip count (and multiple) are not known,
/// loop unrolling will mostly produce more code that is no faster.
///
+/// TripCount is generally defined as the number of times the loop header
+/// executes. UnrollLoop relaxes the definition to permit early exits: here
+/// TripCount is the iteration on which control exits LatchBlock if no early
+/// exits were taken. Note that UnrollLoop assumes that the loop counter test
+/// terminates LatchBlock in order to remove unnecessary instances of the
+/// test. In other words, control may exit the loop prior to TripCount
+/// iterations via an early branch, but control may not exit the loop from the
+/// LatchBlock's terminator prior to TripCount iterations.
+///
+/// Similarly, TripMultiple divides the number of times that the LatchBlock may
+/// execute without exiting the loop.
+///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// If a LoopPassManager is passed in, and the loop is fully removed, it will be
/// removed from the LoopPassManager as well. LPM can also be NULL.
-bool llvm::UnrollLoop(Loop *L, unsigned Count,
- LoopInfo *LI, LPPassManager *LPM) {
+///
+/// This utility preserves LoopInfo. If DominatorTree or ScalarEvolution are
+/// available it must also preserve those analyses.
+bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, LoopInfo *LI, LPPassManager *LPM) {
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader) {
DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
@@ -129,14 +150,14 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count,
BasicBlock *Header = L->getHeader();
BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
-
+
if (!BI || BI->isUnconditional()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
DEBUG(dbgs() <<
" Can't unroll; loop not terminated by a conditional branch.\n");
return false;
}
-
+
if (Header->hasAddressTaken()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
DEBUG(dbgs() <<
@@ -146,16 +167,10 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count,
// Notify ScalarEvolution that the loop will be substantially changed,
// if not outright eliminated.
- if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>())
+ ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>();
+ if (SE)
SE->forgetLoop(L);
- // Find trip count
- unsigned TripCount = L->getSmallConstantTripCount();
- // Find trip multiple if count is not available
- unsigned TripMultiple = 1;
- if (TripCount == 0)
- TripMultiple = L->getSmallConstantTripMultiple();
-
if (TripCount != 0)
DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
if (TripMultiple != 1)
@@ -208,12 +223,7 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count,
ValueToValueMapTy LastValueMap;
std::vector<PHINode*> OrigPHINode;
for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- OrigPHINode.push_back(PN);
- if (Instruction *I =
- dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)))
- if (L->contains(I))
- LastValueMap[I] = I;
+ OrigPHINode.push_back(cast<PHINode>(I));
}
std::vector<BasicBlock*> Headers;
@@ -221,11 +231,20 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count,
Headers.push_back(Header);
Latches.push_back(LatchBlock);
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
for (unsigned It = 1; It != Count; ++It) {
std::vector<BasicBlock*> NewBlocks;
-
- for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(),
- E = LoopBlocks.end(); BB != E; ++BB) {
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
ValueToValueMapTy VMap;
BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
Header->getParent()->getBasicBlockList().push_back(New);
@@ -251,75 +270,55 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count,
L->addBasicBlockToLoop(New, LI->getBase());
- // Add phi entries for newly created values to all exit blocks except
- // the successor of the latch block. The successor of the exit block will
- // be updated specially after unrolling all the way.
- if (*BB != LatchBlock)
- for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB); SI != SE;
- ++SI)
- if (!L->contains(*SI))
- for (BasicBlock::iterator BBI = (*SI)->begin();
- PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
- Value *Incoming = phi->getIncomingValueForBlock(*BB);
- phi->addIncoming(Incoming, New);
- }
-
+ // Add phi entries for newly created values to all exit blocks.
+ for (succ_iterator SI = succ_begin(*BB), SE = succ_end(*BB);
+ SI != SE; ++SI) {
+ if (L->contains(*SI))
+ continue;
+ for (BasicBlock::iterator BBI = (*SI)->begin();
+ PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
+ Value *Incoming = phi->getIncomingValueForBlock(*BB);
+ ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
+ if (It != LastValueMap.end())
+ Incoming = It->second;
+ phi->addIncoming(Incoming, New);
+ }
+ }
// Keep track of new headers and latches as we create them, so that
// we can insert the proper branches later.
if (*BB == Header)
Headers.push_back(New);
- if (*BB == LatchBlock) {
+ if (*BB == LatchBlock)
Latches.push_back(New);
- // Also, clear out the new latch's back edge so that it doesn't look
- // like a new loop, so that it's amenable to being merged with adjacent
- // blocks later on.
- TerminatorInst *Term = New->getTerminator();
- assert(L->contains(Term->getSuccessor(!ContinueOnTrue)));
- assert(Term->getSuccessor(ContinueOnTrue) == LoopExit);
- Term->setSuccessor(!ContinueOnTrue, NULL);
- }
-
NewBlocks.push_back(New);
}
-
+
// Remap all instructions in the most recent iteration
for (unsigned i = 0; i < NewBlocks.size(); ++i)
for (BasicBlock::iterator I = NewBlocks[i]->begin(),
E = NewBlocks[i]->end(); I != E; ++I)
::RemapInstruction(I, LastValueMap);
}
-
- // The latch block exits the loop. If there are any PHI nodes in the
- // successor blocks, update them to use the appropriate values computed as the
- // last iteration of the loop.
- if (Count != 1) {
- BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]);
- for (succ_iterator SI = succ_begin(LatchBlock), SE = succ_end(LatchBlock);
- SI != SE; ++SI) {
- for (BasicBlock::iterator BBI = (*SI)->begin();
- PHINode *PN = dyn_cast<PHINode>(BBI); ++BBI) {
- Value *InVal = PN->removeIncomingValue(LatchBlock, false);
- // If this value was defined in the loop, take the value defined by the
- // last iteration of the loop.
- if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
- if (L->contains(InValI))
- InVal = LastValueMap[InVal];
- }
- PN->addIncoming(InVal, LastIterationBB);
- }
- }
- }
- // Now, if we're doing complete unrolling, loop over the PHI nodes in the
- // original block, setting them to their incoming values.
- if (CompletelyUnroll) {
- BasicBlock *Preheader = L->getLoopPreheader();
- for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
- PHINode *PN = OrigPHINode[i];
+ // Loop over the PHI nodes in the original block, setting incoming values.
+ for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+ PHINode *PN = OrigPHINode[i];
+ if (CompletelyUnroll) {
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
Header->getInstList().erase(PN);
}
+ else if (Count > 1) {
+ Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+ // If this value was defined in the loop, take the value defined by the
+ // last iteration of the loop.
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+ if (L->contains(InValI))
+ InVal = LastValueMap[InVal];
+ }
+ assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
+ PN->addIncoming(InVal, Latches.back());
+ }
}
// Now that all the basic blocks for the unrolled iterations are in place,
@@ -351,6 +350,19 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count,
// iteration.
Term->setSuccessor(!ContinueOnTrue, Dest);
} else {
+ // Remove phi operands at this loop exit
+ if (Dest != LoopExit) {
+ BasicBlock *BB = Latches[i];
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+ SI != SE; ++SI) {
+ if (*SI == Headers[i])
+ continue;
+ for (BasicBlock::iterator BBI = (*SI)->begin();
+ PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
+ Phi->removeIncomingValue(BB, false);
+ }
+ }
+ }
// Replace the conditional branch with an unconditional one.
BranchInst::Create(Dest, Term);
Term->eraseFromParent();
@@ -362,11 +374,29 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count,
BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
if (Term->isUnconditional()) {
BasicBlock *Dest = Term->getSuccessor(0);
- if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI))
+ if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest, LI, LPM))
std::replace(Latches.begin(), Latches.end(), Dest, Fold);
}
}
-
+
+ // FIXME: Reconstruct dom info, because it is not preserved properly.
+ // Incrementally updating domtree after loop unrolling would be easy.
+ if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>())
+ DT->runOnFunction(*L->getHeader()->getParent());
+
+ // Simplify any new induction variables in the partially unrolled loop.
+ if (SE && !CompletelyUnroll) {
+ SmallVector<WeakVH, 16> DeadInsts;
+ simplifyLoopIVs(L, SE, LPM, DeadInsts);
+
+ // Aggressively clean up dead instructions that simplifyLoopIVs already
+ // identified. Any remaining should be cleaned up below.
+ while (!DeadInsts.empty())
+ if (Instruction *Inst =
+ dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+
// At this point, the code is well formed. We now do a quick sweep over the
// inserted code, doing constant propagation and dead code elimination as we
// go.
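The TripCount/TripMultiple contract documented in the UnrollLoop comment above can be summarized numerically. A toy model of the two decisions the pass derives from them (illustrative only, not LLVM code):

    #include <cstdio>

    int main() {
      unsigned TripCount = 12;   // latch exits on iteration 12, if known
      unsigned TripMultiple = 4; // latch executions are a known multiple of 4
      unsigned Count = 4;        // requested unroll factor

      // Unrolling by exactly TripCount removes the loop entirely.
      bool CompletelyUnroll = (Count == TripCount);
      // When the unroll factor divides TripMultiple, the exit can only occur
      // on an iteration congruent to 0 mod Count, so the intermediate copies
      // of the latch test are provably not taken and can fall through.
      bool InnerTestsRemovable = (TripMultiple % Count == 0);

      printf("complete=%d inner-tests-removable=%d\n",
             CompletelyUnroll, InnerTestsRemovable);
      return 0;
    }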
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index c1213fa..61ab3f6 100644
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -58,7 +58,7 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) {
return false;
LLVMContext &Context = CI->getContext();
- const Type *Int32Ty = Type::getInt32Ty(Context);
+ Type *Int32Ty = Type::getInt32Ty(Context);
unsigned caseNo = SI->findCaseValue(ExpectedValue);
std::vector<Value *> Vec;
@@ -105,7 +105,7 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) {
return false;
LLVMContext &Context = CI->getContext();
- const Type *Int32Ty = Type::getInt32Ty(Context);
+ Type *Int32Ty = Type::getInt32Ty(Context);
bool Likely = ExpectedValue->isOne();
// If expect value is equal to 1 it means that we are more likely to take
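For context, the llvm.expect intrinsic that this pass lowers typically enters the IR via __builtin_expect in C/C++; the pass rewrites it into branch-weight metadata on the resulting branch or switch. A minimal source-level example (compiles with GCC or Clang):

    #include <cstdio>

    static int process(int err) {
      if (__builtin_expect(err != 0, 0)) // annotate the error path as unlikely
        return -1;
      return 0;
    }

    int main() {
      printf("%d\n", process(0)); // 0: the expected, fast path
      return 0;
    }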
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index f77d19d..c96c8fc 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -120,18 +120,18 @@ FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI,
// doInitialization - Make sure that there is a prototype for abort in the
// current module.
bool LowerInvoke::doInitialization(Module &M) {
- const Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
+ Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
if (useExpensiveEHSupport) {
// Insert a type for the linked list of jump buffers.
unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0;
JBSize = JBSize ? JBSize : 200;
Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize);
- JBLinkTy = StructType::createNamed(M.getContext(), "llvm.sjljeh.jmpbufty");
+ JBLinkTy = StructType::create(M.getContext(), "llvm.sjljeh.jmpbufty");
Type *Elts[] = { JmpBufTy, PointerType::getUnqual(JBLinkTy) };
JBLinkTy->setBody(Elts);
- const Type *PtrJBList = PointerType::getUnqual(JBLinkTy);
+ Type *PtrJBList = PointerType::getUnqual(JBLinkTy);
// Now that we've done that, insert the jmpbuf list head global, unless it
// already exists.
@@ -240,14 +240,14 @@ void LowerInvoke::rewriteExpensiveInvoke(InvokeInst *II, unsigned InvokeNo,
CallInst* StackSaveRet = CallInst::Create(StackSaveFn, "ssret", II);
new StoreInst(StackSaveRet, StackPtr, true, II); // volatile
- BasicBlock::iterator NI = II->getNormalDest()->getFirstNonPHI();
+ BasicBlock::iterator NI = II->getNormalDest()->getFirstInsertionPt();
// nonvolatile.
new StoreInst(Constant::getNullValue(Type::getInt32Ty(II->getContext())),
InvokeNum, false, NI);
- Instruction* StackPtrLoad = new LoadInst(StackPtr, "stackptr.restore", true,
- II->getUnwindDest()->getFirstNonPHI()
- );
+ Instruction* StackPtrLoad =
+ new LoadInst(StackPtr, "stackptr.restore", true,
+ II->getUnwindDest()->getFirstInsertionPt());
CallInst::Create(StackRestoreFn, StackPtrLoad, "")->insertAfter(StackPtrLoad);
// Add a switch case to our unwind block.
@@ -305,7 +305,7 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) {
++AfterAllocaInsertPt;
for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
AI != E; ++AI) {
- const Type *Ty = AI->getType();
+ Type *Ty = AI->getType();
// Aggregate types can't be cast, but are legal argument types, so we have
// to handle them differently. We use an extract/insert pair as a
// lightweight method to achieve the same goal.
@@ -406,6 +406,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
SmallVector<ReturnInst*,16> Returns;
SmallVector<UnwindInst*,16> Unwinds;
SmallVector<InvokeInst*,16> Invokes;
+ UnreachableInst* UnreachablePlaceholder = 0;
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
@@ -455,8 +456,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())),
ConstantInt::get(Type::getInt32Ty(F.getContext()), 1) };
- OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2],
- "OldBuf",
+ OldJmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "OldBuf",
EntryBB->getTerminator());
// Copy the JBListHead to the alloca.
@@ -487,9 +487,10 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
// Insert a load in the Catch block, and a switch on its value. By default,
// we go to a block that just does an unwind (which is the correct action
- // for a standard call).
+ // for a standard call). We insert an unreachable instruction here and
+ // modify the block to jump to the correct unwinding pad later.
BasicBlock *UnwindBB = BasicBlock::Create(F.getContext(), "unwindbb", &F);
- Unwinds.push_back(new UnwindInst(F.getContext(), UnwindBB));
+ UnreachablePlaceholder = new UnreachableInst(F.getContext(), UnwindBB);
Value *CatchLoad = new LoadInst(InvokeNum, "invoke.num", true, CatchBB);
SwitchInst *CatchSwitch =
@@ -502,8 +503,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
"setjmp.cont");
Idx[1] = ConstantInt::get(Type::getInt32Ty(F.getContext()), 0);
- Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, &Idx[0], &Idx[2],
- "TheJmpBuf",
+ Value *JmpBufPtr = GetElementPtrInst::Create(JmpBuf, Idx, "TheJmpBuf",
EntryBB->getTerminator());
JmpBufPtr = new BitCastInst(JmpBufPtr,
Type::getInt8PtrTy(F.getContext()),
@@ -557,8 +557,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
// Get a pointer to the jmpbuf and longjmp.
Value *Idx[] = { Constant::getNullValue(Type::getInt32Ty(F.getContext())),
ConstantInt::get(Type::getInt32Ty(F.getContext()), 0) };
- Idx[0] = GetElementPtrInst::Create(BufPtr, &Idx[0], &Idx[2], "JmpBuf",
- UnwindBlock);
+ Idx[0] = GetElementPtrInst::Create(BufPtr, Idx, "JmpBuf", UnwindBlock);
Idx[0] = new BitCastInst(Idx[0],
Type::getInt8PtrTy(F.getContext()),
"tmp", UnwindBlock);
@@ -580,6 +579,12 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
Unwinds[i]->eraseFromParent();
}
+ // Replace the inserted unreachable with a branch to the unwind handler.
+ if (UnreachablePlaceholder) {
+ BranchInst::Create(UnwindHandler, UnreachablePlaceholder);
+ UnreachablePlaceholder->eraseFromParent();
+ }
+
// Finally, for any returns from this function, if this function contains an
// invoke, restore the old jmpbuf pointer to its input value.
if (OldJmpBufPtr) {
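LowerInvoke's expensive mode, touched above, emulates unwinding with setjmp/longjmp-style control flow; the structure it emits corresponds roughly to this model:

    #include <csetjmp>
    #include <cstdio>

    static jmp_buf JmpBuf; // analogous to the pass's per-function jump buffer

    static void callee(bool Fail) {
      if (Fail)
        longjmp(JmpBuf, 1); // the "unwind" edge
      puts("normal return");
    }

    int main() {
      if (setjmp(JmpBuf) == 0)
        callee(true);            // normal destination follows the call
      else
        puts("unwind handler");  // unwind destination (the catch block)
      return 0;
    }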
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index ed733d3..686178c 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -277,11 +277,11 @@ void LowerSwitch::processSwitchInst(SwitchInst *SI) {
BasicBlock *CurBlock = SI->getParent();
BasicBlock *OrigBlock = CurBlock;
Function *F = CurBlock->getParent();
- Value *Val = SI->getOperand(0); // The value we are switching on...
+ Value *Val = SI->getCondition(); // The value we are switching on...
BasicBlock* Default = SI->getDefaultDest();
// If there is only the default destination, don't bother with the code below.
- if (SI->getNumOperands() == 2) {
+ if (SI->getNumCases() == 1) {
BranchInst::Create(SI->getDefaultDest(), CurBlock);
CurBlock->getInstList().erase(SI);
return;
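For background on the file being patched: LowerSwitch rewrites a switch over sorted case values into a tree of conditional branches. A toy model of that balanced lowering (plain C++, not the pass's real data structures):

    #include <cstdio>
    #include <vector>

    // Binary-split over sorted cases, mirroring the branch tree the pass
    // emits; returns the matching case index, or -1 for the default.
    static int dispatch(const std::vector<int> &Cases, int Val, int Lo, int Hi) {
      if (Lo > Hi)
        return -1;
      int Mid = (Lo + Hi) / 2;
      if (Val == Cases[Mid])
        return Mid;
      return Val < Cases[Mid] ? dispatch(Cases, Val, Lo, Mid - 1)
                              : dispatch(Cases, Val, Mid + 1, Hi);
    }

    int main() {
      std::vector<int> Cases;
      Cases.push_back(1);
      Cases.push_back(4);
      Cases.push_back(9);
      printf("%d\n", dispatch(Cases, 4, 0, 2)); // 1: hits a case
      printf("%d\n", dispatch(Cases, 5, 0, 2)); // -1: falls to the default
      return 0;
    }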
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index e5a00f4..db3e942 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -86,11 +86,15 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
UI != UE; ++UI) { // Loop over all of the uses of the alloca
const User *U = *UI;
if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ // Note that atomic loads can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
if (LI->isVolatile())
return false;
} else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
if (SI->getOperand(0) == AI)
return false; // Don't allow a store OF the AI, only INTO the AI.
+ // Note that atomic stores can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
if (SI->isVolatile())
return false;
} else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
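The comments added above clarify the promotability rule: atomic loads and stores of a local alloca are fine, volatile ones are not, and the slot's address must never escape. As a per-use predicate, the rule looks like this (a simplified model, not the real isAllocaPromotable, which also allows a few intrinsic users):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Use {
      bool IsLoad, IsStore, StoresTheAddress, IsVolatile;
    };

    static bool isPromotable(const std::vector<Use> &Uses) {
      for (size_t i = 0; i != Uses.size(); ++i) {
        const Use &U = Uses[i];
        if (U.IsVolatile)
          return false; // volatile blocks promotion; atomicity alone doesn't
        if (U.IsStore && U.StoresTheAddress)
          return false; // storing the alloca's address lets it escape
        if (!U.IsLoad && !U.IsStore)
          return false; // any other user is treated conservatively
      }
      return true;
    }

    int main() {
      std::vector<Use> Uses;
      Use Load = { true, false, false, false };
      Use Store = { false, true, false, false };
      Uses.push_back(Load);
      Uses.push_back(Store);
      printf("%d\n", (int)isPromotable(Uses)); // 1: promotable
      return 0;
    }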
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index b47a7cc..fa8061c 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -16,6 +16,7 @@
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Support/AlignOf.h"
#include "llvm/Support/Allocator.h"
@@ -43,7 +44,7 @@ SSAUpdater::~SSAUpdater() {
/// Initialize - Reset this object to get ready for a new set of SSA
/// updates with type 'Ty'. PHI nodes get a name based on 'Name'.
-void SSAUpdater::Initialize(const Type *Ty, StringRef Name) {
+void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
if (AV == 0)
AV = new AvailableValsTy();
else
@@ -378,8 +379,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
// First step: bucket up uses of the alloca by the block they occur in.
// This is important because we have to handle multiple defs/uses in a block
// ourselves: SSAUpdater is purely for cross-block references.
- // FIXME: Want a TinyVector<Instruction*> since there is often 0/1 element.
- DenseMap<BasicBlock*, std::vector<Instruction*> > UsesByBlock;
+ DenseMap<BasicBlock*, TinyPtrVector<Instruction*> > UsesByBlock;
for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
Instruction *User = Insts[i];
@@ -395,7 +395,7 @@ run(const SmallVectorImpl<Instruction*> &Insts) const {
for (unsigned i = 0, e = Insts.size(); i != e; ++i) {
Instruction *User = Insts[i];
BasicBlock *BB = User->getParent();
- std::vector<Instruction*> &BlockUses = UsesByBlock[BB];
+ TinyPtrVector<Instruction*> &BlockUses = UsesByBlock[BB];
// If this block has already been processed, ignore this repeat use.
if (BlockUses.empty()) continue;
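TinyPtrVector, adopted above, addresses exactly the FIXME it replaces: most blocks hold zero or one use, so the container keeps a single inline element and only allocates when a second arrives. A sketch of the idea (simplified; LLVM's real class packs the discriminator into the pointer representation):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    template <typename T>
    class TinyVec {
      T Single;              // valid only when HasSingle is true
      bool HasSingle;
      std::vector<T> Many;   // engaged once a second element lands
    public:
      TinyVec() : Single(), HasSingle(false) {}
      void push_back(const T &V) {
        if (!HasSingle && Many.empty()) {
          Single = V;        // common case: no heap allocation at all
          HasSingle = true;
          return;
        }
        if (HasSingle) {     // spill the inline element first
          Many.push_back(Single);
          HasSingle = false;
        }
        Many.push_back(V);
      }
      size_t size() const { return HasSingle ? 1 : Many.size(); }
    };

    int main() {
      TinyVec<int> V;
      V.push_back(1);
      printf("%u\n", (unsigned)V.size()); // 1, still allocation-free
      V.push_back(2);
      printf("%u\n", (unsigned)V.size()); // 2, now backed by the vector
      return 0;
    }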
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 9d9c324..b8c3ab4 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -63,6 +63,7 @@ class SimplifyCFGOpt {
bool FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
IRBuilder<> &Builder);
+ bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
bool SimplifyUnwind(UnwindInst *UI, IRBuilder<> &Builder);
bool SimplifyUnreachable(UnreachableInst *UI);
@@ -322,7 +323,7 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
// This is some kind of pointer constant. Turn it into a pointer-sized
// ConstantInt if possible.
- const IntegerType *PtrTy = TD->getIntPtrType(V->getContext());
+ IntegerType *PtrTy = TD->getIntPtrType(V->getContext());
// Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
if (isa<ConstantPointerNull>(V))
@@ -2138,6 +2139,52 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
return true;
}
+bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
+ // If this is a trivial landing pad that just continues unwinding the caught
+ // exception then zap the landing pad, turning its invokes into calls.
+ BasicBlock *BB = RI->getParent();
+ LandingPadInst *LPInst = dyn_cast<LandingPadInst>(BB->getFirstNonPHI());
+ if (RI->getValue() != LPInst)
+ // Not a landing pad, or the resume is not unwinding the exception that
+ // caused control to branch here.
+ return false;
+
+ // Check that there are no other instructions except for debug intrinsics.
+ BasicBlock::iterator I = LPInst, E = RI;
+ while (++I != E)
+ if (!isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // Turn all invokes that unwind here into calls and delete the basic block.
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+ InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator());
+ SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+ // Insert a call instruction before the invoke.
+ CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II);
+ Call->takeName(II);
+ Call->setCallingConv(II->getCallingConv());
+ Call->setAttributes(II->getAttributes());
+ Call->setDebugLoc(II->getDebugLoc());
+
+ // Anything that used the value produced by the invoke instruction now uses
+ // the value produced by the call instruction. Note that we do this even
+ // for void functions and calls with no uses so that the callgraph edge is
+ // updated.
+ II->replaceAllUsesWith(Call);
+ BB->removePredecessor(II->getParent());
+
+ // Insert a branch to the normal destination right before the invoke.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Finally, delete the invoke instruction!
+ II->eraseFromParent();
+ }
+
+ // The landingpad is now unreachable. Zap it.
+ BB->eraseFromParent();
+ return true;
+}
+
bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
BasicBlock *BB = RI->getParent();
if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false;
@@ -2244,18 +2291,34 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
while (UI != BB->begin()) {
BasicBlock::iterator BBI = UI;
--BBI;
- // Do not delete instructions that can have side effects, like calls
- // (which may never return) and volatile loads and stores.
+ // Do not delete instructions that can have side effects which might cause
+ // the unreachable to not be reachable; specifically, calls and volatile
+ // operations may have this effect.
if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI)) break;
-
- if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
- if (SI->isVolatile())
- break;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(BBI))
- if (LI->isVolatile())
+
+ if (BBI->mayHaveSideEffects()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+ if (SI->isVolatile())
+ break;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI->isVolatile())
+ break;
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
+ if (RMWI->isVolatile())
+ break;
+ } else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
+ if (CXI->isVolatile())
+ break;
+ } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
+ !isa<LandingPadInst>(BBI)) {
break;
-
+ }
+ // Note that deleting LandingPads here is in fact okay, although it
+ // involves a bit of subtle reasoning. If this inst is a LandingPad,
+ // all the predecessors of this block will be the unwind edges of Invokes,
+ // and we can therefore guarantee this block will be erased.
+ }
+
// Delete this instruction (any uses are guaranteed to be dead)
if (!BBI->use_empty())
BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
@@ -2707,6 +2770,71 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
return false;
}
+/// Check if passing a value to an instruction will cause undefined behavior.
+static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return false;
+
+ if (!I->hasOneUse()) // Only look at single-use instructions, for compile time
+ return false;
+
+ if (C->isNullValue()) {
+ Instruction *Use = I->use_back();
+
+ // Now make sure that there are no instructions in between that can alter
+ // control flow (e.g. calls)
+ for (BasicBlock::iterator i = ++BasicBlock::iterator(I); &*i != Use; ++i)
+ if (i == I->getParent()->end() || i->mayHaveSideEffects())
+ return false;
+
+ // Look through GEPs. A load from a GEP derived from NULL is still undefined
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
+ if (GEP->getPointerOperand() == I)
+ return passingValueIsAlwaysUndefined(V, GEP);
+
+ // Look through bitcasts.
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
+ return passingValueIsAlwaysUndefined(V, BC);
+
+ // Load from null is undefined.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Use))
+ return LI->getPointerAddressSpace() == 0;
+
+ // Store to null is undefined.
+ if (StoreInst *SI = dyn_cast<StoreInst>(Use))
+ return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I;
+ }
+ return false;
+}
+
+/// If BB has an incoming value that will always trigger undefined behavior
+/// (e.g. a null pointer dereference), remove the branch leading here.
+static bool removeUndefIntroducingPredecessor(BasicBlock *BB) {
+ for (BasicBlock::iterator i = BB->begin();
+ PHINode *PHI = dyn_cast<PHINode>(i); ++i)
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (passingValueIsAlwaysUndefined(PHI->getIncomingValue(i), PHI)) {
+ TerminatorInst *T = PHI->getIncomingBlock(i)->getTerminator();
+ IRBuilder<> Builder(T);
+ if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ BB->removePredecessor(PHI->getIncomingBlock(i));
+ // Turn unconditional branches into unreachables and remove the dead
+ // destination from conditional branches.
+ if (BI->isUnconditional())
+ Builder.CreateUnreachable();
+ else
+ Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1) :
+ BI->getSuccessor(0));
+ BI->eraseFromParent();
+ return true;
+ }
+ // TODO: SwitchInst.
+ }
+
+ return false;
+}
+
bool SimplifyCFGOpt::run(BasicBlock *BB) {
bool Changed = false;
@@ -2730,6 +2858,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
// Check for and eliminate duplicate PHI nodes in this block.
Changed |= EliminateDuplicatePHINodes(BB);
+ // Check for and remove branches that will always cause undefined behavior.
+ Changed |= removeUndefIntroducingPredecessor(BB);
+
// Merge basic blocks into their predecessor if there is only one distinct
// pred, and if there is only one distinct successor of the predecessor, and
// if there are no PHI nodes.
@@ -2752,6 +2883,8 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
} else {
if (SimplifyCondBranch(BI, Builder)) return true;
}
+ } else if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) {
+ if (SimplifyResume(RI, Builder)) return true;
} else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
if (SimplifyReturn(RI, Builder)) return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
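A source-level picture of what the new removeUndefIntroducingPredecessor targets (hypothetical example): after earlier simplification this function becomes a PHI with a null incoming value feeding a load, so the edge that supplies the null can only lead to undefined behavior and may be deleted outright.

    #include <cstdio>

    static int readThrough(bool c, int *p) {
      int *q = c ? p : 0; // becomes a PHI merging p and null in the IR
      return *q;          // the load makes the c == false edge undefined
    }

    int main() {
      int x = 42;
      printf("%d\n", readThrough(true, &x)); // only the defined edge is taken
      return 0;
    }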
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
new file mode 100644
index 0000000..76289c0
--- /dev/null
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -0,0 +1,432 @@
+//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements induction variable simplification. It does
+// not define any actual pass or policy, but provides a single function to
+// simplify a loop's induction variables based on ScalarEvolution.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indvars"
+
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
+STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
+STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
+
+namespace {
+ /// SimplifyIndvar - This is a utility for simplifying induction variables
+ /// based on ScalarEvolution. It is the primary instrument of the
+ /// IndvarSimplify pass, but it may also be directly invoked to clean up after
+ /// other loop passes that preserve SCEV.
+ class SimplifyIndvar {
+ Loop *L;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ IVUsers *IU; // NULL for DisableIVRewrite
+ const TargetData *TD; // May be NULL
+
+ SmallVectorImpl<WeakVH> &DeadInsts;
+
+ bool Changed;
+
+ public:
+ SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, LPPassManager *LPM,
+ SmallVectorImpl<WeakVH> &Dead, IVUsers *IVU = NULL) :
+ L(Loop),
+ LI(LPM->getAnalysisIfAvailable<LoopInfo>()),
+ SE(SE),
+ IU(IVU),
+ TD(LPM->getAnalysisIfAvailable<TargetData>()),
+ DeadInsts(Dead),
+ Changed(false) {
+ assert(LI && "IV simplification requires LoopInfo");
+ }
+
+ bool hasChanged() const { return Changed; }
+
+ /// Iteratively perform simplification on a worklist of users of the
+ /// specified induction variable. This is the top-level driver that applies
+ /// all simplifications to users of an IV.
+ void simplifyUsers(PHINode *CurrIV, IVVisitor *V = NULL);
+
+ Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
+
+ bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
+ void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
+ void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
+ bool IsSigned);
+ };
+}
+
+/// foldIVUser - Fold an IV operand into its use. This removes increments of an
+/// aligned IV when used by an instruction that ignores the low bits.
+///
+/// IVOperand is guaranteed SCEVable, but UseInst may not be.
+///
+/// Return the operand of IVOperand for this induction variable if IVOperand can
+/// be folded (in case more folding opportunities have been exposed).
+/// Otherwise return null.
+Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
+ Value *IVSrc = 0;
+ unsigned OperIdx = 0;
+ const SCEV *FoldedExpr = 0;
+ switch (UseInst->getOpcode()) {
+ default:
+ return 0;
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ // We're only interested in the case where we know something about
+ // the numerator and have a constant denominator.
+ if (IVOperand != UseInst->getOperand(OperIdx) ||
+ !isa<ConstantInt>(UseInst->getOperand(1)))
+ return 0;
+
+ // Attempt to fold a binary operator with constant operand.
+ // e.g. ((I + 1) >> 2) => I >> 2
+ if (IVOperand->getNumOperands() != 2 ||
+ !isa<ConstantInt>(IVOperand->getOperand(1)))
+ return 0;
+
+ IVSrc = IVOperand->getOperand(0);
+ // IVSrc must be the (SCEVable) IV, since the other operand is const.
+ assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand");
+
+ ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1));
+ if (UseInst->getOpcode() == Instruction::LShr) {
+ // Get a constant for the divisor. See createSCEV.
+ uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
+ if (D->getValue().uge(BitWidth))
+ return 0;
+
+ D = ConstantInt::get(UseInst->getContext(),
+ APInt(BitWidth, 1).shl(D->getZExtValue()));
+ }
+ FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+ }
+ // We have something that might fold its operand. Compare SCEVs.
+ if (!SE->isSCEVable(UseInst->getType()))
+ return 0;
+
+ // Bypass the operand if SCEV can prove it has no effect.
+ if (SE->getSCEV(UseInst) != FoldedExpr)
+ return 0;
+
+ DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
+ << " -> " << *UseInst << '\n');
+
+ UseInst->setOperand(OperIdx, IVSrc);
+ assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
+
+ ++NumElimOperand;
+ Changed = true;
+ if (IVOperand->use_empty())
+ DeadInsts.push_back(IVOperand);
+ return IVSrc;
+}
+
+/// eliminateIVComparison - SimplifyIVUsers helper for eliminating useless
+/// comparisons against an induction variable.
+void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
+ unsigned IVOperIdx = 0;
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ if (IVOperand != ICmp->getOperand(0)) {
+ // Swapped
+ assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+ IVOperIdx = 1;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ // Get the SCEVs for the ICmp operands.
+ const SCEV *S = SE->getSCEV(ICmp->getOperand(IVOperIdx));
+ const SCEV *X = SE->getSCEV(ICmp->getOperand(1 - IVOperIdx));
+
+ // Simplify unnecessary loops away.
+ const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+ S = SE->getSCEVAtScope(S, ICmpLoop);
+ X = SE->getSCEVAtScope(X, ICmpLoop);
+
+ // If the condition is always true or always false, replace it with
+ // a constant value.
+ if (SE->isKnownPredicate(Pred, S, X))
+ ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
+ else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X))
+ ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
+ else
+ return;
+
+ DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ ++NumElimCmp;
+ Changed = true;
+ DeadInsts.push_back(ICmp);
+}
+
+/// eliminateIVRemainder - SimplifyIVUsers helper for eliminating useless
+/// remainder operations operating on an induction variable.
+void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
+ Value *IVOperand,
+ bool IsSigned) {
+ // We're only interested in the case where we know something about
+ // the numerator.
+ if (IVOperand != Rem->getOperand(0))
+ return;
+
+ // Get the SCEVs for the Rem operands.
+ const SCEV *S = SE->getSCEV(Rem->getOperand(0));
+ const SCEV *X = SE->getSCEV(Rem->getOperand(1));
+
+ // Simplify unnecessary loops away.
+ const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
+ S = SE->getSCEVAtScope(S, ICmpLoop);
+ X = SE->getSCEVAtScope(X, ICmpLoop);
+
+ // i % n --> i if i is in [0,n).
+ if ((!IsSigned || SE->isKnownNonNegative(S)) &&
+ SE->isKnownPredicate(IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ S, X))
+ Rem->replaceAllUsesWith(Rem->getOperand(0));
+ else {
+ // (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n).
+ const SCEV *LessOne =
+ SE->getMinusSCEV(S, SE->getConstant(S->getType(), 1));
+ if (IsSigned && !SE->isKnownNonNegative(LessOne))
+ return;
+
+ if (!SE->isKnownPredicate(IsSigned ?
+ ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT,
+ LessOne, X))
+ return;
+
+ ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ,
+ Rem->getOperand(0), Rem->getOperand(1));
+ SelectInst *Sel =
+ SelectInst::Create(ICmp,
+ ConstantInt::get(Rem->getType(), 0),
+ Rem->getOperand(0), "tmp", Rem);
+ Rem->replaceAllUsesWith(Sel);
+ }
+
+ // Inform IVUsers about the new users.
+ if (IU) {
+ if (Instruction *I = dyn_cast<Instruction>(Rem->getOperand(0)))
+ IU->AddUsersIfInteresting(I);
+ }
+ DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ ++NumElimRem;
+ Changed = true;
+ DeadInsts.push_back(Rem);
+}
+
+/// eliminateIVUser - Eliminate an operation that consumes a simple IV and has
+/// no observable side-effect given the range of IV values.
+/// IVOperand is guaranteed SCEVable, but UseInst may not be.
+bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
+ Instruction *IVOperand) {
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ eliminateIVComparison(ICmp, IVOperand);
+ return true;
+ }
+ if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
+ bool IsSigned = Rem->getOpcode() == Instruction::SRem;
+ if (IsSigned || Rem->getOpcode() == Instruction::URem) {
+ eliminateIVRemainder(Rem, IVOperand, IsSigned);
+ return true;
+ }
+ }
+
+ // Eliminate any operation that SCEV can prove is an identity function.
+ if (!SE->isSCEVable(UseInst->getType()) ||
+ (UseInst->getType() != IVOperand->getType()) ||
+ (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
+ return false;
+
+ DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
+
+ UseInst->replaceAllUsesWith(IVOperand);
+ ++NumElimIdentity;
+ Changed = true;
+ DeadInsts.push_back(UseInst);
+ return true;
+}
+
+/// pushIVUsers - Add all uses of Def to the current IV's worklist.
+///
+static void pushIVUsers(
+ Instruction *Def,
+ SmallPtrSet<Instruction*,16> &Simplified,
+ SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
+
+ for (Value::use_iterator UI = Def->use_begin(), E = Def->use_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Avoid infinite or exponential worklist processing.
+ // Also ensure unique worklist users.
+ // If Def is a LoopPhi, it may not be in the Simplified set, so check for
+ // self edges first.
+ if (User != Def && Simplified.insert(User))
+ SimpleIVUsers.push_back(std::make_pair(User, Def));
+ }
+}
+
+/// isSimpleIVUser - Return true if this instruction generates a simple SCEV
+/// expression in terms of an IV of the given loop.
+///
+/// This is similar to IVUsers' isInteresting() but processes each instruction
+/// non-recursively when the operand is already known to be a simple IV user.
+///
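+/// Example: with %i = {0,+,1}<%loop>, "%x = shl i32 %i, 2" has the affine
+/// SCEV {0,+,4}<%loop> and is a simple IV user; a value that SCEV cannot
+/// model as a recurrence of this loop is not.
+///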
+static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Get the symbolic expression for this instruction.
+ const SCEV *S = SE->getSCEV(I);
+
+ // Only consider affine recurrences.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (AR && AR->getLoop() == L)
+ return true;
+
+ return false;
+}
+
+/// simplifyUsers - Iteratively perform simplification on a worklist of users
+/// of the specified induction variable. Each successive simplification may push
+/// more users which may themselves be candidates for simplification.
+///
+/// This algorithm does not require IVUsers analysis. Instead, it simplifies
+/// instructions in-place during analysis. Rather than rewriting induction
+/// variables bottom-up from their users, it transforms a chain of IVUsers
+/// top-down, updating the IR only when it encounters a clear optimization
+/// opportunity.
+///
+/// Once DisableIVRewrite is the default, LSR will be the only client of
+/// IVUsers.
+///
+void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
+ if (!SE->isSCEVable(CurrIV->getType()))
+ return;
+
+ // Instructions processed by SimplifyIndvar for CurrIV.
+ SmallPtrSet<Instruction*,16> Simplified;
+
+ // Use-def pairs of IV users waiting to be processed for CurrIV.
+ SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
+
+ // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
+ // called multiple times for the same LoopPhi. This is the proper thing to
+ // do for loop header phis that use each other.
+ pushIVUsers(CurrIV, Simplified, SimpleIVUsers);
+
+ while (!SimpleIVUsers.empty()) {
+ std::pair<Instruction*, Instruction*> UseOper =
+ SimpleIVUsers.pop_back_val();
+ // Bypass back edges to avoid extra work.
+ if (UseOper.first == CurrIV) continue;
+
+ Instruction *IVOperand = UseOper.second;
+ for (unsigned N = 0; IVOperand; ++N) {
+ assert(N <= Simplified.size() && "runaway iteration");
+
+ Value *NewOper = foldIVUser(UseOper.first, IVOperand);
+ if (!NewOper)
+ break; // done folding
+ IVOperand = dyn_cast<Instruction>(NewOper);
+ }
+ if (!IVOperand)
+ continue;
+
+ if (eliminateIVUser(UseOper.first, IVOperand)) {
+ pushIVUsers(IVOperand, Simplified, SimpleIVUsers);
+ continue;
+ }
+ CastInst *Cast = dyn_cast<CastInst>(UseOper.first);
+ if (V && Cast) {
+ V->visitCast(Cast);
+ continue;
+ }
+ if (isSimpleIVUser(UseOper.first, L, SE)) {
+ pushIVUsers(UseOper.first, Simplified, SimpleIVUsers);
+ }
+ }
+}
+
+namespace llvm {
+
+/// simplifyUsersOfIV - Simplify instructions that use this induction variable
+/// by using ScalarEvolution to analyze the IV's recurrence.
+bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, LPPassManager *LPM,
+ SmallVectorImpl<WeakVH> &Dead, IVVisitor *V) {
+ LoopInfo *LI = &LPM->getAnalysis<LoopInfo>();
+ SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, LPM, Dead);
+ SIV.simplifyUsers(CurrIV, V);
+ return SIV.hasChanged();
+}
+
+/// simplifyLoopIVs - Simplify users of induction variables within this
+/// loop. This does not actually change or add IVs.
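+///
+/// Typical use (sketch, assuming a loop pass with ScalarEvolution and an
+/// LPPassManager available):
+///   SmallVector<WeakVH, 16> DeadInsts;
+///   Changed |= simplifyLoopIVs(L, SE, &LPM, DeadInsts);
+/// followed by deletion of the instructions accumulated in DeadInsts.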
+bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, LPPassManager *LPM,
+ SmallVectorImpl<WeakVH> &Dead) {
+ bool Changed = false;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, LPM, Dead);
+ }
+ return Changed;
+}
+
+/// simplifyIVUsers - Perform simplification on instructions recorded by the
+/// IVUsers pass.
+///
+/// This is the old approach to IV simplification to be replaced by
+/// simplifyLoopIVs.
+bool simplifyIVUsers(IVUsers *IU, ScalarEvolution *SE, LPPassManager *LPM,
+ SmallVectorImpl<WeakVH> &Dead) {
+ SimplifyIndvar SIV(IU->getLoop(), SE, LPM, Dead);
+
+ // Each round of simplification involves a round of eliminating operations
+ // followed by a round of widening IVs. A single IVUsers worklist is used
+ // across all rounds. The inner loop advances the user. If widening exposes
+ // more uses, then another pass through the outer loop is triggered.
+ for (IVUsers::iterator I = IU->begin(); I != IU->end(); ++I) {
+ Instruction *UseInst = I->getUser();
+ Value *IVOperand = I->getOperandValToReplace();
+
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ SIV.eliminateIVComparison(ICmp, IVOperand);
+ continue;
+ }
+ if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
+ bool IsSigned = Rem->getOpcode() == Instruction::SRem;
+ if (IsSigned || Rem->getOpcode() == Instruction::URem) {
+ SIV.eliminateIVRemainder(Rem, IVOperand, IsSigned);
+ continue;
+ }
+ }
+ }
+ return SIV.hasChanged();
+}
+
+} // namespace llvm
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 973b105..fc2538d 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -183,10 +183,9 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
}
}
- // Remap attached metadata. Don't bother remapping DebugLoc, it can never
- // have mappings to do.
+ // Remap attached metadata.
SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- I->getAllMetadataOtherThanDebugLoc(MDs);
+ I->getAllMetadata(MDs);
for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) {
MDNode *Old = MI->second;
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 94794c3..18308f2 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -58,7 +58,7 @@ static const Module *getModuleFromVal(const Value *V) {
const Function *M = I->getParent() ? I->getParent()->getParent() : 0;
return M ? M->getParent() : 0;
}
-
+
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
return GV->getParent();
return 0;
@@ -142,18 +142,18 @@ public:
/// NamedTypes - The named types that are used by the current module.
std::vector<StructType*> NamedTypes;
-
+
/// NumberedTypes - The numbered types, along with their value.
DenseMap<StructType*, unsigned> NumberedTypes;
-
+
TypePrinting() {}
~TypePrinting() {}
-
+
void incorporateTypes(const Module &M);
-
+
void print(Type *Ty, raw_ostream &OS);
-
+
void printStructBody(StructType *Ty, raw_ostream &OS);
};
} // end anonymous namespace.
@@ -161,25 +161,25 @@ public:
void TypePrinting::incorporateTypes(const Module &M) {
M.findUsedStructTypes(NamedTypes);
-
+
// The list of struct types we got back includes all the struct types; split
// the unnamed ones out to a numbering and remove the anonymous structs.
unsigned NextNumber = 0;
-
+
std::vector<StructType*>::iterator NextToUse = NamedTypes.begin(), I, E;
for (I = NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I) {
StructType *STy = *I;
-
+
// Ignore anonymous types.
- if (STy->isAnonymous())
+ if (STy->isLiteral())
continue;
-
+
if (STy->getName().empty())
NumberedTypes[STy] = NextNumber++;
else
*NextToUse++ = STy;
}
-
+
NamedTypes.erase(NextToUse, NamedTypes.end());
}
@@ -220,13 +220,13 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
}
case Type::StructTyID: {
StructType *STy = cast<StructType>(Ty);
-
- if (STy->isAnonymous())
+
+ if (STy->isLiteral())
return printStructBody(STy, OS);
if (!STy->getName().empty())
return PrintLLVMName(OS, STy->getName(), LocalPrefix);
-
+
DenseMap<StructType*, unsigned>::iterator I = NumberedTypes.find(STy);
if (I != NumberedTypes.end())
OS << '%' << I->second;
@@ -267,10 +267,10 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
OS << "opaque";
return;
}
-
+
if (STy->isPacked())
OS << '<';
-
+
if (STy->getNumElements() == 0) {
OS << "{}";
} else {
@@ -281,7 +281,7 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
OS << ", ";
print(*I, OS);
}
-
+
OS << " }";
}
if (STy->isPacked())
@@ -386,7 +386,8 @@ static SlotTracker *createSlotTracker(const Value *V) {
return new SlotTracker(FA->getParent());
if (const Instruction *I = dyn_cast<Instruction>(V))
- return new SlotTracker(I->getParent()->getParent());
+ if (I->getParent())
+ return new SlotTracker(I->getParent()->getParent());
if (const BasicBlock *BB = dyn_cast<BasicBlock>(V))
return new SlotTracker(BB->getParent());
@@ -419,7 +420,7 @@ static SlotTracker *createSlotTracker(const Value *V) {
// Module level constructor. Causes the contents of the Module (sans functions)
// to be added to the slot table.
SlotTracker::SlotTracker(const Module *M)
- : TheModule(M), TheFunction(0), FunctionProcessed(false),
+ : TheModule(M), TheFunction(0), FunctionProcessed(false),
mNext(0), fNext(0), mdnNext(0) {
}
@@ -490,12 +491,12 @@ void SlotTracker::processFunction() {
E = TheFunction->end(); BB != E; ++BB) {
if (!BB->hasName())
CreateFunctionSlot(BB);
-
+
for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E;
++I) {
if (!I->getType()->isVoidTy() && !I->hasName())
CreateFunctionSlot(I);
-
+
// Intrinsics can directly use metadata. We allow direct calls to any
// llvm.foo function here, because the target may not be linked into the
// optimizer.
@@ -658,6 +659,23 @@ static const char *getPredicateText(unsigned predicate) {
return pred;
}
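+// writeAtomicRMWOperation prints the binop keyword between the "atomicrmw"
+// opcode and its operands, producing e.g. (hypothetical IR):
+//   %old = atomicrmw add i32* %ptr, i32 1 seq_cst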
+static void writeAtomicRMWOperation(raw_ostream &Out,
+ AtomicRMWInst::BinOp Op) {
+ switch (Op) {
+ default: Out << " <unknown operation " << Op << ">"; break;
+ case AtomicRMWInst::Xchg: Out << " xchg"; break;
+ case AtomicRMWInst::Add: Out << " add"; break;
+ case AtomicRMWInst::Sub: Out << " sub"; break;
+ case AtomicRMWInst::And: Out << " and"; break;
+ case AtomicRMWInst::Nand: Out << " nand"; break;
+ case AtomicRMWInst::Or: Out << " or"; break;
+ case AtomicRMWInst::Xor: Out << " xor"; break;
+ case AtomicRMWInst::Max: Out << " max"; break;
+ case AtomicRMWInst::Min: Out << " min"; break;
+ case AtomicRMWInst::UMax: Out << " umax"; break;
+ case AtomicRMWInst::UMin: Out << " umin"; break;
+ }
+}
static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
if (const OverflowingBinaryOperator *OBO =
@@ -792,7 +810,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
Out << "zeroinitializer";
return;
}
-
+
if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV)) {
Out << "blockaddress(";
WriteAsOperandInternal(Out, BA->getFunction(), &TypePrinter, Machine,
@@ -939,13 +957,13 @@ static void WriteMDNodeBodyInternal(raw_ostream &Out, const MDNode *Node,
else {
TypePrinter->print(V->getType(), Out);
Out << ' ';
- WriteAsOperandInternal(Out, Node->getOperand(mi),
+ WriteAsOperandInternal(Out, Node->getOperand(mi),
TypePrinter, Machine, Context);
}
if (mi + 1 != me)
Out << ", ";
}
-
+
Out << "}";
}
@@ -990,7 +1008,7 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
WriteMDNodeBodyInternal(Out, N, TypePrinter, Machine, Context);
return;
}
-
+
if (!Machine) {
if (N->isFunctionLocal())
Machine = new SlotTracker(N->getFunction());
@@ -1020,26 +1038,35 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
char Prefix = '%';
int Slot;
+ // If we have a SlotTracker, use it.
if (Machine) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
Slot = Machine->getGlobalSlot(GV);
Prefix = '@';
} else {
Slot = Machine->getLocalSlot(V);
+
+ // If the local lookup failed, then we may be referring to a value from a
+ // different function. Translate it, as this can happen when taking the
+ // address of a block.
+ if (Slot == -1)
+ if ((Machine = createSlotTracker(V))) {
+ Slot = Machine->getLocalSlot(V);
+ delete Machine;
+ }
}
- } else {
- Machine = createSlotTracker(V);
- if (Machine) {
- if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
- Slot = Machine->getGlobalSlot(GV);
- Prefix = '@';
- } else {
- Slot = Machine->getLocalSlot(V);
- }
- delete Machine;
+ } else if ((Machine = createSlotTracker(V))) {
+ // Otherwise, create one to get the # and then destroy it.
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ Slot = Machine->getGlobalSlot(GV);
+ Prefix = '@';
} else {
- Slot = -1;
+ Slot = Machine->getLocalSlot(V);
}
+ delete Machine;
+ Machine = 0;
+ } else {
+ Slot = -1;
}
if (Slot != -1)
@@ -1081,7 +1108,7 @@ class AssemblyWriter {
const Module *TheModule;
TypePrinting TypePrinter;
AssemblyAnnotationWriter *AnnotationWriter;
-
+
public:
inline AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
const Module *M,
@@ -1093,11 +1120,12 @@ public:
void printMDNodeBody(const MDNode *MD);
void printNamedMDNode(const NamedMDNode *NMD);
-
+
void printModule(const Module *M);
void writeOperand(const Value *Op, bool PrintType);
void writeParamOperand(const Value *Operand, Attributes Attrs);
+ void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
void writeAllMDNodes();
@@ -1128,6 +1156,28 @@ void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
}
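+// writeAtomic appends the optional scope and ordering suffixes, yielding
+// forms such as (hypothetical IR):
+//   load atomic i32* %p singlethread monotonic, align 4
+//   fence seq_cst
+// CrossThread is the default scope and prints no keyword.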
+void AssemblyWriter::writeAtomic(AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ if (Ordering == NotAtomic)
+ return;
+
+ switch (SynchScope) {
+ default: Out << " <bad scope " << int(SynchScope) << ">"; break;
+ case SingleThread: Out << " singlethread"; break;
+ case CrossThread: break;
+ }
+
+ switch (Ordering) {
+ default: Out << " <bad ordering " << int(Ordering) << ">"; break;
+ case Unordered: Out << " unordered"; break;
+ case Monotonic: Out << " monotonic"; break;
+ case Acquire: Out << " acquire"; break;
+ case Release: Out << " release"; break;
+ case AcquireRelease: Out << " acq_rel"; break;
+ case SequentiallyConsistent: Out << " seq_cst"; break;
+ }
+}
+
void AssemblyWriter::writeParamOperand(const Value *Operand,
Attributes Attrs) {
if (Operand == 0) {
@@ -1216,7 +1266,7 @@ void AssemblyWriter::printModule(const Module *M) {
// Output named metadata.
if (!M->named_metadata_empty()) Out << '\n';
-
+
for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
E = M->named_metadata_end(); I != E; ++I)
printNamedMDNode(I);
@@ -1357,26 +1407,8 @@ void AssemblyWriter::printAlias(const GlobalAlias *GA) {
if (Aliasee == 0) {
TypePrinter.print(GA->getType(), Out);
Out << " <<NULL ALIASEE>>";
- } else if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Aliasee)) {
- TypePrinter.print(GV->getType(), Out);
- Out << ' ';
- PrintLLVMName(Out, GV);
- } else if (const Function *F = dyn_cast<Function>(Aliasee)) {
- TypePrinter.print(F->getFunctionType(), Out);
- Out << "* ";
-
- WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
- } else if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(Aliasee)) {
- TypePrinter.print(GA->getType(), Out);
- Out << ' ';
- PrintLLVMName(Out, GA);
} else {
- const ConstantExpr *CE = cast<ConstantExpr>(Aliasee);
- // The only valid GEP is an all zero GEP.
- assert((CE->getOpcode() == Instruction::BitCast ||
- CE->getOpcode() == Instruction::GetElementPtr) &&
- "Unsupported aliasee");
- writeOperand(CE, false);
+ writeOperand(Aliasee, !isa<ConstantExpr>(Aliasee));
}
printInfoComment(*GA);
@@ -1387,29 +1419,29 @@ void AssemblyWriter::printTypeIdentities() {
if (TypePrinter.NumberedTypes.empty() &&
TypePrinter.NamedTypes.empty())
return;
-
+
Out << '\n';
-
+
// We know the number assigned to each type and that the assignment is
// dense. Convert the map to an index table.
std::vector<StructType*> NumberedTypes(TypePrinter.NumberedTypes.size());
- for (DenseMap<StructType*, unsigned>::iterator I =
+ for (DenseMap<StructType*, unsigned>::iterator I =
TypePrinter.NumberedTypes.begin(), E = TypePrinter.NumberedTypes.end();
I != E; ++I) {
assert(I->second < NumberedTypes.size() && "Didn't get a dense numbering?");
NumberedTypes[I->second] = I->first;
}
-
+
// Emit all numbered types.
for (unsigned i = 0, e = NumberedTypes.size(); i != e; ++i) {
Out << '%' << i << " = type ";
-
+
// Make sure we print out at least one level of the type structure, so
// that we do not get %2 = type %2
TypePrinter.printStructBody(NumberedTypes[i], Out);
Out << '\n';
}
-
+
for (unsigned i = 0, e = TypePrinter.NamedTypes.size(); i != e; ++i) {
PrintLLVMName(Out, TypePrinter.NamedTypes[i]->getName(), LocalPrefix);
Out << " = type ";
@@ -1457,7 +1489,7 @@ void AssemblyWriter::printFunction(const Function *F) {
default: Out << "cc" << F->getCallingConv() << " "; break;
}
- const FunctionType *FT = F->getFunctionType();
+ FunctionType *FT = F->getFunctionType();
const AttrListPtr &Attrs = F->getAttributes();
Attributes RetAttrs = Attrs.getRetAttributes();
if (RetAttrs != Attribute::None)
@@ -1628,18 +1660,24 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << '%' << SlotNum << " = ";
}
- // If this is a volatile load or store, print out the volatile marker.
- if ((isa<LoadInst>(I) && cast<LoadInst>(I).isVolatile()) ||
- (isa<StoreInst>(I) && cast<StoreInst>(I).isVolatile())) {
- Out << "volatile ";
- } else if (isa<CallInst>(I) && cast<CallInst>(I).isTailCall()) {
- // If this is a call, check if it's a tail call.
+ if (isa<CallInst>(I) && cast<CallInst>(I).isTailCall())
Out << "tail ";
- }
// Print out the opcode...
Out << I.getOpcodeName();
+ // If this is an atomic load or store, print out the atomic marker.
+ if ((isa<LoadInst>(I) && cast<LoadInst>(I).isAtomic()) ||
+ (isa<StoreInst>(I) && cast<StoreInst>(I).isAtomic()))
+ Out << " atomic";
+
+ // If this is a volatile operation, print out the volatile marker.
+ if ((isa<LoadInst>(I) && cast<LoadInst>(I).isVolatile()) ||
+ (isa<StoreInst>(I) && cast<StoreInst>(I).isVolatile()) ||
+ (isa<AtomicCmpXchgInst>(I) && cast<AtomicCmpXchgInst>(I).isVolatile()) ||
+ (isa<AtomicRMWInst>(I) && cast<AtomicRMWInst>(I).isVolatile()))
+ Out << " volatile";
+
// Print out optimization information.
WriteOptimizationInfo(Out, &I);
@@ -1647,6 +1685,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
if (const CmpInst *CI = dyn_cast<CmpInst>(&I))
Out << ' ' << getPredicateText(CI->getPredicate());
+ // Print out the atomicrmw operation
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&I))
+ writeAtomicRMWOperation(Out, RMWI->getOperation());
+
// Print out the type of the operands...
const Value *Operand = I.getNumOperands() ? I.getOperand(0) : 0;
@@ -1661,18 +1703,20 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
writeOperand(BI.getSuccessor(1), true);
} else if (isa<SwitchInst>(I)) {
+ SwitchInst& SI(cast<SwitchInst>(I));
// Special case switch instruction to get formatting nice and correct.
Out << ' ';
- writeOperand(Operand , true);
+ writeOperand(SI.getCondition(), true);
Out << ", ";
- writeOperand(I.getOperand(1), true);
+ writeOperand(SI.getDefaultDest(), true);
Out << " [";
-
- for (unsigned op = 2, Eop = I.getNumOperands(); op < Eop; op += 2) {
+ // Skip the first item since that's the default case.
+ unsigned NumCases = SI.getNumCases();
+ for (unsigned i = 1; i < NumCases; ++i) {
Out << "\n ";
- writeOperand(I.getOperand(op ), true);
+ writeOperand(SI.getCaseValue(i), true);
Out << ", ";
- writeOperand(I.getOperand(op+1), true);
+ writeOperand(SI.getSuccessor(i), true);
}
Out << "\n ]";
} else if (isa<IndirectBrInst>(I)) {
@@ -1680,7 +1724,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ' ';
writeOperand(Operand, true);
Out << ", [";
-
+
for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i) {
if (i != 1)
Out << ", ";
@@ -1709,6 +1753,24 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
writeOperand(I.getOperand(1), true);
for (const unsigned *i = IVI->idx_begin(), *e = IVI->idx_end(); i != e; ++i)
Out << ", " << *i;
+ } else if (const LandingPadInst *LPI = dyn_cast<LandingPadInst>(&I)) {
+ Out << ' ';
+ TypePrinter.print(I.getType(), Out);
+ Out << " personality ";
+ writeOperand(I.getOperand(0), true); Out << '\n';
+
+ if (LPI->isCleanup())
+ Out << " cleanup";
+
+ for (unsigned i = 0, e = LPI->getNumClauses(); i != e; ++i) {
+ if (i != 0 || LPI->isCleanup()) Out << "\n";
+ if (LPI->isCatch(i))
+ Out << " catch ";
+ else
+ Out << " filter ";
+
+ writeOperand(LPI->getClause(i), true);
+ }
} else if (isa<ReturnInst>(I) && !Operand) {
Out << " void";
} else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
@@ -1878,11 +1940,23 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
}
}
- // Print post operand alignment for load/store.
- if (isa<LoadInst>(I) && cast<LoadInst>(I).getAlignment()) {
- Out << ", align " << cast<LoadInst>(I).getAlignment();
- } else if (isa<StoreInst>(I) && cast<StoreInst>(I).getAlignment()) {
- Out << ", align " << cast<StoreInst>(I).getAlignment();
+ // Print atomic ordering/alignment for memory operations
+ if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (LI->isAtomic())
+ writeAtomic(LI->getOrdering(), LI->getSynchScope());
+ if (LI->getAlignment())
+ Out << ", align " << LI->getAlignment();
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (SI->isAtomic())
+ writeAtomic(SI->getOrdering(), SI->getSynchScope());
+ if (SI->getAlignment())
+ Out << ", align " << SI->getAlignment();
+ } else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+ writeAtomic(CXI->getOrdering(), CXI->getSynchScope());
+ } else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&I)) {
+ writeAtomic(RMWI->getOrdering(), RMWI->getSynchScope());
+ } else if (const FenceInst *FI = dyn_cast<FenceInst>(&I)) {
+ writeAtomic(FI->getOrdering(), FI->getSynchScope());
}
// Print Metadata info.
@@ -1916,7 +1990,7 @@ static void WriteMDNodeComment(const MDNode *Node,
APInt Tag = Val & ~APInt(Val.getBitWidth(), LLVMDebugVersionMask);
if (Val.ult(LLVMDebugVersion))
return;
-
+
Out.PadToColumn(50);
if (Tag == dwarf::DW_TAG_user_base)
Out << "; [ DW_TAG_user_base ]";
@@ -1932,7 +2006,7 @@ void AssemblyWriter::writeAllMDNodes() {
for (SlotTracker::mdn_iterator I = Machine.mdn_begin(), E = Machine.mdn_end();
I != E; ++I)
Nodes[I->second] = cast<MDNode>(I->first);
-
+
for (unsigned i = 0, e = Nodes.size(); i != e; ++i) {
Out << '!' << i << " = metadata ";
printMDNodeBody(Nodes[i]);
@@ -1970,10 +2044,10 @@ void Type::print(raw_ostream &OS) const {
}
TypePrinting TP;
TP.print(const_cast<Type*>(this), OS);
-
+
// If the type is a named struct type, print the body as well.
if (StructType *STy = dyn_cast<StructType>(const_cast<Type*>(this)))
- if (!STy->isAnonymous()) {
+ if (!STy->isLiteral()) {
OS << " = type ";
TP.printStructBody(STy, OS);
}
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
index bf6efa1..485be75 100644
--- a/lib/VMCore/Attributes.cpp
+++ b/lib/VMCore/Attributes.cpp
@@ -38,6 +38,8 @@ std::string Attribute::getAsString(Attributes Attrs) {
Result += "nounwind ";
if (Attrs & Attribute::UWTable)
Result += "uwtable ";
+ if (Attrs & Attribute::ReturnsTwice)
+ Result += "returns_twice ";
if (Attrs & Attribute::InReg)
Result += "inreg ";
if (Attrs & Attribute::NoAlias)
@@ -72,8 +74,6 @@ std::string Attribute::getAsString(Attributes Attrs) {
Result += "noimplicitfloat ";
if (Attrs & Attribute::Naked)
Result += "naked ";
- if (Attrs & Attribute::Hotpatch)
- Result += "hotpatch ";
if (Attrs & Attribute::NonLazyBind)
Result += "nonlazybind ";
if (Attrs & Attribute::StackAlignment) {
@@ -92,7 +92,7 @@ std::string Attribute::getAsString(Attributes Attrs) {
return Result;
}
-Attributes Attribute::typeIncompatible(const Type *Ty) {
+Attributes Attribute::typeIncompatible(Type *Ty) {
Attributes Incompatible = None;
if (!Ty->isIntegerTy())
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 9e93ff3..b849d3e 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -14,11 +14,15 @@
#include "llvm/AutoUpgrade.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
+#include "llvm/Instruction.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/IRBuilder.h"
#include <cstring>
@@ -34,11 +38,48 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
return false;
Name = Name.substr(5); // Strip off "llvm."
- const FunctionType *FTy = F->getFunctionType();
+ FunctionType *FTy = F->getFunctionType();
Module *M = F->getParent();
switch (Name[0]) {
default: break;
+ case 'a':
+ if (Name.startswith("atomic.cmp.swap") ||
+ Name.startswith("atomic.swap") ||
+ Name.startswith("atomic.load.add") ||
+ Name.startswith("atomic.load.sub") ||
+ Name.startswith("atomic.load.and") ||
+ Name.startswith("atomic.load.nand") ||
+ Name.startswith("atomic.load.or") ||
+ Name.startswith("atomic.load.xor") ||
+ Name.startswith("atomic.load.max") ||
+ Name.startswith("atomic.load.min") ||
+ Name.startswith("atomic.load.umax") ||
+ Name.startswith("atomic.load.umin"))
+ return true;
+ case 'i':
+ // This upgrades the old llvm.init.trampoline to the new
+ // llvm.init.trampoline and llvm.adjust.trampoline pair.
+ if (Name == "init.trampoline") {
+ // The new llvm.init.trampoline returns nothing.
+ if (FTy->getReturnType()->isVoidTy())
+ break;
+
+ assert(FTy->getNumParams() == 3 && "old init.trampoline takes 3 args!");
+
+ // Change the name of the old intrinsic so that we can play with its type.
+ std::string NameTmp = F->getName();
+ F->setName("");
+ NewFn = cast<Function>(M->getOrInsertFunction(
+ NameTmp,
+ Type::getVoidTy(M->getContext()),
+ FTy->getParamType(0), FTy->getParamType(1),
+ FTy->getParamType(2), (Type *)0));
+ return true;
+ }
+ case 'm':
+ if (Name == "memory.barrier")
+ return true;
case 'p':
// This upgrades the llvm.prefetch intrinsic to accept one more parameter,
// which is an instruction / data cache identifier. The old version only
@@ -139,8 +180,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
F->getName() == "llvm.x86.sse2.loadu.dq" ||
F->getName() == "llvm.x86.sse2.loadu.pd") {
// Convert to a native, unaligned load.
- const Type *VecTy = CI->getType();
- const Type *IntTy = IntegerType::get(C, 128);
+ Type *VecTy = CI->getType();
+ Type *IntTy = IntegerType::get(C, 128);
IRBuilder<> Builder(C);
Builder.SetInsertPoint(CI->getParent(), CI);
@@ -182,6 +223,80 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Remove intrinsic.
CI->eraseFromParent();
+ } else if (F->getName().startswith("llvm.atomic.cmp.swap")) {
+ IRBuilder<> Builder(C);
+ Builder.SetInsertPoint(CI->getParent(), CI);
+ Value *Val = Builder.CreateAtomicCmpXchg(CI->getArgOperand(0),
+ CI->getArgOperand(1),
+ CI->getArgOperand(2),
+ Monotonic);
+
+ // Replace intrinsic.
+ Val->takeName(CI);
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(Val);
+ CI->eraseFromParent();
+ } else if (F->getName().startswith("llvm.atomic")) {
+ IRBuilder<> Builder(C);
+ Builder.SetInsertPoint(CI->getParent(), CI);
+
+ AtomicRMWInst::BinOp Op;
+ if (F->getName().startswith("llvm.atomic.swap"))
+ Op = AtomicRMWInst::Xchg;
+ else if (F->getName().startswith("llvm.atomic.load.add"))
+ Op = AtomicRMWInst::Add;
+ else if (F->getName().startswith("llvm.atomic.load.sub"))
+ Op = AtomicRMWInst::Sub;
+ else if (F->getName().startswith("llvm.atomic.load.and"))
+ Op = AtomicRMWInst::And;
+ else if (F->getName().startswith("llvm.atomic.load.nand"))
+ Op = AtomicRMWInst::Nand;
+ else if (F->getName().startswith("llvm.atomic.load.or"))
+ Op = AtomicRMWInst::Or;
+ else if (F->getName().startswith("llvm.atomic.load.xor"))
+ Op = AtomicRMWInst::Xor;
+ else if (F->getName().startswith("llvm.atomic.load.max"))
+ Op = AtomicRMWInst::Max;
+ else if (F->getName().startswith("llvm.atomic.load.min"))
+ Op = AtomicRMWInst::Min;
+ else if (F->getName().startswith("llvm.atomic.load.umax"))
+ Op = AtomicRMWInst::UMax;
+ else if (F->getName().startswith("llvm.atomic.load.umin"))
+ Op = AtomicRMWInst::UMin;
+ else
+ llvm_unreachable("Unknown atomic");
+
+ Value *Val = Builder.CreateAtomicRMW(Op, CI->getArgOperand(0),
+ CI->getArgOperand(1),
+ Monotonic);
+
+ // Replace intrinsic.
+ Val->takeName(CI);
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(Val);
+ CI->eraseFromParent();
+ } else if (F->getName() == "llvm.memory.barrier") {
+ IRBuilder<> Builder(C);
+ Builder.SetInsertPoint(CI->getParent(), CI);
+
+ // Note that this conversion ignores the "device" bit; it was not really
+ // well-defined, and got abused because nobody paid enough attention to
+ // get it right. In practice, this probably doesn't matter; application
+ // code generally doesn't need anything stronger than
+ // SequentiallyConsistent (and realistically, SequentiallyConsistent
+ // is lowered to a strong enough barrier for almost anything).
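+ //
+ // As a concrete illustration (hypothetical IR), the common pre-3.0 call
+ //   call void @llvm.memory.barrier(i1 true, i1 true, i1 true, i1 true,
+ //                                  i1 false)
+ // becomes "fence seq_cst" under the mapping below.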
+
+ if (cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue())
+ Builder.CreateFence(SequentiallyConsistent);
+ else if (!cast<ConstantInt>(CI->getArgOperand(0))->getZExtValue())
+ Builder.CreateFence(Release);
+ else if (!cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue())
+ Builder.CreateFence(Acquire);
+ else
+ Builder.CreateFence(AcquireRelease);
+
+ // Remove intrinsic.
+ CI->eraseFromParent();
} else {
llvm_unreachable("Unknown function for CallInst upgrade.");
}
@@ -192,7 +307,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
case Intrinsic::prefetch: {
IRBuilder<> Builder(C);
Builder.SetInsertPoint(CI->getParent(), CI);
- const llvm::Type *I32Ty = llvm::Type::getInt32Ty(CI->getContext());
+ llvm::Type *I32Ty = llvm::Type::getInt32Ty(CI->getContext());
// Add the extra "data cache" argument
Value *Operands[4] = { CI->getArgOperand(0), CI->getArgOperand(1),
@@ -212,6 +327,32 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
CI->eraseFromParent();
break;
}
+ case Intrinsic::init_trampoline: {
+
+ // Transform
+ // %tramp = call i8* llvm.init.trampoline (i8* x, i8* y, i8* z)
+ // to
+ // call void llvm.init.trampoline (i8* %x, i8* %y, i8* %z)
+ // %tramp = call i8* llvm.adjust.trampoline (i8* %x)
+
+ Function *AdjustTrampolineFn =
+ cast<Function>(Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::adjust_trampoline));
+
+ IRBuilder<> Builder(C);
+ Builder.SetInsertPoint(CI);
+
+ Builder.CreateCall3(NewFn, CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2));
+
+ CallInst *AdjustCall = Builder.CreateCall(AdjustTrampolineFn,
+ CI->getArgOperand(0),
+ CI->getName());
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(AdjustCall);
+ CI->eraseFromParent();
+ break;
+ }
}
}
@@ -279,3 +420,249 @@ void llvm::CheckDebugInfoIntrinsics(Module *M) {
}
}
}
+
+/// FindExnAndSelIntrinsics - Find the eh_exception and eh_selector intrinsic
+/// calls reachable from the unwind basic block.
+static void FindExnAndSelIntrinsics(BasicBlock *BB, CallInst *&Exn,
+ CallInst *&Sel,
+ SmallPtrSet<BasicBlock*, 8> &Visited) {
+ if (!Visited.insert(BB)) return;
+
+ for (BasicBlock::iterator
+ I = BB->begin(), E = BB->end(); I != E; ++I) {
+ if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ switch (CI->getCalledFunction()->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::eh_exception:
+ assert(!Exn && "Found more than one eh.exception call!");
+ Exn = CI;
+ break;
+ case Intrinsic::eh_selector:
+ assert(!Sel && "Found more than one eh.selector call!");
+ Sel = CI;
+ break;
+ }
+
+ if (Exn && Sel) return;
+ }
+ }
+
+ if (Exn && Sel) return;
+
+ for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ FindExnAndSelIntrinsics(*I, Exn, Sel, Visited);
+ if (Exn && Sel) return;
+ }
+}
+
+/// TransferClausesToLandingPadInst - Transfer the exception handling clauses
+/// from the eh_selector call to the new landingpad instruction.
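+///
+/// Example (hypothetical IR): a selector call such as
+///   call i32 (i8*, i8*, ...)* @llvm.eh.selector(i8* %exn, i8* %personality,
+///                                               i8* %typeinfo, i32 0)
+/// yields a landingpad with a "catch i8* %typeinfo" clause plus "cleanup"
+/// for the trailing zero filter length.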
+static void TransferClausesToLandingPadInst(LandingPadInst *LPI,
+ CallInst *EHSel) {
+ LLVMContext &Context = LPI->getContext();
+ unsigned N = EHSel->getNumArgOperands();
+
+ for (unsigned i = N - 1; i > 1; --i) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(EHSel->getArgOperand(i))){
+ unsigned FilterLength = CI->getZExtValue();
+ unsigned FirstCatch = i + FilterLength + !FilterLength;
+ assert(FirstCatch <= N && "Invalid filter length");
+
+ if (FirstCatch < N)
+ for (unsigned j = FirstCatch; j < N; ++j) {
+ Value *Val = EHSel->getArgOperand(j);
+ if (!Val->hasName() || Val->getName() != "llvm.eh.catch.all.value") {
+ LPI->addClause(EHSel->getArgOperand(j));
+ } else {
+ GlobalVariable *GV = cast<GlobalVariable>(Val);
+ LPI->addClause(GV->getInitializer());
+ }
+ }
+
+ if (!FilterLength) {
+ // Cleanup.
+ LPI->setCleanup(true);
+ } else {
+ // Filter.
+ SmallVector<Constant *, 4> TyInfo;
+ TyInfo.reserve(FilterLength - 1);
+ for (unsigned j = i + 1; j < FirstCatch; ++j)
+ TyInfo.push_back(cast<Constant>(EHSel->getArgOperand(j)));
+ ArrayType *AType =
+ ArrayType::get(!TyInfo.empty() ? TyInfo[0]->getType() :
+ PointerType::getUnqual(Type::getInt8Ty(Context)),
+ TyInfo.size());
+ LPI->addClause(ConstantArray::get(AType, TyInfo));
+ }
+
+ N = i;
+ }
+ }
+
+ if (N > 2)
+ for (unsigned j = 2; j < N; ++j) {
+ Value *Val = EHSel->getArgOperand(j);
+ if (!Val->hasName() || Val->getName() != "llvm.eh.catch.all.value") {
+ LPI->addClause(EHSel->getArgOperand(j));
+ } else {
+ GlobalVariable *GV = cast<GlobalVariable>(Val);
+ LPI->addClause(GV->getInitializer());
+ }
+ }
+}
+
+/// This function upgrades the old pre-3.0 exception handling system to the new
+/// one. N.B. This will be removed in 3.1.
+void llvm::UpgradeExceptionHandling(Module *M) {
+ Function *EHException = M->getFunction("llvm.eh.exception");
+ Function *EHSelector = M->getFunction("llvm.eh.selector");
+ if (!EHException || !EHSelector)
+ return;
+
+ LLVMContext &Context = M->getContext();
+ Type *ExnTy = PointerType::getUnqual(Type::getInt8Ty(Context));
+ Type *SelTy = Type::getInt32Ty(Context);
+ Type *LPadSlotTy = StructType::get(ExnTy, SelTy, NULL);
+
+ // This map links the invoke instruction with the eh.exception and eh.selector
+ // calls associated with it.
+ DenseMap<InvokeInst*, std::pair<Value*, Value*> > InvokeToIntrinsicsMap;
+ for (Module::iterator
+ I = M->begin(), E = M->end(); I != E; ++I) {
+ Function &F = *I;
+
+ for (Function::iterator
+ II = F.begin(), IE = F.end(); II != IE; ++II) {
+ BasicBlock *BB = &*II;
+ InvokeInst *Inst = dyn_cast<InvokeInst>(BB->getTerminator());
+ if (!Inst) continue;
+ BasicBlock *UnwindDest = Inst->getUnwindDest();
+ if (UnwindDest->isLandingPad()) continue; // Already converted.
+
+ SmallPtrSet<BasicBlock*, 8> Visited;
+ CallInst *Exn = 0;
+ CallInst *Sel = 0;
+ FindExnAndSelIntrinsics(UnwindDest, Exn, Sel, Visited);
+ assert(Exn && Sel && "Cannot find eh.exception and eh.selector calls!");
+ InvokeToIntrinsicsMap[Inst] = std::make_pair(Exn, Sel);
+ }
+ }
+
+ // This map stores the slots where the exception object and selector value are
+ // stored within a function.
+ DenseMap<Function*, std::pair<Value*, Value*> > FnToLPadSlotMap;
+ SmallPtrSet<Instruction*, 32> DeadInsts;
+ for (DenseMap<InvokeInst*, std::pair<Value*, Value*> >::iterator
+ I = InvokeToIntrinsicsMap.begin(), E = InvokeToIntrinsicsMap.end();
+ I != E; ++I) {
+ InvokeInst *Invoke = I->first;
+ BasicBlock *UnwindDest = Invoke->getUnwindDest();
+ Function *F = UnwindDest->getParent();
+ std::pair<Value*, Value*> EHIntrinsics = I->second;
+ CallInst *Exn = cast<CallInst>(EHIntrinsics.first);
+ CallInst *Sel = cast<CallInst>(EHIntrinsics.second);
+
+ // Store the exception object and selector value in the entry block.
+ Value *ExnSlot = 0;
+ Value *SelSlot = 0;
+ if (!FnToLPadSlotMap[F].first) {
+ BasicBlock *Entry = &F->front();
+ ExnSlot = new AllocaInst(ExnTy, "exn", Entry->getTerminator());
+ SelSlot = new AllocaInst(SelTy, "sel", Entry->getTerminator());
+ FnToLPadSlotMap[F] = std::make_pair(ExnSlot, SelSlot);
+ } else {
+ ExnSlot = FnToLPadSlotMap[F].first;
+ SelSlot = FnToLPadSlotMap[F].second;
+ }
+
+ if (!UnwindDest->getSinglePredecessor()) {
+ // The unwind destination doesn't have a single predecessor. Create an
+ // unwind destination which has only one predecessor.
+ BasicBlock *NewBB = BasicBlock::Create(Context, "new.lpad",
+ UnwindDest->getParent());
+ BranchInst::Create(UnwindDest, NewBB);
+ Invoke->setUnwindDest(NewBB);
+
+ // Fix up any PHIs in the original unwind destination block.
+ for (BasicBlock::iterator
+ II = UnwindDest->begin(); isa<PHINode>(II); ++II) {
+ PHINode *PN = cast<PHINode>(II);
+ int Idx = PN->getBasicBlockIndex(Invoke->getParent());
+ if (Idx == -1) continue;
+ PN->setIncomingBlock(Idx, NewBB);
+ }
+
+ UnwindDest = NewBB;
+ }
+
+ IRBuilder<> Builder(Context);
+ Builder.SetInsertPoint(UnwindDest, UnwindDest->getFirstInsertionPt());
+
+ Value *PersFn = Sel->getArgOperand(1);
+ LandingPadInst *LPI = Builder.CreateLandingPad(LPadSlotTy, PersFn, 0);
+ Value *LPExn = Builder.CreateExtractValue(LPI, 0);
+ Value *LPSel = Builder.CreateExtractValue(LPI, 1);
+ Builder.CreateStore(LPExn, ExnSlot);
+ Builder.CreateStore(LPSel, SelSlot);
+
+ TransferClausesToLandingPadInst(LPI, Sel);
+
+ DeadInsts.insert(Exn);
+ DeadInsts.insert(Sel);
+ }
+
+ // Replace the old intrinsic calls with the values from the landingpad
+ // instruction(s). These values were stored in allocas for us to use here.
+ for (DenseMap<InvokeInst*, std::pair<Value*, Value*> >::iterator
+ I = InvokeToIntrinsicsMap.begin(), E = InvokeToIntrinsicsMap.end();
+ I != E; ++I) {
+ std::pair<Value*, Value*> EHIntrinsics = I->second;
+ CallInst *Exn = cast<CallInst>(EHIntrinsics.first);
+ CallInst *Sel = cast<CallInst>(EHIntrinsics.second);
+ BasicBlock *Parent = Exn->getParent();
+
+ std::pair<Value*,Value*> ExnSelSlots = FnToLPadSlotMap[Parent->getParent()];
+
+ IRBuilder<> Builder(Context);
+ Builder.SetInsertPoint(Parent, Exn);
+ LoadInst *LPExn = Builder.CreateLoad(ExnSelSlots.first, "exn.load");
+ LoadInst *LPSel = Builder.CreateLoad(ExnSelSlots.second, "sel.load");
+
+ Exn->replaceAllUsesWith(LPExn);
+ Sel->replaceAllUsesWith(LPSel);
+ }
+
+ // Remove the dead instructions.
+ for (SmallPtrSet<Instruction*, 32>::iterator
+ I = DeadInsts.begin(), E = DeadInsts.end(); I != E; ++I) {
+ Instruction *Inst = *I;
+ Inst->eraseFromParent();
+ }
+
+ // Replace calls to "llvm.eh.resume" with the 'resume' instruction. Load the
+ // exception and selector values from the stored place.
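+ // For illustration (hypothetical IR):
+ //   call void @llvm.eh.resume(i8* %exn, i32 %sel)
+ // becomes
+ //   %lpad.val0 = insertvalue { i8*, i32 } undef, i8* %exn, 0
+ //   %lpad.val = insertvalue { i8*, i32 } %lpad.val0, i32 %sel, 1
+ //   resume { i8*, i32 } %lpad.val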
+ Function *EHResume = M->getFunction("llvm.eh.resume");
+ if (!EHResume) return;
+
+ while (!EHResume->use_empty()) {
+ CallInst *Resume = cast<CallInst>(EHResume->use_back());
+ BasicBlock *BB = Resume->getParent();
+
+ IRBuilder<> Builder(Context);
+ Builder.SetInsertPoint(BB, Resume);
+
+ Value *LPadVal =
+ Builder.CreateInsertValue(UndefValue::get(LPadSlotTy),
+ Resume->getArgOperand(0), 0, "lpad.val");
+ LPadVal = Builder.CreateInsertValue(LPadVal, Resume->getArgOperand(1),
+ 1, "lpad.val");
+ Builder.CreateResume(LPadVal);
+
+ // Remove all instructions after the 'resume'.
+ BasicBlock::iterator I = Resume;
+ while (I != BB->end()) {
+ Instruction *Inst = &*I++;
+ Inst->eraseFromParent();
+ }
+ }
+}
diff --git a/lib/VMCore/BasicBlock.cpp b/lib/VMCore/BasicBlock.cpp
index 70265c8..d0aa275 100644
--- a/lib/VMCore/BasicBlock.cpp
+++ b/lib/VMCore/BasicBlock.cpp
@@ -53,7 +53,7 @@ BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
} else if (NewParent) {
NewParent->getBasicBlockList().push_back(this);
}
-
+
setName(Name);
}
@@ -76,7 +76,7 @@ BasicBlock::~BasicBlock() {
BA->destroyConstant();
}
}
-
+
assert(getParent() == 0 && "BasicBlock still linked into the program!");
dropAllReferences();
InstList.clear();
@@ -167,6 +167,12 @@ Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
return &*i;
}
+BasicBlock::iterator BasicBlock::getFirstInsertionPt() {
+ iterator InsertPt = getFirstNonPHI();
+ if (isa<LandingPadInst>(InsertPt)) ++InsertPt;
+ return InsertPt;
+}
+
void BasicBlock::dropAllReferences() {
for(iterator I = begin(), E = end(); I != E; ++I)
I->dropAllReferences();
@@ -184,8 +190,8 @@ BasicBlock *BasicBlock::getSinglePredecessor() {
/// getUniquePredecessor - If this basic block has a unique predecessor block,
/// return the block, otherwise return a null pointer.
-/// Note that unique predecessor doesn't mean single edge, there can be
-/// multiple edges from the unique predecessor to this block (for example
+/// Note that unique predecessor doesn't mean single edge; there can be
+/// multiple edges from the unique predecessor to this block (for example
/// a switch statement with multiple cases having the same destination).
BasicBlock *BasicBlock::getUniquePredecessor() {
pred_iterator PI = pred_begin(this), E = pred_end(this);
@@ -336,11 +342,27 @@ void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) {
return;
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
BasicBlock *Succ = TI->getSuccessor(i);
- for (iterator II = Succ->begin(); PHINode *PN = dyn_cast<PHINode>(II);
- ++II) {
+ // N.B. Succ might not be a complete BasicBlock, so don't assume
+ // that it ends with a non-phi instruction.
+ for (iterator II = Succ->begin(), IE = Succ->end(); II != IE; ++II) {
+ PHINode *PN = dyn_cast<PHINode>(II);
+ if (!PN)
+ break;
int i;
while ((i = PN->getBasicBlockIndex(this)) >= 0)
PN->setIncomingBlock(i, New);
}
}
}
+
+/// isLandingPad - Return true if this basic block is a landing pad. I.e., it's
+/// the destination of the 'unwind' edge of an invoke instruction.
+bool BasicBlock::isLandingPad() const {
+ return isa<LandingPadInst>(getFirstNonPHI());
+}
+
+/// getLandingPadInst() - Return the landingpad instruction associated with
+/// the landing pad.
+LandingPadInst *BasicBlock::getLandingPadInst() {
+ return dyn_cast<LandingPadInst>(getFirstNonPHI());
+}
diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt
index f60dd06..0404297 100644
--- a/lib/VMCore/CMakeLists.txt
+++ b/lib/VMCore/CMakeLists.txt
@@ -8,10 +8,11 @@ add_llvm_library(LLVMCore
ConstantFold.cpp
Constants.cpp
Core.cpp
- DebugLoc.cpp
DebugInfoProbe.cpp
+ DebugLoc.cpp
Dominators.cpp
Function.cpp
+ GCOV.cpp
GVMaterializer.cpp
Globals.cpp
IRBuilder.cpp
@@ -36,3 +37,5 @@ add_llvm_library(LLVMCore
ValueTypes.cpp
Verifier.cpp
)
+
+add_llvm_library_dependencies(LLVMCore LLVMSupport)
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
index 323e2a2..30bae71 100644
--- a/lib/VMCore/ConstantFold.cpp
+++ b/lib/VMCore/ConstantFold.cpp
@@ -42,7 +42,7 @@ using namespace llvm;
/// specified vector type. At this point, we know that the elements of the
/// input vector constant are all simple integer or FP values.
static Constant *BitCastConstantVector(ConstantVector *CV,
- const VectorType *DstTy) {
+ VectorType *DstTy) {
if (CV->isAllOnesValue()) return Constant::getAllOnesValue(DstTy);
if (CV->isNullValue()) return Constant::getNullValue(DstTy);
@@ -63,7 +63,7 @@ static Constant *BitCastConstantVector(ConstantVector *CV,
// Bitcast each element now.
std::vector<Constant*> Result;
- const Type *DstEltTy = DstTy->getElementType();
+ Type *DstEltTy = DstTy->getElementType();
for (unsigned i = 0; i != NumElts; ++i)
Result.push_back(ConstantExpr::getBitCast(CV->getOperand(i),
DstEltTy));
@@ -78,15 +78,15 @@ static unsigned
foldConstantCastPair(
unsigned opc, ///< opcode of the second cast constant expression
ConstantExpr *Op, ///< the first cast constant expression
- const Type *DstTy ///< desintation type of the first cast
+ Type *DstTy ///< destination type of the first cast
) {
assert(Op && Op->isCast() && "Can't fold cast of cast without a cast!");
assert(DstTy && DstTy->isFirstClassType() && "Invalid cast destination type");
assert(CastInst::isCast(opc) && "Invalid cast opcode");
// The the types and opcodes for the two Cast constant expressions
- const Type *SrcTy = Op->getOperand(0)->getType();
- const Type *MidTy = Op->getType();
+ Type *SrcTy = Op->getOperand(0)->getType();
+ Type *MidTy = Op->getType();
Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode());
Instruction::CastOps secondOp = Instruction::CastOps(opc);
@@ -95,27 +95,27 @@ foldConstantCastPair(
Type::getInt64Ty(DstTy->getContext()));
}
-static Constant *FoldBitCast(Constant *V, const Type *DestTy) {
- const Type *SrcTy = V->getType();
+static Constant *FoldBitCast(Constant *V, Type *DestTy) {
+ Type *SrcTy = V->getType();
if (SrcTy == DestTy)
return V; // no-op cast
// Check to see if we are casting a pointer to an aggregate to a pointer to
// the first element. If so, return the appropriate GEP instruction.
- if (const PointerType *PTy = dyn_cast<PointerType>(V->getType()))
- if (const PointerType *DPTy = dyn_cast<PointerType>(DestTy))
+ if (PointerType *PTy = dyn_cast<PointerType>(V->getType()))
+ if (PointerType *DPTy = dyn_cast<PointerType>(DestTy))
if (PTy->getAddressSpace() == DPTy->getAddressSpace()) {
SmallVector<Value*, 8> IdxList;
Value *Zero =
Constant::getNullValue(Type::getInt32Ty(DPTy->getContext()));
IdxList.push_back(Zero);
- const Type *ElTy = PTy->getElementType();
+ Type *ElTy = PTy->getElementType();
while (ElTy != DPTy->getElementType()) {
- if (const StructType *STy = dyn_cast<StructType>(ElTy)) {
+ if (StructType *STy = dyn_cast<StructType>(ElTy)) {
if (STy->getNumElements() == 0) break;
ElTy = STy->getElementType(0);
IdxList.push_back(Zero);
- } else if (const SequentialType *STy =
+ } else if (SequentialType *STy =
dyn_cast<SequentialType>(ElTy)) {
if (ElTy->isPointerTy()) break; // Can't index into pointers!
ElTy = STy->getElementType();
@@ -127,14 +127,13 @@ static Constant *FoldBitCast(Constant *V, const Type *DestTy) {
if (ElTy == DPTy->getElementType())
// This GEP is inbounds because all indices are zero.
- return ConstantExpr::getInBoundsGetElementPtr(V, &IdxList[0],
- IdxList.size());
+ return ConstantExpr::getInBoundsGetElementPtr(V, IdxList);
}
// Handle casts from one vector constant to another. We know that the src
// and dest type have the same size (otherwise its an illegal cast).
- if (const VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
- if (const VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
+ if (VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
+ if (VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
assert(DestPTy->getBitWidth() == SrcTy->getBitWidth() &&
"Not cast between same sized vectors!");
SrcTy = NULL;
@@ -332,15 +331,15 @@ static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
/// return null if no factoring was possible, to avoid endlessly
/// bouncing an unfoldable expression back into the top-level folder.
///
-static Constant *getFoldedSizeOf(const Type *Ty, const Type *DestTy,
+static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy,
bool Folded) {
- if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Constant *N = ConstantInt::get(DestTy, ATy->getNumElements());
Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true);
return ConstantExpr::getNUWMul(E, N);
}
- if (const StructType *STy = dyn_cast<StructType>(Ty))
+ if (StructType *STy = dyn_cast<StructType>(Ty))
if (!STy->isPacked()) {
unsigned NumElems = STy->getNumElements();
// An empty struct has size zero.
@@ -364,7 +363,7 @@ static Constant *getFoldedSizeOf(const Type *Ty, const Type *DestTy,
// Pointer size doesn't depend on the pointee type, so canonicalize them
// to an arbitrary pointee.
- if (const PointerType *PTy = dyn_cast<PointerType>(Ty))
+ if (PointerType *PTy = dyn_cast<PointerType>(Ty))
if (!PTy->getElementType()->isIntegerTy(1))
return
getFoldedSizeOf(PointerType::get(IntegerType::get(PTy->getContext(), 1),
@@ -389,11 +388,11 @@ static Constant *getFoldedSizeOf(const Type *Ty, const Type *DestTy,
/// return null if no factoring was possible, to avoid endlessly
/// bouncing an unfoldable expression back into the top-level folder.
///
-static Constant *getFoldedAlignOf(const Type *Ty, const Type *DestTy,
+static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy,
bool Folded) {
// The alignment of an array is equal to the alignment of the
// array element. Note that this is not always true for vectors.
- if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Constant *C = ConstantExpr::getAlignOf(ATy->getElementType());
C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
DestTy,
@@ -402,7 +401,7 @@ static Constant *getFoldedAlignOf(const Type *Ty, const Type *DestTy,
return C;
}
- if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
// Packed structs always have an alignment of 1.
if (STy->isPacked())
return ConstantInt::get(DestTy, 1);
@@ -429,7 +428,7 @@ static Constant *getFoldedAlignOf(const Type *Ty, const Type *DestTy,
// Pointer alignment doesn't depend on the pointee type, so canonicalize them
// to an arbitrary pointee.
- if (const PointerType *PTy = dyn_cast<PointerType>(Ty))
+ if (PointerType *PTy = dyn_cast<PointerType>(Ty))
if (!PTy->getElementType()->isIntegerTy(1))
return
getFoldedAlignOf(PointerType::get(IntegerType::get(PTy->getContext(),
@@ -455,10 +454,10 @@ static Constant *getFoldedAlignOf(const Type *Ty, const Type *DestTy,
/// return null if no factoring was possible, to avoid endlessly
/// bouncing an unfoldable expression back into the top-level folder.
///
-static Constant *getFoldedOffsetOf(const Type *Ty, Constant *FieldNo,
- const Type *DestTy,
+static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo,
+ Type *DestTy,
bool Folded) {
- if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo, false,
DestTy, false),
FieldNo, DestTy);
@@ -466,7 +465,7 @@ static Constant *getFoldedOffsetOf(const Type *Ty, Constant *FieldNo,
return ConstantExpr::getNUWMul(E, N);
}
- if (const StructType *STy = dyn_cast<StructType>(Ty))
+ if (StructType *STy = dyn_cast<StructType>(Ty))
if (!STy->isPacked()) {
unsigned NumElems = STy->getNumElements();
// An empty struct has no members.
@@ -506,7 +505,7 @@ static Constant *getFoldedOffsetOf(const Type *Ty, Constant *FieldNo,
}
Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
- const Type *DestTy) {
+ Type *DestTy) {
if (isa<UndefValue>(V)) {
// zext(undef) = 0, because the top bits will be zero.
// sext(undef) = 0, because the top bits will all be the same.
@@ -554,8 +553,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
cast<VectorType>(DestTy)->getNumElements() ==
CV->getType()->getNumElements()) {
std::vector<Constant*> res;
- const VectorType *DestVecTy = cast<VectorType>(DestTy);
- const Type *DstEltTy = DestVecTy->getElementType();
+ VectorType *DestVecTy = cast<VectorType>(DestTy);
+ Type *DstEltTy = DestVecTy->getElementType();
for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
res.push_back(ConstantExpr::getCast(opc,
CV->getOperand(i), DstEltTy));
@@ -590,7 +589,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
(void) V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
APFloat::rmTowardZero, &ignored);
- APInt Val(DestBitWidth, 2, x);
+ APInt Val(DestBitWidth, x);
return ConstantInt::get(FPC->getContext(), Val);
}
return 0; // Can't fold.
@@ -608,7 +607,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
if (CE->getOpcode() == Instruction::GetElementPtr &&
CE->getOperand(0)->isNullValue()) {
- const Type *Ty =
+ Type *Ty =
cast<PointerType>(CE->getOperand(0)->getType())->getElementType();
if (CE->getNumOperands() == 2) {
// Handle a sizeof-like expression.
@@ -623,7 +622,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
} else if (CE->getNumOperands() == 3 &&
CE->getOperand(1)->isNullValue()) {
// Handle an alignof-like expression.
- if (const StructType *STy = dyn_cast<StructType>(Ty))
+ if (StructType *STy = dyn_cast<StructType>(Ty))
if (!STy->isPacked()) {
ConstantInt *CI = cast<ConstantInt>(CE->getOperand(2));
if (CI->isOne() &&
@@ -701,7 +700,7 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
if (CondV->isAllOnesValue()) return V1;
- const VectorType *VTy = cast<VectorType>(V1->getType());
+ VectorType *VTy = cast<VectorType>(V1->getType());
ConstantVector *CP1 = dyn_cast<ConstantVector>(V1);
ConstantVector *CP2 = dyn_cast<ConstantVector>(V2);
@@ -709,7 +708,7 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
(CP2 || isa<ConstantAggregateZero>(V2))) {
// Find the element type of the returned vector
- const Type *EltTy = VTy->getElementType();
+ Type *EltTy = VTy->getElementType();
unsigned NumElem = VTy->getNumElements();
std::vector<Constant*> Res(NumElem);
@@ -762,10 +761,14 @@ Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val,
if (ConstantVector *CVal = dyn_cast<ConstantVector>(Val)) {
if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx)) {
+ uint64_t Index = CIdx->getZExtValue();
+ if (Index >= CVal->getNumOperands())
+ // ee({w,x,y,z}, wrong_value) -> undef
+ return UndefValue::get(cast<VectorType>(Val->getType())->getElementType());
return CVal->getOperand(CIdx->getZExtValue());
} else if (isa<UndefValue>(Idx)) {
- // ee({w,x,y,z}, undef) -> w (an arbitrary value).
- return CVal->getOperand(0);
+ // ee({w,x,y,z}, undef) -> undef
+ return UndefValue::get(cast<VectorType>(Val->getType())->getElementType());
}
}
return 0;
@@ -834,7 +837,7 @@ static Constant *GetVectorElement(Constant *C, unsigned EltNo) {
if (ConstantVector *CV = dyn_cast<ConstantVector>(C))
return CV->getOperand(EltNo);
- const Type *EltTy = cast<VectorType>(C->getType())->getElementType();
+ Type *EltTy = cast<VectorType>(C->getType())->getElementType();
if (isa<ConstantAggregateZero>(C))
return Constant::getNullValue(EltTy);
if (isa<UndefValue>(C))
@@ -850,7 +853,7 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1,
unsigned MaskNumElts = cast<VectorType>(Mask->getType())->getNumElements();
unsigned SrcNumElts = cast<VectorType>(V1->getType())->getNumElements();
- const Type *EltTy = cast<VectorType>(V1->getType())->getElementType();
+ Type *EltTy = cast<VectorType>(V1->getType())->getElementType();
// Loop over the shuffle mask, evaluating each element.
SmallVector<Constant*, 32> Result;
@@ -922,16 +925,16 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
// Otherwise break the aggregate undef into multiple undefs and do
// the insertion.
- const CompositeType *AggTy = cast<CompositeType>(Agg->getType());
+ CompositeType *AggTy = cast<CompositeType>(Agg->getType());
unsigned numOps;
- if (const ArrayType *AR = dyn_cast<ArrayType>(AggTy))
+ if (ArrayType *AR = dyn_cast<ArrayType>(AggTy))
numOps = AR->getNumElements();
else
numOps = cast<StructType>(AggTy)->getNumElements();
std::vector<Constant*> Ops(numOps);
for (unsigned i = 0; i < numOps; ++i) {
- const Type *MemberTy = AggTy->getTypeAtIndex(i);
+ Type *MemberTy = AggTy->getTypeAtIndex(i);
Constant *Op =
(Idxs[0] == i) ?
ConstantFoldInsertValueInstruction(UndefValue::get(MemberTy),
@@ -940,7 +943,7 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
Ops[i] = Op;
}
- if (const StructType* ST = dyn_cast<StructType>(AggTy))
+ if (StructType* ST = dyn_cast<StructType>(AggTy))
return ConstantStruct::get(ST, Ops);
return ConstantArray::get(cast<ArrayType>(AggTy), Ops);
}
@@ -953,16 +956,16 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
// Otherwise break the aggregate zero into multiple zeros and do
// the insertion.
- const CompositeType *AggTy = cast<CompositeType>(Agg->getType());
+ CompositeType *AggTy = cast<CompositeType>(Agg->getType());
unsigned numOps;
- if (const ArrayType *AR = dyn_cast<ArrayType>(AggTy))
+ if (ArrayType *AR = dyn_cast<ArrayType>(AggTy))
numOps = AR->getNumElements();
else
numOps = cast<StructType>(AggTy)->getNumElements();
std::vector<Constant*> Ops(numOps);
for (unsigned i = 0; i < numOps; ++i) {
- const Type *MemberTy = AggTy->getTypeAtIndex(i);
+ Type *MemberTy = AggTy->getTypeAtIndex(i);
Constant *Op =
(Idxs[0] == i) ?
ConstantFoldInsertValueInstruction(Constant::getNullValue(MemberTy),
@@ -971,7 +974,7 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
Ops[i] = Op;
}
- if (const StructType *ST = dyn_cast<StructType>(AggTy))
+ if (StructType *ST = dyn_cast<StructType>(AggTy))
return ConstantStruct::get(ST, Ops);
return ConstantArray::get(cast<ArrayType>(AggTy), Ops);
}
@@ -986,7 +989,7 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
Ops[i] = Op;
}
- if (const StructType* ST = dyn_cast<StructType>(Agg->getType()))
+ if (StructType* ST = dyn_cast<StructType>(Agg->getType()))
return ConstantStruct::get(ST, Ops);
return ConstantArray::get(cast<ArrayType>(Agg->getType()), Ops);
}
@@ -1265,13 +1268,13 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
return ConstantFP::get(C1->getContext(), C3V);
}
}
- } else if (const VectorType *VTy = dyn_cast<VectorType>(C1->getType())) {
+ } else if (VectorType *VTy = dyn_cast<VectorType>(C1->getType())) {
ConstantVector *CP1 = dyn_cast<ConstantVector>(C1);
ConstantVector *CP2 = dyn_cast<ConstantVector>(C2);
if ((CP1 != NULL || isa<ConstantAggregateZero>(C1)) &&
(CP2 != NULL || isa<ConstantAggregateZero>(C2))) {
std::vector<Constant*> Res;
- const Type* EltTy = VTy->getElementType();
+ Type* EltTy = VTy->getElementType();
Constant *C1 = 0;
Constant *C2 = 0;
switch (Opcode) {
@@ -1461,8 +1464,8 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
/// isMaybeZeroSizedType - This type is possibly zero-sized if it's an array
/// or structure of zero-sized types. The only leaf zero-sized type is an
/// empty structure; an opaque struct might be zero-sized, hence "maybe".
-static bool isMaybeZeroSizedType(const Type *Ty) {
- if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+static bool isMaybeZeroSizedType(Type *Ty) {
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
if (STy->isOpaque()) return true; // Can't say.
// If all of elements have zero size, this does too.
@@ -1470,7 +1473,7 @@ static bool isMaybeZeroSizedType(const Type *Ty) {
if (!isMaybeZeroSizedType(STy->getElementType(i))) return false;
return true;
- } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
return isMaybeZeroSizedType(ATy->getElementType());
}
return false;
@@ -1483,7 +1486,7 @@ static bool isMaybeZeroSizedType(const Type *Ty) {
/// first is less than the second, return -1; if the second is less than the
/// first, return 1. If the constants are not integral, return -2.
///
-static int IdxCompare(Constant *C1, Constant *C2, const Type *ElTy) {
+static int IdxCompare(Constant *C1, Constant *C2, Type *ElTy) {
if (C1 == C2) return 0;
// Ok, we found a different index. If they are not ConstantInt, we can't do
@@ -1832,8 +1835,8 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
Constant *C1, Constant *C2) {
- const Type *ResultTy;
- if (const VectorType *VT = dyn_cast<VectorType>(C1->getType()))
+ Type *ResultTy;
+ if (VectorType *VT = dyn_cast<VectorType>(C1->getType()))
ResultTy = VectorType::get(Type::getInt1Ty(C1->getContext()),
VT->getNumElements());
else
@@ -2146,9 +2149,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
/// isInBoundsIndices - Test whether the given sequence of *normalized* indices
/// is "inbounds".
template<typename IndexTy>
-static bool isInBoundsIndices(IndexTy const *Idxs, size_t NumIdx) {
+static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
// No indices means nothing that could be out of bounds.
- if (NumIdx == 0) return true;
+ if (Idxs.empty()) return true;
// If the first index is zero, it's in bounds.
if (cast<Constant>(Idxs[0])->isNullValue()) return true;
@@ -2157,7 +2160,7 @@ static bool isInBoundsIndices(IndexTy const *Idxs, size_t NumIdx) {
// by the one-past-the-end rule.
if (!cast<ConstantInt>(Idxs[0])->isOne())
return false;
- for (unsigned i = 1, e = NumIdx; i != e; ++i)
+ for (unsigned i = 1, e = Idxs.size(); i != e; ++i)
if (!cast<Constant>(Idxs[i])->isNullValue())
return false;
return true;
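
These hunks are part of a broader migration from pointer-plus-length parameters to ArrayRef, which carries both in one value. A minimal sketch of the conversions an ArrayRef parameter accepts; the helper below is hypothetical, not part of LLVM:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    // Hypothetical helper, mirroring the new isInBoundsIndices-style signature.
    static unsigned countOnes(ArrayRef<int> Vals) {
      unsigned N = 0;
      for (unsigned i = 0, e = Vals.size(); i != e; ++i)
        if (Vals[i] == 1) ++N;
      return N;
    }

    static void arrayRefDemo() {
      int A[3] = { 1, 0, 1 };
      SmallVector<int, 4> V(A, A + 3);
      countOnes(A);                  // a C array converts implicitly
      countOnes(V);                  // so does a SmallVector
      countOnes(makeArrayRef(A, 2)); // or wrap an explicit pointer+length
    }
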
@@ -2166,31 +2169,29 @@ static bool isInBoundsIndices(IndexTy const *Idxs, size_t NumIdx) {
template<typename IndexTy>
static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
bool inBounds,
- IndexTy const *Idxs,
- unsigned NumIdx) {
- if (NumIdx == 0) return C;
+ ArrayRef<IndexTy> Idxs) {
+ if (Idxs.empty()) return C;
Constant *Idx0 = cast<Constant>(Idxs[0]);
- if ((NumIdx == 1 && Idx0->isNullValue()))
+ if ((Idxs.size() == 1 && Idx0->isNullValue()))
return C;
if (isa<UndefValue>(C)) {
- const PointerType *Ptr = cast<PointerType>(C->getType());
- const Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs, Idxs+NumIdx);
+ PointerType *Ptr = cast<PointerType>(C->getType());
+ Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs);
assert(Ty != 0 && "Invalid indices for GEP!");
return UndefValue::get(PointerType::get(Ty, Ptr->getAddressSpace()));
}
if (C->isNullValue()) {
bool isNull = true;
- for (unsigned i = 0, e = NumIdx; i != e; ++i)
+ for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
if (!cast<Constant>(Idxs[i])->isNullValue()) {
isNull = false;
break;
}
if (isNull) {
- const PointerType *Ptr = cast<PointerType>(C->getType());
- const Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs,
- Idxs+NumIdx);
+ PointerType *Ptr = cast<PointerType>(C->getType());
+ Type *Ty = GetElementPtrInst::getIndexedType(Ptr, Idxs);
assert(Ty != 0 && "Invalid indices for GEP!");
return ConstantPointerNull::get(PointerType::get(Ty,
Ptr->getAddressSpace()));
@@ -2203,14 +2204,14 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
// getelementptr instructions into a single instruction.
//
if (CE->getOpcode() == Instruction::GetElementPtr) {
- const Type *LastTy = 0;
+ Type *LastTy = 0;
for (gep_type_iterator I = gep_type_begin(CE), E = gep_type_end(CE);
I != E; ++I)
LastTy = *I;
if ((LastTy && LastTy->isArrayTy()) || Idx0->isNullValue()) {
SmallVector<Value*, 16> NewIndices;
- NewIndices.reserve(NumIdx + CE->getNumOperands());
+ NewIndices.reserve(Idxs.size() + CE->getNumOperands());
for (unsigned i = 1, e = CE->getNumOperands()-1; i != e; ++i)
NewIndices.push_back(CE->getOperand(i));
@@ -2219,9 +2220,9 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
Constant *Combined = CE->getOperand(CE->getNumOperands()-1);
// Otherwise it must be an array.
if (!Idx0->isNullValue()) {
- const Type *IdxTy = Combined->getType();
+ Type *IdxTy = Combined->getType();
if (IdxTy != Idx0->getType()) {
- const Type *Int64Ty = Type::getInt64Ty(IdxTy->getContext());
+ Type *Int64Ty = Type::getInt64Ty(IdxTy->getContext());
Constant *C1 = ConstantExpr::getSExtOrBitCast(Idx0, Int64Ty);
Constant *C2 = ConstantExpr::getSExtOrBitCast(Combined, Int64Ty);
Combined = ConstantExpr::get(Instruction::Add, C1, C2);
@@ -2232,14 +2233,11 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
}
NewIndices.push_back(Combined);
- NewIndices.append(Idxs+1, Idxs+NumIdx);
- return (inBounds && cast<GEPOperator>(CE)->isInBounds()) ?
- ConstantExpr::getInBoundsGetElementPtr(CE->getOperand(0),
- &NewIndices[0],
- NewIndices.size()) :
- ConstantExpr::getGetElementPtr(CE->getOperand(0),
- &NewIndices[0],
- NewIndices.size());
+ NewIndices.append(Idxs.begin() + 1, Idxs.end());
+ return
+ ConstantExpr::getGetElementPtr(CE->getOperand(0), NewIndices,
+ inBounds &&
+ cast<GEPOperator>(CE)->isInBounds());
}
}
@@ -2248,18 +2246,16 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
// i64 0, i64 0)
// To: i32* getelementptr ([3 x i32]* %X, i64 0, i64 0)
//
- if (CE->isCast() && NumIdx > 1 && Idx0->isNullValue()) {
- if (const PointerType *SPT =
+ if (CE->isCast() && Idxs.size() > 1 && Idx0->isNullValue()) {
+ if (PointerType *SPT =
dyn_cast<PointerType>(CE->getOperand(0)->getType()))
- if (const ArrayType *SAT = dyn_cast<ArrayType>(SPT->getElementType()))
- if (const ArrayType *CAT =
+ if (ArrayType *SAT = dyn_cast<ArrayType>(SPT->getElementType()))
+ if (ArrayType *CAT =
dyn_cast<ArrayType>(cast<PointerType>(C->getType())->getElementType()))
if (CAT->getElementType() == SAT->getElementType())
- return inBounds ?
- ConstantExpr::getInBoundsGetElementPtr(
- (Constant*)CE->getOperand(0), Idxs, NumIdx) :
- ConstantExpr::getGetElementPtr(
- (Constant*)CE->getOperand(0), Idxs, NumIdx);
+ return
+ ConstantExpr::getGetElementPtr((Constant*)CE->getOperand(0),
+ Idxs, inBounds);
}
}
@@ -2268,19 +2264,19 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
// out into preceding dimensions.
bool Unknown = false;
SmallVector<Constant *, 8> NewIdxs;
- const Type *Ty = C->getType();
- const Type *Prev = 0;
- for (unsigned i = 0; i != NumIdx;
+ Type *Ty = C->getType();
+ Type *Prev = 0;
+ for (unsigned i = 0, e = Idxs.size(); i != e;
Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Idxs[i])) {
- if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
if (ATy->getNumElements() <= INT64_MAX &&
ATy->getNumElements() != 0 &&
CI->getSExtValue() >= (int64_t)ATy->getNumElements()) {
if (isa<SequentialType>(Prev)) {
// It's out of range, but we can factor it into the prior
// dimension.
- NewIdxs.resize(NumIdx);
+ NewIdxs.resize(Idxs.size());
ConstantInt *Factor = ConstantInt::get(CI->getType(),
ATy->getNumElements());
NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
@@ -2312,33 +2308,28 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
// If we did any factoring, start over with the adjusted indices.
if (!NewIdxs.empty()) {
- for (unsigned i = 0; i != NumIdx; ++i)
+ for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
if (!NewIdxs[i]) NewIdxs[i] = cast<Constant>(Idxs[i]);
- return inBounds ?
- ConstantExpr::getInBoundsGetElementPtr(C, NewIdxs.data(),
- NewIdxs.size()) :
- ConstantExpr::getGetElementPtr(C, NewIdxs.data(), NewIdxs.size());
+ return ConstantExpr::getGetElementPtr(C, NewIdxs, inBounds);
}
// If all indices are known integers and normalized, we can do a simple
// check for the "inbounds" property.
if (!Unknown && !inBounds &&
- isa<GlobalVariable>(C) && isInBoundsIndices(Idxs, NumIdx))
- return ConstantExpr::getInBoundsGetElementPtr(C, Idxs, NumIdx);
+ isa<GlobalVariable>(C) && isInBoundsIndices(Idxs))
+ return ConstantExpr::getInBoundsGetElementPtr(C, Idxs);
return 0;
}
Constant *llvm::ConstantFoldGetElementPtr(Constant *C,
bool inBounds,
- Constant* const *Idxs,
- unsigned NumIdx) {
- return ConstantFoldGetElementPtrImpl(C, inBounds, Idxs, NumIdx);
+ ArrayRef<Constant *> Idxs) {
+ return ConstantFoldGetElementPtrImpl(C, inBounds, Idxs);
}
Constant *llvm::ConstantFoldGetElementPtr(Constant *C,
bool inBounds,
- Value* const *Idxs,
- unsigned NumIdx) {
- return ConstantFoldGetElementPtrImpl(C, inBounds, Idxs, NumIdx);
+ ArrayRef<Value *> Idxs) {
+ return ConstantFoldGetElementPtrImpl(C, inBounds, Idxs);
}
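
A hedged sketch of the index normalization implemented above: an out-of-range constant index is factored into the preceding array dimension. It assumes the LLVM 3.0 API; the global and the index values are illustrative:

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    #include "llvm/GlobalVariable.h"
    #include "llvm/Module.h"
    using namespace llvm;

    static Constant *gepDemo(Module &M) {
      LLVMContext &Ctx = M.getContext();
      Type *I32 = Type::getInt32Ty(Ctx);
      Type *I64 = Type::getInt64Ty(Ctx);
      Type *Arr = ArrayType::get(ArrayType::get(I32, 3), 2); // [2 x [3 x i32]]
      GlobalVariable *G = new GlobalVariable(M, Arr, /*isConstant=*/false,
                                             GlobalValue::ExternalLinkage,
                                             /*Initializer=*/0, "g");
      Constant *Idxs[] = { ConstantInt::get(I64, 0), ConstantInt::get(I64, 0),
                           ConstantInt::get(I64, 4) };
      // Index 4 overruns the inner [3 x i32]; the folder factors it into the
      // outer dimension, yielding gep @g, 0, 1, 1.
      return ConstantExpr::getGetElementPtr(G, Idxs);
    }
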
diff --git a/lib/VMCore/ConstantFold.h b/lib/VMCore/ConstantFold.h
index 653a1c3..e12f27a 100644
--- a/lib/VMCore/ConstantFold.h
+++ b/lib/VMCore/ConstantFold.h
@@ -30,7 +30,7 @@ namespace llvm {
Constant *ConstantFoldCastInstruction(
unsigned opcode, ///< The opcode of the cast
Constant *V, ///< The source constant
- const Type *DestTy ///< The destination type
+ Type *DestTy ///< The destination type
);
Constant *ConstantFoldSelectInstruction(Constant *Cond,
Constant *V1, Constant *V2);
@@ -48,9 +48,9 @@ namespace llvm {
Constant *ConstantFoldCompareInstruction(unsigned short predicate,
Constant *C1, Constant *C2);
Constant *ConstantFoldGetElementPtr(Constant *C, bool inBounds,
- Constant* const *Idxs, unsigned NumIdx);
+ ArrayRef<Constant *> Idxs);
Constant *ConstantFoldGetElementPtr(Constant *C, bool inBounds,
- Value* const *Idxs, unsigned NumIdx);
+ ArrayRef<Value *> Idxs);
} // End llvm namespace
#endif
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
index 316c884..a84a046 100644
--- a/lib/VMCore/Constants.cpp
+++ b/lib/VMCore/Constants.cpp
@@ -62,8 +62,23 @@ bool Constant::isNullValue() const {
return isa<ConstantAggregateZero>(this) || isa<ConstantPointerNull>(this);
}
+bool Constant::isAllOnesValue() const {
+ // Check for -1 integers
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
+ return CI->isMinusOne();
+
+ // Check for FP which are bitcasted from -1 integers
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
+ return CFP->getValueAPF().bitcastToAPInt().isAllOnesValue();
+
+ // Check for constant vectors
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
+ return CV->isAllOnesValue();
+
+ return false;
+}
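
A usage sketch for the new Constant::isAllOnesValue(), which covers -1 integers, FP values whose bit pattern is all ones, and splat vectors; LLVM 3.0 headers assumed, values illustrative:

    #include "llvm/Constants.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    static void allOnesDemo(LLVMContext &Ctx) {
      Type *I8 = Type::getInt8Ty(Ctx);
      Constant *MinusOne = ConstantInt::get(I8, -1, /*isSigned=*/true);
      MinusOne->isAllOnesValue();            // true: i8 -1 is all ones

      SmallVector<Constant *, 4> Splat(4, MinusOne);
      Constant *Vec = ConstantVector::get(Splat);
      Vec->isAllOnesValue();                 // true: <4 x i8> splat of -1
    }
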
// Constructor to create a '0' constant of arbitrary type...
-Constant *Constant::getNullValue(const Type *Ty) {
+Constant *Constant::getNullValue(Type *Ty) {
switch (Ty->getTypeID()) {
case Type::IntegerTyID:
return ConstantInt::get(Ty, 0);
@@ -90,30 +105,30 @@ Constant *Constant::getNullValue(const Type *Ty) {
return ConstantAggregateZero::get(Ty);
default:
// Function, Label, or Opaque type?
- assert(!"Cannot create a null constant of that type!");
+ assert(0 && "Cannot create a null constant of that type!");
return 0;
}
}
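
For reference, a short sketch of what getNullValue hands back per type kind (the types are illustrative):

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    using namespace llvm;

    static void nullDemo(LLVMContext &Ctx) {
      Type *I32 = Type::getInt32Ty(Ctx);
      Constant::getNullValue(I32);                         // i32 0
      Constant::getNullValue(Type::getDoubleTy(Ctx));      // double 0.0
      Constant::getNullValue(PointerType::getUnqual(I32)); // i32* null
      Constant::getNullValue(VectorType::get(I32, 4));     // zeroinitializer
    }
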
-Constant *Constant::getIntegerValue(const Type *Ty, const APInt &V) {
- const Type *ScalarTy = Ty->getScalarType();
+Constant *Constant::getIntegerValue(Type *Ty, const APInt &V) {
+ Type *ScalarTy = Ty->getScalarType();
// Create the base integer constant.
Constant *C = ConstantInt::get(Ty->getContext(), V);
// Convert an integer to a pointer, if necessary.
- if (const PointerType *PTy = dyn_cast<PointerType>(ScalarTy))
+ if (PointerType *PTy = dyn_cast<PointerType>(ScalarTy))
C = ConstantExpr::getIntToPtr(C, PTy);
// Broadcast a scalar to a vector, if necessary.
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty))
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty))
C = ConstantVector::get(std::vector<Constant *>(VTy->getNumElements(), C));
return C;
}
-Constant *Constant::getAllOnesValue(const Type *Ty) {
- if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+Constant *Constant::getAllOnesValue(Type *Ty) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
return ConstantInt::get(Ty->getContext(),
APInt::getAllOnesValue(ITy->getBitWidth()));
@@ -124,9 +139,9 @@ Constant *Constant::getAllOnesValue(const Type *Ty) {
}
SmallVector<Constant*, 16> Elts;
- const VectorType *VTy = cast<VectorType>(Ty);
+ VectorType *VTy = cast<VectorType>(Ty);
Elts.resize(VTy->getNumElements(), getAllOnesValue(VTy->getElementType()));
- assert(Elts[0] && "Not a vector integer type!");
+ assert(Elts[0] && "Invalid AllOnes value!");
return cast<ConstantVector>(ConstantVector::get(Elts));
}
@@ -269,7 +284,7 @@ void Constant::getVectorElements(SmallVectorImpl<Constant*> &Elts) const {
return;
}
- const VectorType *VT = cast<VectorType>(getType());
+ VectorType *VT = cast<VectorType>(getType());
if (isa<ConstantAggregateZero>(this)) {
Elts.assign(VT->getNumElements(),
Constant::getNullValue(VT->getElementType()));
@@ -343,7 +358,7 @@ void Constant::removeDeadConstantUsers() const {
// ConstantInt
//===----------------------------------------------------------------------===//
-ConstantInt::ConstantInt(const IntegerType *Ty, const APInt& V)
+ConstantInt::ConstantInt(IntegerType *Ty, const APInt& V)
: Constant(Ty, ConstantIntVal, 0, 0), Val(V) {
assert(V.getBitWidth() == Ty->getBitWidth() && "Invalid constant for type");
}
@@ -362,8 +377,8 @@ ConstantInt *ConstantInt::getFalse(LLVMContext &Context) {
return pImpl->TheFalseVal;
}
-Constant *ConstantInt::getTrue(const Type *Ty) {
- const VectorType *VTy = dyn_cast<VectorType>(Ty);
+Constant *ConstantInt::getTrue(Type *Ty) {
+ VectorType *VTy = dyn_cast<VectorType>(Ty);
if (!VTy) {
assert(Ty->isIntegerTy(1) && "True must be i1 or vector of i1.");
return ConstantInt::getTrue(Ty->getContext());
@@ -375,8 +390,8 @@ Constant *ConstantInt::getTrue(const Type *Ty) {
return ConstantVector::get(Splat);
}
-Constant *ConstantInt::getFalse(const Type *Ty) {
- const VectorType *VTy = dyn_cast<VectorType>(Ty);
+Constant *ConstantInt::getFalse(Type *Ty) {
+ VectorType *VTy = dyn_cast<VectorType>(Ty);
if (!VTy) {
assert(Ty->isIntegerTy(1) && "False must be i1 or vector of i1.");
return ConstantInt::getFalse(Ty->getContext());
@@ -396,7 +411,7 @@ Constant *ConstantInt::getFalse(const Type *Ty) {
// invariant which generates an assertion.
ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) {
// Get the corresponding integer type for the bit width of the value.
- const IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
+ IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
// get an existing value or the insertion position
DenseMapAPIntKeyInfo::KeyTy Key(V, ITy);
ConstantInt *&Slot = Context.pImpl->IntConstants[Key];
@@ -404,44 +419,44 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) {
return Slot;
}
-Constant *ConstantInt::get(const Type *Ty, uint64_t V, bool isSigned) {
+Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) {
Constant *C = get(cast<IntegerType>(Ty->getScalarType()), V, isSigned);
// For vectors, broadcast the value.
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty))
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty))
return ConstantVector::get(SmallVector<Constant*,
16>(VTy->getNumElements(), C));
return C;
}
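
A sketch of the broadcast behavior preserved here: ConstantInt::get splats a scalar across vector types (type and value illustrative):

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    using namespace llvm;

    static Constant *splatDemo(LLVMContext &Ctx) {
      Type *V4I16 = VectorType::get(Type::getInt16Ty(Ctx), 4);
      return ConstantInt::get(V4I16, 7);  // <4 x i16> <7, 7, 7, 7>
    }
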
-ConstantInt* ConstantInt::get(const IntegerType* Ty, uint64_t V,
+ConstantInt* ConstantInt::get(IntegerType* Ty, uint64_t V,
bool isSigned) {
return get(Ty->getContext(), APInt(Ty->getBitWidth(), V, isSigned));
}
-ConstantInt* ConstantInt::getSigned(const IntegerType* Ty, int64_t V) {
+ConstantInt* ConstantInt::getSigned(IntegerType* Ty, int64_t V) {
return get(Ty, V, true);
}
-Constant *ConstantInt::getSigned(const Type *Ty, int64_t V) {
+Constant *ConstantInt::getSigned(Type *Ty, int64_t V) {
return get(Ty, V, true);
}
-Constant *ConstantInt::get(const Type* Ty, const APInt& V) {
+Constant *ConstantInt::get(Type* Ty, const APInt& V) {
ConstantInt *C = get(Ty->getContext(), V);
assert(C->getType() == Ty->getScalarType() &&
"ConstantInt type doesn't match the type implied by its value!");
// For vectors, broadcast the value.
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty))
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty))
return ConstantVector::get(
SmallVector<Constant *, 16>(VTy->getNumElements(), C));
return C;
}
-ConstantInt* ConstantInt::get(const IntegerType* Ty, StringRef Str,
+ConstantInt* ConstantInt::get(IntegerType* Ty, StringRef Str,
uint8_t radix) {
return get(Ty->getContext(), APInt(Ty->getBitWidth(), Str, radix));
}
@@ -450,7 +465,7 @@ ConstantInt* ConstantInt::get(const IntegerType* Ty, StringRef Str,
// ConstantFP
//===----------------------------------------------------------------------===//
-static const fltSemantics *TypeToFloatSemantics(const Type *Ty) {
+static const fltSemantics *TypeToFloatSemantics(Type *Ty) {
if (Ty->isFloatTy())
return &APFloat::IEEEsingle;
if (Ty->isDoubleTy())
@@ -467,7 +482,7 @@ static const fltSemantics *TypeToFloatSemantics(const Type *Ty) {
/// get() - This returns a constant fp for the specified value in the
/// specified type. This should only be used for simple constant values like
/// 2.0/1.0 etc, that are known-valid both as double and as the target format.
-Constant *ConstantFP::get(const Type* Ty, double V) {
+Constant *ConstantFP::get(Type* Ty, double V) {
LLVMContext &Context = Ty->getContext();
APFloat FV(V);
@@ -477,7 +492,7 @@ Constant *ConstantFP::get(const Type* Ty, double V) {
Constant *C = get(Context, FV);
// For vectors, broadcast the value.
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty))
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty))
return ConstantVector::get(
SmallVector<Constant *, 16>(VTy->getNumElements(), C));
@@ -485,14 +500,14 @@ Constant *ConstantFP::get(const Type* Ty, double V) {
}
-Constant *ConstantFP::get(const Type* Ty, StringRef Str) {
+Constant *ConstantFP::get(Type* Ty, StringRef Str) {
LLVMContext &Context = Ty->getContext();
APFloat FV(*TypeToFloatSemantics(Ty->getScalarType()), Str);
Constant *C = get(Context, FV);
// For vectors, broadcast the value.
- if (const VectorType *VTy = dyn_cast<VectorType>(Ty))
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty))
return ConstantVector::get(
SmallVector<Constant *, 16>(VTy->getNumElements(), C));
@@ -500,7 +515,7 @@ Constant *ConstantFP::get(const Type* Ty, StringRef Str) {
}
-ConstantFP* ConstantFP::getNegativeZero(const Type* Ty) {
+ConstantFP* ConstantFP::getNegativeZero(Type* Ty) {
LLVMContext &Context = Ty->getContext();
APFloat apf = cast<ConstantFP>(Constant::getNullValue(Ty))->getValueAPF();
apf.changeSign();
@@ -508,8 +523,8 @@ ConstantFP* ConstantFP::getNegativeZero(const Type* Ty) {
}
-Constant *ConstantFP::getZeroValueForNegation(const Type* Ty) {
- if (const VectorType *PTy = dyn_cast<VectorType>(Ty))
+Constant *ConstantFP::getZeroValueForNegation(Type* Ty) {
+ if (VectorType *PTy = dyn_cast<VectorType>(Ty))
if (PTy->getElementType()->isFloatingPointTy()) {
SmallVector<Constant*, 16> zeros(PTy->getNumElements(),
getNegativeZero(PTy->getElementType()));
@@ -532,7 +547,7 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
ConstantFP *&Slot = pImpl->FPConstants[Key];
if (!Slot) {
- const Type *Ty;
+ Type *Ty;
if (&V.getSemantics() == &APFloat::IEEEsingle)
Ty = Type::getFloatTy(Context);
else if (&V.getSemantics() == &APFloat::IEEEdouble)
@@ -552,13 +567,13 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
return Slot;
}
-ConstantFP *ConstantFP::getInfinity(const Type *Ty, bool Negative) {
+ConstantFP *ConstantFP::getInfinity(Type *Ty, bool Negative) {
const fltSemantics &Semantics = *TypeToFloatSemantics(Ty);
return ConstantFP::get(Ty->getContext(),
APFloat::getInf(Semantics, Negative));
}
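
A quick usage sketch of the ConstantFP helpers touched above (LLVM 3.0 API assumed):

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    using namespace llvm;

    static void fpDemo(LLVMContext &Ctx) {
      ConstantFP::get(Type::getFloatTy(Ctx), 2.0);          // float 2.0
      ConstantFP::getNegativeZero(Type::getDoubleTy(Ctx));  // double -0.0
      ConstantFP::getInfinity(Type::getFloatTy(Ctx), true); // float -inf
    }
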
-ConstantFP::ConstantFP(const Type *Ty, const APFloat& V)
+ConstantFP::ConstantFP(Type *Ty, const APFloat& V)
: Constant(Ty, ConstantFPVal, 0, 0), Val(V) {
assert(&V.getSemantics() == TypeToFloatSemantics(Ty) &&
"FP type Mismatch");
@@ -573,24 +588,19 @@ bool ConstantFP::isExactlyValue(const APFloat &V) const {
//===----------------------------------------------------------------------===//
-ConstantArray::ConstantArray(const ArrayType *T,
- const std::vector<Constant*> &V)
+ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V)
: Constant(T, ConstantArrayVal,
OperandTraits<ConstantArray>::op_end(this) - V.size(),
V.size()) {
assert(V.size() == T->getNumElements() &&
"Invalid initializer vector for constant array");
- Use *OL = OperandList;
- for (std::vector<Constant*>::const_iterator I = V.begin(), E = V.end();
- I != E; ++I, ++OL) {
- Constant *C = *I;
- assert(C->getType() == T->getElementType() &&
+ for (unsigned i = 0, e = V.size(); i != e; ++i)
+ assert(V[i]->getType() == T->getElementType() &&
"Initializer for array element doesn't match array element type!");
- *OL = C;
- }
+ std::copy(V.begin(), V.end(), op_begin());
}
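
Building a constant array through the ArrayRef-based API the rewritten constructor supports; a sketch with illustrative element values:

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    using namespace llvm;

    static Constant *arrayDemo(LLVMContext &Ctx) {
      Type *I32 = Type::getInt32Ty(Ctx);
      ArrayType *ATy = ArrayType::get(I32, 3);
      Constant *Vals[] = { ConstantInt::get(I32, 1), ConstantInt::get(I32, 2),
                           ConstantInt::get(I32, 3) };
      return ConstantArray::get(ATy, Vals); // [3 x i32] [i32 1, i32 2, i32 3]
    }
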
-Constant *ConstantArray::get(const ArrayType *Ty, ArrayRef<Constant*> V) {
+Constant *ConstantArray::get(ArrayType *Ty, ArrayRef<Constant*> V) {
for (unsigned i = 0, e = V.size(); i != e; ++i) {
assert(V[i]->getType() == Ty->getElementType() &&
"Wrong type in array element initializer");
@@ -653,25 +663,20 @@ StructType *ConstantStruct::getTypeForElements(ArrayRef<Constant*> V,
}
-ConstantStruct::ConstantStruct(const StructType *T,
- const std::vector<Constant*> &V)
+ConstantStruct::ConstantStruct(StructType *T, ArrayRef<Constant *> V)
: Constant(T, ConstantStructVal,
OperandTraits<ConstantStruct>::op_end(this) - V.size(),
V.size()) {
- assert((T->isOpaque() || V.size() == T->getNumElements()) &&
+ assert(V.size() == T->getNumElements() &&
"Invalid initializer vector for constant structure");
- Use *OL = OperandList;
- for (std::vector<Constant*>::const_iterator I = V.begin(), E = V.end();
- I != E; ++I, ++OL) {
- Constant *C = *I;
- assert((T->isOpaque() || C->getType() == T->getElementType(I-V.begin())) &&
+ for (unsigned i = 0, e = V.size(); i != e; ++i)
+ assert((T->isOpaque() || V[i]->getType() == T->getElementType(i)) &&
"Initializer for struct element doesn't match struct element type!");
- *OL = C;
- }
+ std::copy(V.begin(), V.end(), op_begin());
}
// ConstantStruct accessors.
-Constant *ConstantStruct::get(const StructType *ST, ArrayRef<Constant*> V) {
+Constant *ConstantStruct::get(StructType *ST, ArrayRef<Constant*> V) {
// Create a ConstantAggregateZero value if all elements are zeros.
for (unsigned i = 0, e = V.size(); i != e; ++i)
if (!V[i]->isNullValue())
@@ -682,7 +687,7 @@ Constant *ConstantStruct::get(const StructType *ST, ArrayRef<Constant*> V) {
return ConstantAggregateZero::get(ST);
}
-Constant* ConstantStruct::get(const StructType *T, ...) {
+Constant *ConstantStruct::get(StructType *T, ...) {
va_list ap;
SmallVector<Constant*, 8> Values;
va_start(ap, T);
@@ -692,25 +697,20 @@ Constant* ConstantStruct::get(const StructType *T, ...) {
return get(T, Values);
}
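
The varargs overload above reads constants until a null terminator. A sketch, with an illustrative struct shape:

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    using namespace llvm;

    static Constant *structDemo(LLVMContext &Ctx) {
      Type *I32 = Type::getInt32Ty(Ctx);
      Type *I8  = Type::getInt8Ty(Ctx);
      StructType *ST = StructType::get(I32, I8, NULL);  // {i32, i8}
      return ConstantStruct::get(ST, ConstantInt::get(I32, 1),
                                     ConstantInt::get(I8, 2), NULL);
    }
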
-ConstantVector::ConstantVector(const VectorType *T,
- const std::vector<Constant*> &V)
+ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
: Constant(T, ConstantVectorVal,
OperandTraits<ConstantVector>::op_end(this) - V.size(),
V.size()) {
- Use *OL = OperandList;
- for (std::vector<Constant*>::const_iterator I = V.begin(), E = V.end();
- I != E; ++I, ++OL) {
- Constant *C = *I;
- assert(C->getType() == T->getElementType() &&
+ for (size_t i = 0, e = V.size(); i != e; i++)
+ assert(V[i]->getType() == T->getElementType() &&
"Initializer for vector element doesn't match vector element type!");
- *OL = C;
- }
+ std::copy(V.begin(), V.end(), op_begin());
}
// ConstantVector accessors.
Constant *ConstantVector::get(ArrayRef<Constant*> V) {
assert(!V.empty() && "Vectors can't be empty");
- const VectorType *T = VectorType::get(V.front()->getType(), V.size());
+ VectorType *T = VectorType::get(V.front()->getType(), V.size());
LLVMContextImpl *pImpl = T->getContext().pImpl;
// If this is an all-undef or all-zero vector, return a
@@ -761,7 +761,7 @@ bool ConstantExpr::isGEPWithNoNotionalOverIndexing() const {
for (; GEPI != E; ++GEPI, ++OI) {
ConstantInt *CI = dyn_cast<ConstantInt>(*OI);
if (!CI) return false;
- if (const ArrayType *ATy = dyn_cast<ArrayType>(*GEPI))
+ if (ArrayType *ATy = dyn_cast<ArrayType>(*GEPI))
if (CI->getValue().getActiveBits() > 64 ||
CI->getZExtValue() >= ATy->getNumElements())
return false;
@@ -839,13 +839,13 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
for (unsigned i = 1, e = getNumOperands(); i != e; ++i)
Ops[i-1] = getOperand(i);
if (OpNo == 0)
- return cast<GEPOperator>(this)->isInBounds() ?
- ConstantExpr::getInBoundsGetElementPtr(Op, &Ops[0], Ops.size()) :
- ConstantExpr::getGetElementPtr(Op, &Ops[0], Ops.size());
+ return
+ ConstantExpr::getGetElementPtr(Op, Ops,
+ cast<GEPOperator>(this)->isInBounds());
Ops[OpNo-1] = Op;
- return cast<GEPOperator>(this)->isInBounds() ?
- ConstantExpr::getInBoundsGetElementPtr(getOperand(0), &Ops[0],Ops.size()):
- ConstantExpr::getGetElementPtr(getOperand(0), &Ops[0], Ops.size());
+ return
+ ConstantExpr::getGetElementPtr(getOperand(0), Ops,
+ cast<GEPOperator>(this)->isInBounds());
}
default:
assert(getNumOperands() == 2 && "Must be binary operator?");
@@ -859,7 +859,7 @@ ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
/// operands replaced with the specified values. The specified array must
/// have the same number of operands as our current one.
Constant *ConstantExpr::
-getWithOperands(ArrayRef<Constant*> Ops, const Type *Ty) const {
+getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const {
assert(Ops.size() == getNumOperands() && "Operand count mismatch!");
bool AnyChange = Ty != getType();
for (unsigned i = 0; i != Ops.size(); ++i)
@@ -891,9 +891,9 @@ getWithOperands(ArrayRef<Constant*> Ops, const Type *Ty) const {
case Instruction::ShuffleVector:
return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
case Instruction::GetElementPtr:
- return cast<GEPOperator>(this)->isInBounds() ?
- ConstantExpr::getInBoundsGetElementPtr(Ops[0], &Ops[1], Ops.size()-1) :
- ConstantExpr::getGetElementPtr(Ops[0], &Ops[1], Ops.size()-1);
+ return
+ ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1),
+ cast<GEPOperator>(this)->isInBounds());
case Instruction::ICmp:
case Instruction::FCmp:
return ConstantExpr::getCompare(getPredicate(), Ops[0], Ops[1]);
@@ -907,7 +907,7 @@ getWithOperands(ArrayRef<Constant*> Ops, const Type *Ty) const {
//===----------------------------------------------------------------------===//
// isValueValidForType implementations
-bool ConstantInt::isValueValidForType(const Type *Ty, uint64_t Val) {
+bool ConstantInt::isValueValidForType(Type *Ty, uint64_t Val) {
unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); // assert okay
if (Ty == Type::getInt1Ty(Ty->getContext()))
return Val == 0 || Val == 1;
@@ -917,7 +917,7 @@ bool ConstantInt::isValueValidForType(const Type *Ty, uint64_t Val) {
return Val <= Max;
}
-bool ConstantInt::isValueValidForType(const Type *Ty, int64_t Val) {
+bool ConstantInt::isValueValidForType(Type *Ty, int64_t Val) {
unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth(); // assert okay
if (Ty == Type::getInt1Ty(Ty->getContext()))
return Val == 0 || Val == 1 || Val == -1;
@@ -928,7 +928,7 @@ bool ConstantInt::isValueValidForType(const Type *Ty, int64_t Val) {
return (Val >= Min && Val <= Max);
}
-bool ConstantFP::isValueValidForType(const Type *Ty, const APFloat& Val) {
+bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) {
// convert modifies in place, so make a copy.
APFloat Val2 = APFloat(Val);
bool losesInfo;
@@ -968,7 +968,7 @@ bool ConstantFP::isValueValidForType(const Type *Ty, const APFloat& Val) {
//===----------------------------------------------------------------------===//
// Factory Function Implementation
-ConstantAggregateZero* ConstantAggregateZero::get(const Type* Ty) {
+ConstantAggregateZero* ConstantAggregateZero::get(Type* Ty) {
assert((Ty->isStructTy() || Ty->isArrayTy() || Ty->isVectorTy()) &&
"Cannot create an aggregate zero of non-aggregate type!");
@@ -1079,13 +1079,16 @@ bool ConstantVector::isAllOnesValue() const {
// Check out first element.
const Constant *Elt = getOperand(0);
const ConstantInt *CI = dyn_cast<ConstantInt>(Elt);
- if (!CI || !CI->isAllOnesValue()) return false;
+ const ConstantFP *CF = dyn_cast<ConstantFP>(Elt);
+
// Then make sure all remaining elements point to the same value.
for (unsigned I = 1, E = getNumOperands(); I < E; ++I)
if (getOperand(I) != Elt)
return false;
- return true;
+ // First value is all-ones.
+ return (CI && CI->isAllOnesValue()) ||
+ (CF && CF->isAllOnesValue());
}
/// getSplatValue - If this is a splat constant, where all of the
@@ -1103,7 +1106,7 @@ Constant *ConstantVector::getSplatValue() const {
//---- ConstantPointerNull::get() implementation.
//
-ConstantPointerNull *ConstantPointerNull::get(const PointerType *Ty) {
+ConstantPointerNull *ConstantPointerNull::get(PointerType *Ty) {
return Ty->getContext().pImpl->NullPtrConstants.getOrCreate(Ty, 0);
}
@@ -1118,7 +1121,7 @@ void ConstantPointerNull::destroyConstant() {
//---- UndefValue::get() implementation.
//
-UndefValue *UndefValue::get(const Type *Ty) {
+UndefValue *UndefValue::get(Type *Ty) {
return Ty->getContext().pImpl->UndefValueConstants.getOrCreate(Ty, 0);
}
@@ -1209,7 +1212,7 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) {
/// This is a utility function to handle folding of casts and lookup of the
/// cast in the ExprConstants map. It is used by the various get* methods below.
static inline Constant *getFoldedCast(
- Instruction::CastOps opc, Constant *C, const Type *Ty) {
+ Instruction::CastOps opc, Constant *C, Type *Ty) {
assert(Ty->isFirstClassType() && "Cannot cast to an aggregate type!");
// Fold a few common cases
if (Constant *FC = ConstantFoldCastInstruction(opc, C, Ty))
@@ -1224,7 +1227,7 @@ static inline Constant *getFoldedCast(
return pImpl->ExprConstants.getOrCreate(Ty, Key);
}
-Constant *ConstantExpr::getCast(unsigned oc, Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
Instruction::CastOps opc = Instruction::CastOps(oc);
assert(Instruction::isCast(opc) && "opcode out of range");
assert(C && Ty && "Null arguments to getCast");
@@ -1250,25 +1253,25 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, const Type *Ty) {
return 0;
}
-Constant *ConstantExpr::getZExtOrBitCast(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getZExtOrBitCast(Constant *C, Type *Ty) {
if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
return getBitCast(C, Ty);
return getZExt(C, Ty);
}
-Constant *ConstantExpr::getSExtOrBitCast(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getSExtOrBitCast(Constant *C, Type *Ty) {
if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
return getBitCast(C, Ty);
return getSExt(C, Ty);
}
-Constant *ConstantExpr::getTruncOrBitCast(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getTruncOrBitCast(Constant *C, Type *Ty) {
if (C->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
return getBitCast(C, Ty);
return getTrunc(C, Ty);
}
-Constant *ConstantExpr::getPointerCast(Constant *S, const Type *Ty) {
+Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) {
assert(S->getType()->isPointerTy() && "Invalid cast");
assert((Ty->isIntegerTy() || Ty->isPointerTy()) && "Invalid cast");
@@ -1277,7 +1280,7 @@ Constant *ConstantExpr::getPointerCast(Constant *S, const Type *Ty) {
return getBitCast(S, Ty);
}
-Constant *ConstantExpr::getIntegerCast(Constant *C, const Type *Ty,
+Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty,
bool isSigned) {
assert(C->getType()->isIntOrIntVectorTy() &&
Ty->isIntOrIntVectorTy() && "Invalid cast");
@@ -1290,7 +1293,7 @@ Constant *ConstantExpr::getIntegerCast(Constant *C, const Type *Ty,
return getCast(opcode, C, Ty);
}
-Constant *ConstantExpr::getFPCast(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getFPCast(Constant *C, Type *Ty) {
assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
"Invalid cast");
unsigned SrcBits = C->getType()->getScalarSizeInBits();
@@ -1302,7 +1305,7 @@ Constant *ConstantExpr::getFPCast(Constant *C, const Type *Ty) {
return getCast(opcode, C, Ty);
}
-Constant *ConstantExpr::getTrunc(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1316,7 +1319,7 @@ Constant *ConstantExpr::getTrunc(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::Trunc, C, Ty);
}
-Constant *ConstantExpr::getSExt(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getSExt(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1330,7 +1333,7 @@ Constant *ConstantExpr::getSExt(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::SExt, C, Ty);
}
-Constant *ConstantExpr::getZExt(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getZExt(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1344,7 +1347,7 @@ Constant *ConstantExpr::getZExt(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::ZExt, C, Ty);
}
-Constant *ConstantExpr::getFPTrunc(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getFPTrunc(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1356,7 +1359,7 @@ Constant *ConstantExpr::getFPTrunc(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::FPTrunc, C, Ty);
}
-Constant *ConstantExpr::getFPExtend(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getFPExtend(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1368,7 +1371,7 @@ Constant *ConstantExpr::getFPExtend(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::FPExt, C, Ty);
}
-Constant *ConstantExpr::getUIToFP(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getUIToFP(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1379,7 +1382,7 @@ Constant *ConstantExpr::getUIToFP(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::UIToFP, C, Ty);
}
-Constant *ConstantExpr::getSIToFP(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getSIToFP(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1390,7 +1393,7 @@ Constant *ConstantExpr::getSIToFP(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::SIToFP, C, Ty);
}
-Constant *ConstantExpr::getFPToUI(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getFPToUI(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1401,7 +1404,7 @@ Constant *ConstantExpr::getFPToUI(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::FPToUI, C, Ty);
}
-Constant *ConstantExpr::getFPToSI(Constant *C, const Type *Ty) {
+Constant *ConstantExpr::getFPToSI(Constant *C, Type *Ty) {
#ifndef NDEBUG
bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
bool toVec = Ty->getTypeID() == Type::VectorTyID;
@@ -1412,19 +1415,19 @@ Constant *ConstantExpr::getFPToSI(Constant *C, const Type *Ty) {
return getFoldedCast(Instruction::FPToSI, C, Ty);
}
-Constant *ConstantExpr::getPtrToInt(Constant *C, const Type *DstTy) {
+Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy) {
assert(C->getType()->isPointerTy() && "PtrToInt source must be pointer");
assert(DstTy->isIntegerTy() && "PtrToInt destination must be integral");
return getFoldedCast(Instruction::PtrToInt, C, DstTy);
}
-Constant *ConstantExpr::getIntToPtr(Constant *C, const Type *DstTy) {
+Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy) {
assert(C->getType()->isIntegerTy() && "IntToPtr source must be integral");
assert(DstTy->isPointerTy() && "IntToPtr destination must be a pointer");
return getFoldedCast(Instruction::IntToPtr, C, DstTy);
}
-Constant *ConstantExpr::getBitCast(Constant *C, const Type *DstTy) {
+Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) {
assert(CastInst::castIsValid(Instruction::BitCast, C, DstTy) &&
"Invalid constantexpr bitcast!");
@@ -1513,36 +1516,36 @@ Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
return pImpl->ExprConstants.getOrCreate(C1->getType(), Key);
}
-Constant *ConstantExpr::getSizeOf(const Type* Ty) {
+Constant *ConstantExpr::getSizeOf(Type* Ty) {
// sizeof is implemented as: (i64) gep (Ty*)null, 1
// Note that a non-inbounds gep is used, as null isn't within any object.
Constant *GEPIdx = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
Constant *GEP = getGetElementPtr(
- Constant::getNullValue(PointerType::getUnqual(Ty)), &GEPIdx, 1);
+ Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx);
return getPtrToInt(GEP,
Type::getInt64Ty(Ty->getContext()));
}
-Constant *ConstantExpr::getAlignOf(const Type* Ty) {
+Constant *ConstantExpr::getAlignOf(Type* Ty) {
// alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1
// Note that a non-inbounds gep is used, as null isn't within any object.
- const Type *AligningTy =
+ Type *AligningTy =
StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, NULL);
Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo());
Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0);
Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1);
Constant *Indices[2] = { Zero, One };
- Constant *GEP = getGetElementPtr(NullPtr, Indices, 2);
+ Constant *GEP = getGetElementPtr(NullPtr, Indices);
return getPtrToInt(GEP,
Type::getInt64Ty(Ty->getContext()));
}
-Constant *ConstantExpr::getOffsetOf(const StructType* STy, unsigned FieldNo) {
+Constant *ConstantExpr::getOffsetOf(StructType* STy, unsigned FieldNo) {
return getOffsetOf(STy, ConstantInt::get(Type::getInt32Ty(STy->getContext()),
FieldNo));
}
-Constant *ConstantExpr::getOffsetOf(const Type* Ty, Constant *FieldNo) {
+Constant *ConstantExpr::getOffsetOf(Type* Ty, Constant *FieldNo) {
// offsetof is implemented as: (i64) gep (Ty*)null, 0, FieldNo
// Note that a non-inbounds gep is used, as null isn't within any object.
Constant *GEPIdx[] = {
@@ -1550,7 +1553,7 @@ Constant *ConstantExpr::getOffsetOf(const Type* Ty, Constant *FieldNo) {
FieldNo
};
Constant *GEP = getGetElementPtr(
- Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx, 2);
+ Constant::getNullValue(PointerType::getUnqual(Ty)), GEPIdx);
return getPtrToInt(GEP,
Type::getInt64Ty(Ty->getContext()));
}
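
These three helpers encode sizeof/alignof/offsetof as GEPs off a null pointer, so the constants stay target-independent until a TargetData-aware fold resolves them. A usage sketch (the struct type is illustrative):

    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    using namespace llvm;

    static void layoutDemo(StructType *STy) {
      Constant *Size  = ConstantExpr::getSizeOf(STy);      // i64
      Constant *Align = ConstantExpr::getAlignOf(STy);     // i64
      Constant *Off   = ConstantExpr::getOffsetOf(STy, 1); // i64, field #1
      (void)Size; (void)Align; (void)Off;
    }
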
@@ -1592,14 +1595,13 @@ Constant *ConstantExpr::getSelect(Constant *C, Constant *V1, Constant *V2) {
return pImpl->ExprConstants.getOrCreate(V1->getType(), Key);
}
-Constant *ConstantExpr::getGetElementPtr(Constant *C, Value* const *Idxs,
- unsigned NumIdx, bool InBounds) {
- if (Constant *FC = ConstantFoldGetElementPtr(C, InBounds, Idxs, NumIdx))
+Constant *ConstantExpr::getGetElementPtr(Constant *C, ArrayRef<Value *> Idxs,
+ bool InBounds) {
+ if (Constant *FC = ConstantFoldGetElementPtr(C, InBounds, Idxs))
return FC; // Fold a few common cases.
// Get the result type of the getelementptr!
- const Type *Ty =
- GetElementPtrInst::getIndexedType(C->getType(), Idxs, Idxs+NumIdx);
+ Type *Ty = GetElementPtrInst::getIndexedType(C->getType(), Idxs);
assert(Ty && "GEP indices invalid!");
unsigned AS = cast<PointerType>(C->getType())->getAddressSpace();
Type *ReqTy = Ty->getPointerTo(AS);
@@ -1608,9 +1610,9 @@ Constant *ConstantExpr::getGetElementPtr(Constant *C, Value* const *Idxs,
"Non-pointer type for constant GetElementPtr expression");
// Look up the constant in the table first to ensure uniqueness
std::vector<Constant*> ArgVec;
- ArgVec.reserve(NumIdx+1);
+ ArgVec.reserve(1 + Idxs.size());
ArgVec.push_back(C);
- for (unsigned i = 0; i != NumIdx; ++i)
+ for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
ArgVec.push_back(cast<Constant>(Idxs[i]));
const ExprMapKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
InBounds ? GEPOperator::IsInBounds : 0);
@@ -1635,8 +1637,8 @@ ConstantExpr::getICmp(unsigned short pred, Constant *LHS, Constant *RHS) {
// Get the key type with both the opcode and predicate
const ExprMapKeyType Key(Instruction::ICmp, ArgVec, pred);
- const Type *ResultTy = Type::getInt1Ty(LHS->getContext());
- if (const VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
+ Type *ResultTy = Type::getInt1Ty(LHS->getContext());
+ if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
ResultTy = VectorType::get(ResultTy, VT->getNumElements());
LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl;
@@ -1658,8 +1660,8 @@ ConstantExpr::getFCmp(unsigned short pred, Constant *LHS, Constant *RHS) {
// Get the key type with both the opcode and predicate
const ExprMapKeyType Key(Instruction::FCmp, ArgVec, pred);
- const Type *ResultTy = Type::getInt1Ty(LHS->getContext());
- if (const VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
+ Type *ResultTy = Type::getInt1Ty(LHS->getContext());
+ if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
ResultTy = VectorType::get(ResultTy, VT->getNumElements());
LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl;
@@ -1715,8 +1717,8 @@ Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2,
return FC; // Fold a few common cases.
unsigned NElts = cast<VectorType>(Mask->getType())->getNumElements();
- const Type *EltTy = cast<VectorType>(V1->getType())->getElementType();
- const Type *ShufTy = VectorType::get(EltTy, NElts);
+ Type *EltTy = cast<VectorType>(V1->getType())->getElementType();
+ Type *ShufTy = VectorType::get(EltTy, NElts);
// Look up the constant in the table first to ensure uniqueness
std::vector<Constant*> ArgVec(1, V1);
@@ -1745,7 +1747,7 @@ Constant *ConstantExpr::getExtractValue(Constant *Agg,
assert(Agg->getType()->isFirstClassType() &&
"Tried to create extractelement operation on non-first-class type!");
- const Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs);
+ Type *ReqTy = ExtractValueInst::getIndexedType(Agg->getType(), Idxs);
(void)ReqTy;
assert(ReqTy && "extractvalue indices invalid!");
@@ -1878,7 +1880,7 @@ const char *ConstantExpr::getOpcodeName() const {
GetElementPtrConstantExpr::
GetElementPtrConstantExpr(Constant *C, const std::vector<Constant*> &IdxList,
- const Type *DestTy)
+ Type *DestTy)
: ConstantExpr(DestTy, Instruction::GetElementPtr,
OperandTraits<GetElementPtrConstantExpr>::op_end(this)
- (IdxList.size()+1), IdxList.size()+1) {
@@ -2091,8 +2093,7 @@ void ConstantExpr::replaceUsesOfWithOnConstant(Value *From, Value *ToV,
if (Val == From) Val = To;
Indices.push_back(Val);
}
- Replacement = ConstantExpr::getGetElementPtr(Pointer,
- &Indices[0], Indices.size(),
+ Replacement = ConstantExpr::getGetElementPtr(Pointer, Indices,
cast<GEPOperator>(this)->isInBounds());
} else if (getOpcode() == Instruction::ExtractValue) {
Constant *Agg = getOperand(0);
diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h
index bd134d9..1077004 100644
--- a/lib/VMCore/ConstantsContext.h
+++ b/lib/VMCore/ConstantsContext.h
@@ -36,7 +36,7 @@ public:
void *operator new(size_t s) {
return User::operator new(s, 1);
}
- UnaryConstantExpr(unsigned Opcode, Constant *C, const Type *Ty)
+ UnaryConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
: ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
Op<0>() = C;
}
@@ -159,7 +159,7 @@ public:
}
ExtractValueConstantExpr(Constant *Agg,
const SmallVector<unsigned, 4> &IdxList,
- const Type *DestTy)
+ Type *DestTy)
: ConstantExpr(DestTy, Instruction::ExtractValue, &Op<0>(), 1),
Indices(IdxList) {
Op<0>() = Agg;
@@ -184,7 +184,7 @@ public:
}
InsertValueConstantExpr(Constant *Agg, Constant *Val,
const SmallVector<unsigned, 4> &IdxList,
- const Type *DestTy)
+ Type *DestTy)
: ConstantExpr(DestTy, Instruction::InsertValue, &Op<0>(), 2),
Indices(IdxList) {
Op<0>() = Agg;
@@ -203,11 +203,11 @@ public:
/// used behind the scenes to implement getelementptr constant exprs.
class GetElementPtrConstantExpr : public ConstantExpr {
GetElementPtrConstantExpr(Constant *C, const std::vector<Constant*> &IdxList,
- const Type *DestTy);
+ Type *DestTy);
public:
static GetElementPtrConstantExpr *Create(Constant *C,
const std::vector<Constant*>&IdxList,
- const Type *DestTy,
+ Type *DestTy,
unsigned Flags) {
GetElementPtrConstantExpr *Result =
new(IdxList.size() + 1) GetElementPtrConstantExpr(C, IdxList, DestTy);
@@ -228,7 +228,7 @@ struct CompareConstantExpr : public ConstantExpr {
return User::operator new(s, 2);
}
unsigned short predicate;
- CompareConstantExpr(const Type *ty, Instruction::OtherOps opc,
+ CompareConstantExpr(Type *ty, Instruction::OtherOps opc,
unsigned short pred, Constant* LHS, Constant* RHS)
: ConstantExpr(ty, opc, &Op<0>(), 2), predicate(pred) {
Op<0>() = LHS;
@@ -392,7 +392,7 @@ struct ConstantTraits<Constant *> {
template<class ConstantClass, class TypeClass, class ValType>
struct ConstantCreator {
- static ConstantClass *create(const TypeClass *Ty, const ValType &V) {
+ static ConstantClass *create(TypeClass *Ty, const ValType &V) {
return new(ConstantTraits<ValType>::uses(V)) ConstantClass(Ty, V);
}
};
@@ -407,7 +407,7 @@ struct ConstantKeyData {
template<>
struct ConstantCreator<ConstantExpr, Type, ExprMapKeyType> {
- static ConstantExpr *create(const Type *Ty, const ExprMapKeyType &V,
+ static ConstantExpr *create(Type *Ty, const ExprMapKeyType &V,
unsigned short pred = 0) {
if (Instruction::isCast(V.opcode))
return new UnaryConstantExpr(V.opcode, V.operands[0], Ty);
@@ -470,7 +470,7 @@ struct ConstantKeyData<ConstantExpr> {
// ConstantAggregateZero does not take extra "value" argument...
template<class ValType>
struct ConstantCreator<ConstantAggregateZero, Type, ValType> {
- static ConstantAggregateZero *create(const Type *Ty, const ValType &V){
+ static ConstantAggregateZero *create(Type *Ty, const ValType &V){
return new ConstantAggregateZero(Ty);
}
};
@@ -522,7 +522,7 @@ struct ConstantKeyData<ConstantStruct> {
// ConstantPointerNull does not take extra "value" argument...
template<class ValType>
struct ConstantCreator<ConstantPointerNull, PointerType, ValType> {
- static ConstantPointerNull *create(const PointerType *Ty, const ValType &V){
+ static ConstantPointerNull *create(PointerType *Ty, const ValType &V){
return new ConstantPointerNull(Ty);
}
};
@@ -538,7 +538,7 @@ struct ConstantKeyData<ConstantPointerNull> {
// UndefValue does not take extra "value" argument...
template<class ValType>
struct ConstantCreator<UndefValue, Type, ValType> {
- static UndefValue *create(const Type *Ty, const ValType &V) {
+ static UndefValue *create(Type *Ty, const ValType &V) {
return new UndefValue(Ty);
}
};
@@ -553,7 +553,7 @@ struct ConstantKeyData<UndefValue> {
template<>
struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType> {
- static InlineAsm *create(const PointerType *Ty, const InlineAsmKeyType &Key) {
+ static InlineAsm *create(PointerType *Ty, const InlineAsmKeyType &Key) {
return new InlineAsm(Ty, Key.asm_string, Key.constraints,
Key.has_side_effects, Key.is_align_stack);
}
@@ -572,7 +572,7 @@ template<class ValType, class ValRefType, class TypeClass, class ConstantClass,
bool HasLargeKey = false /*true for arrays and structs*/ >
class ConstantUniqueMap {
public:
- typedef std::pair<const TypeClass*, ValType> MapKey;
+ typedef std::pair<TypeClass*, ValType> MapKey;
typedef std::map<MapKey, ConstantClass *> MapTy;
typedef std::map<ConstantClass *, typename MapTy::iterator> InverseMapTy;
private:
@@ -623,7 +623,7 @@ private:
}
typename MapTy::iterator I =
- Map.find(MapKey(static_cast<const TypeClass*>(CP->getType()),
+ Map.find(MapKey(static_cast<TypeClass*>(CP->getType()),
ConstantKeyData<ConstantClass>::getValType(CP)));
if (I == Map.end() || I->second != CP) {
// FIXME: This should not use a linear scan. If this gets to be a
@@ -634,7 +634,7 @@ private:
return I;
}
- ConstantClass *Create(const TypeClass *Ty, ValRefType V,
+ ConstantClass *Create(TypeClass *Ty, ValRefType V,
typename MapTy::iterator I) {
ConstantClass* Result =
ConstantCreator<ConstantClass,TypeClass,ValType>::create(Ty, V);
@@ -651,7 +651,7 @@ public:
/// getOrCreate - Return the specified constant from the map, creating it if
/// necessary.
- ConstantClass *getOrCreate(const TypeClass *Ty, ValRefType V) {
+ ConstantClass *getOrCreate(TypeClass *Ty, ValRefType V) {
MapKey Lookup(Ty, V);
ConstantClass* Result = 0;
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
index 2a816e1..a505e4b 100644
--- a/lib/VMCore/Core.cpp
+++ b/lib/VMCore/Core.cpp
@@ -167,6 +167,11 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
}
}
+LLVMBool LLVMTypeIsSized(LLVMTypeRef Ty)
+{
+ return unwrap(Ty)->isSized();
+}
+
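
A usage sketch for the new predicate, guarding a size query from the C API (the type handle is illustrative):

    #include "llvm-c/Core.h"

    void sizeCheckDemo(LLVMTypeRef Ty) {
      LLVMValueRef Size = 0;
      if (LLVMTypeIsSized(Ty))
        Size = LLVMSizeOf(Ty);   /* safe only for sized types */
      (void)Size;
    }
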
LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
return wrap(&unwrap(Ty)->getContext());
}
@@ -299,7 +304,15 @@ LLVMTypeRef LLVMStructType(LLVMTypeRef *ElementTypes,
LLVMTypeRef LLVMStructCreateNamed(LLVMContextRef C, const char *Name)
{
- return wrap(StructType::createNamed(*unwrap(C), Name));
+ return wrap(StructType::create(*unwrap(C), Name));
+}
+
+const char *LLVMGetStructName(LLVMTypeRef Ty)
+{
+ StructType *Type = unwrap<StructType>(Ty);
+ if (!Type->hasName())
+ return 0;
+ return Type->getName().data();
}
void LLVMStructSetBody(LLVMTypeRef StructTy, LLVMTypeRef *ElementTypes,
@@ -448,7 +461,10 @@ LLVMValueRef LLVMGetUsedValue(LLVMUseRef U) {
/*--.. Operations on Users .................................................--*/
LLVMValueRef LLVMGetOperand(LLVMValueRef Val, unsigned Index) {
- return wrap(unwrap<User>(Val)->getOperand(Index));
+ Value *V = unwrap(Val);
+ if (MDNode *MD = dyn_cast<MDNode>(V))
+ return wrap(MD->getOperand(Index));
+ return wrap(cast<User>(V)->getOperand(Index));
}
void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) {
@@ -456,7 +472,10 @@ void LLVMSetOperand(LLVMValueRef Val, unsigned Index, LLVMValueRef Op) {
}
int LLVMGetNumOperands(LLVMValueRef Val) {
- return unwrap<User>(Val)->getNumOperands();
+ Value *V = unwrap(Val);
+ if (MDNode *MD = dyn_cast<MDNode>(V))
+ return MD->getNumOperands();
+ return cast<User>(V)->getNumOperands();
}
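
A sketch of the effect of the two changes above: operand access through the C API now works on metadata nodes as well as Users (the operand values are illustrative):

    #include "llvm-c/Core.h"

    void mdOperandDemo(LLVMValueRef A, LLVMValueRef B) {
      LLVMValueRef Vals[2];
      LLVMValueRef MD;
      Vals[0] = A; Vals[1] = B;
      MD = LLVMMDNode(Vals, 2);
      LLVMGetNumOperands(MD);  /* 2: previously this required a User */
      LLVMGetOperand(MD, 0);   /* A */
    }
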
/*--.. Operations on constants of any type .................................--*/
@@ -506,13 +525,39 @@ LLVMValueRef LLVMMDString(const char *Str, unsigned SLen) {
LLVMValueRef LLVMMDNodeInContext(LLVMContextRef C, LLVMValueRef *Vals,
unsigned Count) {
return wrap(MDNode::get(*unwrap(C),
- ArrayRef<Value*>(unwrap<Value>(Vals, Count), Count)));
+ makeArrayRef(unwrap<Value>(Vals, Count), Count)));
}
LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count) {
return LLVMMDNodeInContext(LLVMGetGlobalContext(), Vals, Count);
}
+const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len) {
+ if (const MDString *S = dyn_cast<MDString>(unwrap(V))) {
+ *Len = S->getString().size();
+ return S->getString().data();
+ }
+ *Len = 0;
+ return 0;
+}
+
+unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char* name)
+{
+ if (NamedMDNode *N = unwrap(M)->getNamedMetadata(name)) {
+ return N->getNumOperands();
+ }
+ return 0;
+}
+
+void LLVMGetNamedMetadataOperands(LLVMModuleRef M, const char* name, LLVMValueRef *Dest)
+{
+ NamedMDNode *N = unwrap(M)->getNamedMetadata(name);
+ if (!N)
+ return;
+ for (unsigned i=0;i<N->getNumOperands();i++)
+ Dest[i] = wrap(N->getOperand(i));
+}
+
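
Enumerating a named metadata node with the two new entry points; a sketch in which the node name "my.md" is illustrative:

    #include "llvm-c/Core.h"
    #include <stdlib.h>

    void namedMDDemo(LLVMModuleRef M) {
      unsigned N = LLVMGetNamedMetadataNumOperands(M, "my.md");
      LLVMValueRef *Ops = (LLVMValueRef *)malloc(N * sizeof(LLVMValueRef));
      LLVMGetNamedMetadataOperands(M, "my.md", Ops);
      /* ... inspect Ops[0..N-1] ... */
      free(Ops);
    }
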
/*--.. Operations on scalar constants ......................................--*/
LLVMValueRef LLVMConstInt(LLVMTypeRef IntTy, unsigned long long N,
@@ -525,7 +570,8 @@ LLVMValueRef LLVMConstIntOfArbitraryPrecision(LLVMTypeRef IntTy,
const uint64_t Words[]) {
IntegerType *Ty = unwrap<IntegerType>(IntTy);
return wrap(ConstantInt::get(Ty->getContext(),
- APInt(Ty->getBitWidth(), NumWords, Words)));
+ APInt(Ty->getBitWidth(),
+ makeArrayRef(Words, NumWords))));
}
LLVMValueRef LLVMConstIntOfString(LLVMTypeRef IntTy, const char Str[],
@@ -575,8 +621,7 @@ LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
LLVMValueRef *ConstantVals,
unsigned Count, LLVMBool Packed) {
Constant **Elements = unwrap<Constant>(ConstantVals, Count);
- return wrap(ConstantStruct::getAnon(*unwrap(C),
- ArrayRef<Constant*>(Elements, Count),
+ return wrap(ConstantStruct::getAnon(*unwrap(C), makeArrayRef(Elements, Count),
Packed != 0));
}
@@ -600,19 +645,44 @@ LLVMValueRef LLVMConstNamedStruct(LLVMTypeRef StructTy,
LLVMValueRef *ConstantVals,
unsigned Count) {
Constant **Elements = unwrap<Constant>(ConstantVals, Count);
- const StructType *Ty = cast<StructType>(unwrap(StructTy));
+ StructType *Ty = cast<StructType>(unwrap(StructTy));
- return wrap(ConstantStruct::get(Ty, ArrayRef<Constant*>(Elements, Count)));
+ return wrap(ConstantStruct::get(Ty, makeArrayRef(Elements, Count)));
}
LLVMValueRef LLVMConstVector(LLVMValueRef *ScalarConstantVals, unsigned Size) {
- return wrap(ConstantVector::get(ArrayRef<Constant*>(
+ return wrap(ConstantVector::get(makeArrayRef(
unwrap<Constant>(ScalarConstantVals, Size), Size)));
}
+
+/*--.. Opcode mapping .....................................................--*/
+
+static LLVMOpcode map_to_llvmopcode(int opcode)
+{
+ switch (opcode) {
+ default:
+ assert(0 && "Unhandled Opcode.");
+#define HANDLE_INST(num, opc, clas) case num: return LLVM##opc;
+#include "llvm/Instruction.def"
+#undef HANDLE_INST
+ }
+}
+
+static int map_from_llvmopcode(LLVMOpcode code)
+{
+ switch (code) {
+ default:
+ assert(0 && "Unhandled Opcode.");
+#define HANDLE_INST(num, opc, clas) case LLVM##opc: return num;
+#include "llvm/Instruction.def"
+#undef HANDLE_INST
+ }
+}
+
/*--.. Constant expressions ................................................--*/
LLVMOpcode LLVMGetConstOpcode(LLVMValueRef ConstantVal) {
- return (LLVMOpcode)unwrap<ConstantExpr>(ConstantVal)->getOpcode();
+ return map_to_llvmopcode(unwrap<ConstantExpr>(ConstantVal)->getOpcode());
}
LLVMValueRef LLVMAlignOf(LLVMTypeRef Ty) {
@@ -792,18 +862,19 @@ LLVMValueRef LLVMConstAShr(LLVMValueRef LHSConstant, LLVMValueRef RHSConstant) {
LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal,
LLVMValueRef *ConstantIndices, unsigned NumIndices) {
+ ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
+ NumIndices);
return wrap(ConstantExpr::getGetElementPtr(unwrap<Constant>(ConstantVal),
- unwrap<Constant>(ConstantIndices,
- NumIndices),
- NumIndices));
+ IdxList));
}
LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal,
LLVMValueRef *ConstantIndices,
unsigned NumIndices) {
Constant* Val = unwrap<Constant>(ConstantVal);
- Constant** Idxs = unwrap<Constant>(ConstantIndices, NumIndices);
- return wrap(ConstantExpr::getInBoundsGetElementPtr(Val, Idxs, NumIndices));
+ ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
+ NumIndices);
+ return wrap(ConstantExpr::getInBoundsGetElementPtr(Val, IdxList));
}
LLVMValueRef LLVMConstTrunc(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
@@ -934,8 +1005,7 @@ LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant,
LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList,
unsigned NumIdx) {
return wrap(ConstantExpr::getExtractValue(unwrap<Constant>(AggConstant),
- ArrayRef<unsigned>(IdxList,
- NumIdx)));
+ makeArrayRef(IdxList, NumIdx)));
}
LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant,
@@ -943,8 +1013,7 @@ LLVMValueRef LLVMConstInsertValue(LLVMValueRef AggConstant,
unsigned *IdxList, unsigned NumIdx) {
return wrap(ConstantExpr::getInsertValue(unwrap<Constant>(AggConstant),
unwrap<Constant>(ElementValueConstant),
- ArrayRef<unsigned>(IdxList,
- NumIdx)));
+ makeArrayRef(IdxList, NumIdx)));
}
LLVMValueRef LLVMConstInlineAsm(LLVMTypeRef Ty, const char *AsmString,
@@ -1383,6 +1452,10 @@ LLVMValueRef LLVMGetBasicBlockParent(LLVMBasicBlockRef BB) {
return wrap(unwrap(BB)->getParent());
}
+LLVMValueRef LLVMGetBasicBlockTerminator(LLVMBasicBlockRef BB) {
+ return wrap(unwrap(BB)->getTerminator());
+}
+
unsigned LLVMCountBasicBlocks(LLVMValueRef FnRef) {
return unwrap<Function>(FnRef)->size();
}
@@ -1455,6 +1528,10 @@ void LLVMDeleteBasicBlock(LLVMBasicBlockRef BBRef) {
unwrap(BBRef)->eraseFromParent();
}
+void LLVMRemoveBasicBlockFromParent(LLVMBasicBlockRef BBRef) {
+ unwrap(BBRef)->removeFromParent();
+}
+
void LLVMMoveBasicBlockBefore(LLVMBasicBlockRef BB, LLVMBasicBlockRef MovePos) {
unwrap(BB)->moveBefore(unwrap(MovePos));
}
@@ -1501,6 +1578,25 @@ LLVMValueRef LLVMGetPreviousInstruction(LLVMValueRef Inst) {
return wrap(--I);
}
+void LLVMInstructionEraseFromParent(LLVMValueRef Inst) {
+ unwrap<Instruction>(Inst)->eraseFromParent();
+}
+
+LLVMIntPredicate LLVMGetICmpPredicate(LLVMValueRef Inst) {
+ if (ICmpInst *I = dyn_cast<ICmpInst>(unwrap(Inst)))
+ return (LLVMIntPredicate)I->getPredicate();
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(unwrap(Inst)))
+ if (CE->getOpcode() == Instruction::ICmp)
+ return (LLVMIntPredicate)CE->getPredicate();
+ return (LLVMIntPredicate)0;
+}
+
+LLVMOpcode LLVMGetInstructionOpcode(LLVMValueRef Inst) {
+ if (Instruction *C = dyn_cast<Instruction>(unwrap(Inst)))
+ return map_to_llvmopcode(C->getOpcode());
+ return (LLVMOpcode)0;
+}
+
/*--.. Call and invoke instructions ........................................--*/
unsigned LLVMGetInstructionCallConv(LLVMValueRef Instr) {
@@ -1554,6 +1650,12 @@ void LLVMSetTailCall(LLVMValueRef Call, LLVMBool isTailCall) {
unwrap<CallInst>(Call)->setTailCall(isTailCall);
}
+/*--.. Operations on switch instructions (only) ............................--*/
+
+LLVMBasicBlockRef LLVMGetSwitchDefaultDest(LLVMValueRef Switch) {
+ return wrap(unwrap<SwitchInst>(Switch)->getDefaultDest());
+}
+
/*--.. Operations on phi nodes .............................................--*/
void LLVMAddIncoming(LLVMValueRef PhiNode, LLVMValueRef *IncomingValues,
@@ -1680,12 +1782,20 @@ LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef B, LLVMValueRef Fn,
LLVMBasicBlockRef Then, LLVMBasicBlockRef Catch,
const char *Name) {
return wrap(unwrap(B)->CreateInvoke(unwrap(Fn), unwrap(Then), unwrap(Catch),
- ArrayRef<Value *>(unwrap(Args), NumArgs),
+ makeArrayRef(unwrap(Args), NumArgs),
Name));
}
-LLVMValueRef LLVMBuildUnwind(LLVMBuilderRef B) {
- return wrap(unwrap(B)->CreateUnwind());
+LLVMValueRef LLVMBuildLandingPad(LLVMBuilderRef B, LLVMTypeRef Ty,
+ LLVMValueRef PersFn, unsigned NumClauses,
+ const char *Name) {
+ return wrap(unwrap(B)->CreateLandingPad(unwrap(Ty),
+ cast<Function>(unwrap(PersFn)),
+ NumClauses, Name));
+}
+
+LLVMValueRef LLVMBuildResume(LLVMBuilderRef B, LLVMValueRef Exn) {
+ return wrap(unwrap(B)->CreateResume(unwrap(Exn)));
}
LLVMValueRef LLVMBuildUnreachable(LLVMBuilderRef B) {
@@ -1701,6 +1811,15 @@ void LLVMAddDestination(LLVMValueRef IndirectBr, LLVMBasicBlockRef Dest) {
unwrap<IndirectBrInst>(IndirectBr)->addDestination(unwrap(Dest));
}
+void LLVMAddClause(LLVMValueRef LandingPad, LLVMValueRef ClauseVal) {
+ unwrap<LandingPadInst>(LandingPad)->
+ addClause(cast<Constant>(unwrap(ClauseVal)));
+}
+
+void LLVMSetCleanup(LLVMValueRef LandingPad, LLVMBool Val) {
+ unwrap<LandingPadInst>(LandingPad)->setCleanup(Val);
+}
+
/*--.. Arithmetic ..........................................................--*/
LLVMValueRef LLVMBuildAdd(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
@@ -1831,7 +1950,7 @@ LLVMValueRef LLVMBuildXor(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS,
LLVMValueRef LLVMBuildBinOp(LLVMBuilderRef B, LLVMOpcode Op,
LLVMValueRef LHS, LLVMValueRef RHS,
const char *Name) {
- return wrap(unwrap(B)->CreateBinOp(Instruction::BinaryOps(Op), unwrap(LHS),
+ return wrap(unwrap(B)->CreateBinOp(Instruction::BinaryOps(map_from_llvmopcode(Op)), unwrap(LHS),
unwrap(RHS), Name));
}
@@ -1861,7 +1980,7 @@ LLVMValueRef LLVMBuildNot(LLVMBuilderRef B, LLVMValueRef V, const char *Name) {
LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
const char *Name) {
- const Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
+ Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
@@ -1872,7 +1991,7 @@ LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef Val, const char *Name) {
- const Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
+ Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
@@ -1910,15 +2029,15 @@ LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
LLVMValueRef LLVMBuildGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
LLVMValueRef *Indices, unsigned NumIndices,
const char *Name) {
- return wrap(unwrap(B)->CreateGEP(unwrap(Pointer), unwrap(Indices),
- unwrap(Indices) + NumIndices, Name));
+ ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
+ return wrap(unwrap(B)->CreateGEP(unwrap(Pointer), IdxList, Name));
}
LLVMValueRef LLVMBuildInBoundsGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
LLVMValueRef *Indices, unsigned NumIndices,
const char *Name) {
- return wrap(unwrap(B)->CreateInBoundsGEP(unwrap(Pointer), unwrap(Indices),
- unwrap(Indices) + NumIndices, Name));
+ ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
+ return wrap(unwrap(B)->CreateInBoundsGEP(unwrap(Pointer), IdxList, Name));
}
LLVMValueRef LLVMBuildStructGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
@@ -2018,7 +2137,7 @@ LLVMValueRef LLVMBuildTruncOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
LLVMValueRef LLVMBuildCast(LLVMBuilderRef B, LLVMOpcode Op, LLVMValueRef Val,
LLVMTypeRef DestTy, const char *Name) {
- return wrap(unwrap(B)->CreateCast(Instruction::CastOps(Op), unwrap(Val),
+ return wrap(unwrap(B)->CreateCast(Instruction::CastOps(map_from_llvmopcode(Op)), unwrap(Val),
unwrap(DestTy), Name));
}
@@ -2064,7 +2183,7 @@ LLVMValueRef LLVMBuildCall(LLVMBuilderRef B, LLVMValueRef Fn,
LLVMValueRef *Args, unsigned NumArgs,
const char *Name) {
return wrap(unwrap(B)->CreateCall(unwrap(Fn),
- ArrayRef<Value *>(unwrap(Args), NumArgs),
+ makeArrayRef(unwrap(Args), NumArgs),
Name));
}
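The Core.cpp hunks above grow the C bindings in two directions: metadata becomes inspectable (LLVMGetMDString, LLVMGetNamedMetadataNumOperands/Operands, and MDNode-aware LLVMGetOperand/LLVMGetNumOperands), and opcodes are translated through map_to_llvmopcode/map_from_llvmopcode so the C enum no longer has to mirror Instruction.def numbering. A minimal sketch of a client of the new metadata entry points — dumpNamedMD is a hypothetical helper, the module contents are assumed, and error handling is trimmed:

    #include "llvm-c/Core.h"
    #include <stdio.h>
    #include <stdlib.h>

    /* Print any MDString found as operand 0 of each node in a named MD list. */
    static void dumpNamedMD(LLVMModuleRef M, const char *MDName) {
      unsigned N = LLVMGetNamedMetadataNumOperands(M, MDName);
      LLVMValueRef *Nodes = (LLVMValueRef *)malloc(N * sizeof(LLVMValueRef));
      LLVMGetNamedMetadataOperands(M, MDName, Nodes);
      for (unsigned i = 0; i < N; ++i) {
        if (LLVMGetNumOperands(Nodes[i]) < 1)
          continue;
        unsigned Len;
        const char *S = LLVMGetMDString(LLVMGetOperand(Nodes[i], 0), &Len);
        if (S)
          printf("%.*s\n", (int)Len, S);
      }
      free(Nodes);
    }

Note that LLVMGetOperand works on the MDNode operands here only because the patched version dispatches on MDNode before falling back to User.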
diff --git a/lib/VMCore/DebugLoc.cpp b/lib/VMCore/DebugLoc.cpp
index 4ff6b2c..328244f 100644
--- a/lib/VMCore/DebugLoc.cpp
+++ b/lib/VMCore/DebugLoc.cpp
@@ -104,7 +104,7 @@ MDNode *DebugLoc::getAsMDNode(const LLVMContext &Ctx) const {
assert(Scope && "If scope is null, this should be isUnknown()");
LLVMContext &Ctx2 = Scope->getContext();
- const Type *Int32 = Type::getInt32Ty(Ctx2);
+ Type *Int32 = Type::getInt32Ty(Ctx2);
Value *Elts[] = {
ConstantInt::get(Int32, getLine()), ConstantInt::get(Int32, getCol()),
Scope, IA
@@ -240,7 +240,7 @@ int LLVMContextImpl::getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,
/// deleted - The MDNode this is pointing to got deleted, so this pointer needs
/// to drop to null and we need remove our entry from the DenseMap.
void DebugRecVH::deleted() {
- // If this is a non-canonical reference, just drop the value to null, we know
+ // If this is a non-canonical reference, just drop the value to null, we know
// it doesn't have a map entry.
if (Idx == 0) {
setValPtr(0);
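The one-character change in DebugLoc.cpp (dropping const from Type*) recurs through every file below; it is fallout from the release_30 type-system rewrite, in which named struct types are created empty and mutated in place (StructType::create plus setBody, per the Core.cpp hunk above), so Type can no longer be passed around as const. A short sketch of the pattern that forced the change, assuming 3.0-era headers:

    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    static StructType *makeListNode(LLVMContext &C) {
      // Named structs are created opaque, then filled in; this mutation is
      // why const Type* parameters were dropped across the API in 3.0.
      StructType *Node = StructType::create(C, "list.node");
      Type *Fields[] = { Type::getInt32Ty(C), PointerType::getUnqual(Node) };
      Node->setBody(Fields, /*isPacked=*/false);
      return Node;
    }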
diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp
index 6536bcd..1215e6a 100644
--- a/lib/VMCore/Function.cpp
+++ b/lib/VMCore/Function.cpp
@@ -17,6 +17,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Support/CallSite.h"
+#include "llvm/Support/InstIterator.h"
#include "llvm/Support/LeakDetector.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/StringPool.h"
@@ -38,7 +39,7 @@ template class llvm::SymbolTableListTraits<BasicBlock, Function>;
// Argument Implementation
//===----------------------------------------------------------------------===//
-Argument::Argument(const Type *Ty, const Twine &Name, Function *Par)
+Argument::Argument(Type *Ty, const Twine &Name, Function *Par)
: Value(Ty, Value::ArgumentVal) {
Parent = 0;
@@ -158,7 +159,7 @@ void Function::eraseFromParent() {
// Function Implementation
//===----------------------------------------------------------------------===//
-Function::Function(const FunctionType *Ty, LinkageTypes Linkage,
+Function::Function(FunctionType *Ty, LinkageTypes Linkage,
const Twine &name, Module *ParentModule)
: GlobalValue(PointerType::getUnqual(Ty),
Value::FunctionVal, 0, 0, Linkage, name) {
@@ -195,7 +196,7 @@ Function::~Function() {
void Function::BuildLazyArguments() const {
// Create the arguments vector, all arguments start out unnamed.
- const FunctionType *FT = getFunctionType();
+ FunctionType *FT = getFunctionType();
for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
assert(!FT->getParamType(i)->isVoidTy() &&
"Cannot have void typed arguments!");
@@ -345,7 +346,7 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
return Table[id];
std::string Result(Table[id]);
for (unsigned i = 0; i < Tys.size(); ++i) {
- if (const PointerType* PTyp = dyn_cast<PointerType>(Tys[i])) {
+ if (PointerType* PTyp = dyn_cast<PointerType>(Tys[i])) {
Result += ".p" + llvm::utostr(PTyp->getAddressSpace()) +
EVT::getEVT(PTyp->getElementType()).getEVTString();
}
@@ -355,9 +356,9 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) {
return Result;
}
-const FunctionType *Intrinsic::getType(LLVMContext &Context,
+FunctionType *Intrinsic::getType(LLVMContext &Context,
ID id, ArrayRef<Type*> Tys) {
- const Type *ResultTy = NULL;
+ Type *ResultTy = NULL;
std::vector<Type*> ArgTys;
bool IsVarArg = false;
@@ -416,8 +417,7 @@ bool Function::hasAddressTaken(const User* *PutOffender) const {
/// FIXME: Remove after <rdar://problem/8031714> is fixed.
/// FIXME: Is the above FIXME valid?
bool Function::callsFunctionThatReturnsTwice() const {
- const Module *M = this->getParent();
- static const char *ReturnsTwiceFns[] = {
+ static const char *const ReturnsTwiceFns[] = {
"_setjmp",
"setjmp",
"sigsetjmp",
@@ -428,16 +428,25 @@ bool Function::callsFunctionThatReturnsTwice() const {
"getcontext"
};
- for (unsigned I = 0; I < array_lengthof(ReturnsTwiceFns); ++I)
- if (const Function *Callee = M->getFunction(ReturnsTwiceFns[I])) {
- if (!Callee->use_empty())
- for (Value::const_use_iterator
- I = Callee->use_begin(), E = Callee->use_end();
- I != E; ++I)
- if (const CallInst *CI = dyn_cast<CallInst>(*I))
- if (CI->getParent()->getParent() == this)
- return true;
+ for (const_inst_iterator I = inst_begin(this), E = inst_end(this); I != E;
+ ++I) {
+ const CallInst* callInst = dyn_cast<CallInst>(&*I);
+ if (!callInst)
+ continue;
+ if (callInst->canReturnTwice())
+ return true;
+
+ // check for known function names.
+ // FIXME: move this to clang.
+ Function *F = callInst->getCalledFunction();
+ if (!F)
+ continue;
+ StringRef Name = F->getName();
+ for (unsigned J = 0, e = array_lengthof(ReturnsTwiceFns); J != e; ++J) {
+ if (Name == ReturnsTwiceFns[J])
+ return true;
}
+ }
return false;
}
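callsFunctionThatReturnsTwice no longer walks the use lists of a fixed name table from the module side; it scans the function's own instructions, trusting the canReturnTwice() bit on each call first and falling back to the setjmp-style name list (which the FIXME wants moved into clang). Roughly how a caller might drive the query — a hypothetical module walk, not code from this import:

    #include "llvm/Module.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Flag definitions that call setjmp-style functions, e.g. before running
    // an optimization that is unsafe across returns-twice calls.
    static void reportReturnsTwiceCallers(Module &M) {
      for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
        if (!F->isDeclaration() && F->callsFunctionThatReturnsTwice())
          errs() << F->getName() << " calls a returns-twice function\n";
    }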
diff --git a/lib/VMCore/GCOV.cpp b/lib/VMCore/GCOV.cpp
new file mode 100644
index 0000000..fc7f96f
--- /dev/null
+++ b/lib/VMCore/GCOV.cpp
@@ -0,0 +1,281 @@
+//===- GCOV.cpp - LLVM coverage tool --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// GCOV implements the interface to read and write coverage files that use
+// 'gcov' format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/GCOV.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/system_error.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// GCOVFile implementation.
+
+/// ~GCOVFile - Delete GCOVFile and its content.
+GCOVFile::~GCOVFile() {
+ DeleteContainerPointers(Functions);
+}
+
+/// isGCDAFile - Return true if Format identifies a .gcda file.
+static bool isGCDAFile(GCOVFormat Format) {
+ return Format == GCDA_402 || Format == GCDA_404;
+}
+
+/// isGCNOFile - Return true if Format identifies a .gcno file.
+static bool isGCNOFile(GCOVFormat Format) {
+ return Format == GCNO_402 || Format == GCNO_404;
+}
+
+/// read - Read GCOV buffer.
+bool GCOVFile::read(GCOVBuffer &Buffer) {
+ GCOVFormat Format = Buffer.readGCOVFormat();
+ if (Format == InvalidGCOV)
+ return false;
+
+ unsigned i = 0;
+ while (1) {
+ GCOVFunction *GFun = NULL;
+ if (isGCDAFile(Format)) {
+ // Use existing function while reading .gcda file.
+ assert (i < Functions.size() && ".gcda data does not match .gcno data");
+ GFun = Functions[i];
+ } else if (isGCNOFile(Format)){
+ GFun = new GCOVFunction();
+ Functions.push_back(GFun);
+ }
+ if (!GFun || !GFun->read(Buffer, Format))
+ break;
+ ++i;
+ }
+ return true;
+}
+
+/// dump - Dump GCOVFile content on standard out for debugging purposes.
+void GCOVFile::dump() {
+ for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
+ E = Functions.end(); I != E; ++I)
+ (*I)->dump();
+}
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVFile::collectLineCounts(FileInfo &FI) {
+ for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
+ E = Functions.end(); I != E; ++I)
+ (*I)->collectLineCounts(FI);
+ FI.print();
+}
+
+//===----------------------------------------------------------------------===//
+// GCOVFunction implementation.
+
+/// ~GCOVFunction - Delete GCOVFunction and its content.
+GCOVFunction::~GCOVFunction() {
+ DeleteContainerPointers(Blocks);
+}
+
+/// read - Read a function from the buffer. Return false if the buffer cursor
+/// does not point to a function tag.
+bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
+ if (!Buff.readFunctionTag())
+ return false;
+
+ Buff.readInt(); // Function header length
+ Ident = Buff.readInt();
+ Buff.readInt(); // Checksum #1
+ if (Format != GCNO_402)
+ Buff.readInt(); // Checksum #2
+
+ Name = Buff.readString();
+ if (Format == GCNO_402 || Format == GCNO_404)
+ Filename = Buff.readString();
+
+ if (Format == GCDA_402 || Format == GCDA_404) {
+ Buff.readArcTag();
+ uint32_t Count = Buff.readInt() / 2;
+ for (unsigned i = 0, e = Count; i != e; ++i) {
+ Blocks[i]->addCount(Buff.readInt64());
+ }
+ return true;
+ }
+
+ LineNumber = Buff.readInt();
+
+ // read blocks.
+ assert (Buff.readBlockTag() && "Block Tag not found!");
+ uint32_t BlockCount = Buff.readInt();
+ for (uint32_t i = 0, e = BlockCount; i != e; ++i) {
+ Buff.readInt(); // Block flags;
+ Blocks.push_back(new GCOVBlock(i));
+ }
+
+ // read edges.
+ while (Buff.readEdgeTag()) {
+ uint32_t EdgeCount = (Buff.readInt() - 1) / 2;
+ uint32_t BlockNo = Buff.readInt();
+ assert (BlockNo < BlockCount && "Unexpected Block number!");
+ for (uint32_t i = 0, e = EdgeCount; i != e; ++i) {
+ Blocks[BlockNo]->addEdge(Buff.readInt());
+ Buff.readInt(); // Edge flag
+ }
+ }
+
+ // read line table.
+ while (Buff.readLineTag()) {
+ uint32_t LineTableLength = Buff.readInt();
+ uint32_t Size = Buff.getCursor() + LineTableLength*4;
+ uint32_t BlockNo = Buff.readInt();
+ assert (BlockNo < BlockCount && "Unexpected Block number!");
+ GCOVBlock *Block = Blocks[BlockNo];
+ Buff.readInt(); // flag
+ while (Buff.getCursor() != (Size - 4)) {
+ StringRef Filename = Buff.readString();
+ if (Buff.getCursor() == (Size - 4)) break;
+ while (uint32_t L = Buff.readInt())
+ Block->addLine(Filename, L);
+ }
+ Buff.readInt(); // flag
+ }
+ return true;
+}
+
+/// dump - Dump GCOVFunction content on standard out for debugging purposes.
+void GCOVFunction::dump() {
+ outs() << "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
+ for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
+ E = Blocks.end(); I != E; ++I)
+ (*I)->dump();
+}
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVFunction::collectLineCounts(FileInfo &FI) {
+ for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
+ E = Blocks.end(); I != E; ++I)
+ (*I)->collectLineCounts(FI);
+}
+
+//===----------------------------------------------------------------------===//
+// GCOVBlock implementation.
+
+/// ~GCOVBlock - Delete GCOVBlock and its content.
+GCOVBlock::~GCOVBlock() {
+ Edges.clear();
+ DeleteContainerSeconds(Lines);
+}
+
+void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) {
+ GCOVLines *&LinesForFile = Lines[Filename];
+ if (!LinesForFile)
+ LinesForFile = new GCOVLines();
+ LinesForFile->add(LineNo);
+}
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVBlock::collectLineCounts(FileInfo &FI) {
+ for (StringMap<GCOVLines *>::iterator I = Lines.begin(),
+ E = Lines.end(); I != E; ++I)
+ I->second->collectLineCounts(FI, I->first(), Counter);
+}
+
+/// dump - Dump GCOVBlock content on standard out for debugging purposes.
+void GCOVBlock::dump() {
+ outs() << "Block : " << Number << " Counter : " << Counter << "\n";
+ if (!Edges.empty()) {
+ outs() << "\tEdges : ";
+ for (SmallVector<uint32_t, 16>::iterator I = Edges.begin(), E = Edges.end();
+ I != E; ++I)
+ outs() << (*I) << ",";
+ outs() << "\n";
+ }
+ if (!Lines.empty()) {
+ outs() << "\tLines : ";
+ for (StringMap<GCOVLines *>::iterator LI = Lines.begin(),
+ LE = Lines.end(); LI != LE; ++LI) {
+ outs() << LI->first() << " -> ";
+ LI->second->dump();
+ outs() << "\n";
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// GCOVLines implementation.
+
+/// collectLineCounts - Collect line counts. This must be used after
+/// reading .gcno and .gcda files.
+void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename,
+ uint32_t Count) {
+ for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
+ E = Lines.end(); I != E; ++I)
+ FI.addLineCount(Filename, *I, Count);
+}
+
+/// dump - Dump GCOVLines content on standard out for debugging purposes.
+void GCOVLines::dump() {
+ for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
+ E = Lines.end(); I != E; ++I)
+ outs() << (*I) << ",";
+}
+
+//===----------------------------------------------------------------------===//
+// FileInfo implementation.
+
+/// addLineCount - Add line count for the given line number in a file.
+void FileInfo::addLineCount(StringRef Filename, uint32_t Line, uint32_t Count) {
+ if (LineInfo.find(Filename) == LineInfo.end()) {
+ OwningPtr<MemoryBuffer> Buff;
+ if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
+ errs() << Filename << ": " << ec.message() << "\n";
+ return;
+ }
+ StringRef AllLines = Buff.take()->getBuffer();
+ LineCounts L(AllLines.count('\n')+2);
+ L[Line-1] = Count;
+ LineInfo[Filename] = L;
+ return;
+ }
+ LineCounts &L = LineInfo[Filename];
+ L[Line-1] = Count;
+}
+
+/// print - Print source files with collected line count information.
+void FileInfo::print() {
+ for (StringMap<LineCounts>::iterator I = LineInfo.begin(), E = LineInfo.end();
+ I != E; ++I) {
+ StringRef Filename = I->first();
+ outs() << Filename << "\n";
+ LineCounts &L = LineInfo[Filename];
+ OwningPtr<MemoryBuffer> Buff;
+ if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
+ errs() << Filename << ": " << ec.message() << "\n";
+ return;
+ }
+ StringRef AllLines = Buff.take()->getBuffer();
+ for (unsigned i = 0, e = L.size(); i != e; ++i) {
+ if (L[i])
+ outs() << L[i] << ":\t";
+ else
+ outs() << " :\t";
+ std::pair<StringRef, StringRef> P = AllLines.split('\n');
+ if (AllLines != P.first)
+ outs() << P.first;
+ outs() << "\n";
+ AllLines = P.second;
+ }
+ }
+}
+
+
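GCOV.cpp is entirely new: GCOVFile/GCOVFunction/GCOVBlock/GCOVLines parse the .gcno structure first, then overlay .gcda arc counters onto the same block list (note the assert tying the two files together in GCOVFile::read). A rough driver sketch, assuming the GCOVBuffer(MemoryBuffer*) constructor and FileInfo type from the matching llvm/Support/GCOV.h header; a real tool would report errors rather than returning false:

    #include "llvm/Support/GCOV.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/ADT/OwningPtr.h"
    #include "llvm/Support/system_error.h"
    using namespace llvm;

    static bool readCoverage(StringRef GCNOPath, StringRef GCDAPath) {
      GCOVFile GF;
      OwningPtr<MemoryBuffer> GCNOBuff;
      if (MemoryBuffer::getFileOrSTDIN(GCNOPath, GCNOBuff))
        return false;                      // could not read .gcno
      GCOVBuffer GCNOBuf(GCNOBuff.take());
      if (!GF.read(GCNOBuf))               // builds functions and blocks
        return false;
      OwningPtr<MemoryBuffer> GCDABuff;
      if (MemoryBuffer::getFileOrSTDIN(GCDAPath, GCDABuff))
        return false;                      // could not read .gcda
      GCOVBuffer GCDABuf(GCDABuff.take());
      if (!GF.read(GCDABuf))               // overlays arc counts
        return false;
      FileInfo FI;
      GF.collectLineCounts(FI);            // prints annotated source
      return true;
    }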
diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp
index db008e0..4254fb2 100644
--- a/lib/VMCore/Globals.cpp
+++ b/lib/VMCore/Globals.cpp
@@ -80,7 +80,7 @@ bool GlobalValue::isDeclaration() const {
// GlobalVariable Implementation
//===----------------------------------------------------------------------===//
-GlobalVariable::GlobalVariable(const Type *Ty, bool constant, LinkageTypes Link,
+GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link,
Constant *InitVal, const Twine &Name,
bool ThreadLocal, unsigned AddressSpace)
: GlobalValue(PointerType::get(Ty, AddressSpace),
@@ -97,7 +97,7 @@ GlobalVariable::GlobalVariable(const Type *Ty, bool constant, LinkageTypes Link,
LeakDetector::addGarbageObject(this);
}
-GlobalVariable::GlobalVariable(Module &M, const Type *Ty, bool constant,
+GlobalVariable::GlobalVariable(Module &M, Type *Ty, bool constant,
LinkageTypes Link, Constant *InitVal,
const Twine &Name,
GlobalVariable *Before, bool ThreadLocal,
@@ -186,7 +186,7 @@ void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) {
// GlobalAlias Implementation
//===----------------------------------------------------------------------===//
-GlobalAlias::GlobalAlias(const Type *Ty, LinkageTypes Link,
+GlobalAlias::GlobalAlias(Type *Ty, LinkageTypes Link,
const Twine &Name, Constant* aliasee,
Module *ParentModule)
: GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name) {
@@ -235,7 +235,7 @@ const GlobalValue *GlobalAlias::getAliasedGlobal() const {
CE->getOpcode() == Instruction::GetElementPtr) &&
"Unsupported aliasee");
- return dyn_cast<GlobalValue>(CE->getOperand(0));
+ return cast<GlobalValue>(CE->getOperand(0));
}
const GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) const {
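The getAliasedGlobal change from dyn_cast to cast is small but deliberate: the assert directly above it guarantees the stripped aliasee operand is a GlobalValue, and cast<> enforces that invariant (aborting in asserts builds) instead of handing callers a null they never checked for. As a generic illustration — a hypothetical helper, not code from this import:

    #include "llvm/Constants.h"
    #include "llvm/GlobalValue.h"
    using namespace llvm;

    static GlobalValue *aliaseeOf(ConstantExpr *CE) {
      // cast<> documents (and, with assertions on, checks) the invariant;
      // dyn_cast<> would silently return null if it were ever violated.
      return cast<GlobalValue>(CE->getOperand(0));
    }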
diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp
index ffe961f..5114e2d 100644
--- a/lib/VMCore/IRBuilder.cpp
+++ b/lib/VMCore/IRBuilder.cpp
@@ -40,7 +40,7 @@ Type *IRBuilderBase::getCurrentFunctionReturnType() const {
}
Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) {
- const PointerType *PT = cast<PointerType>(Ptr->getType());
+ PointerType *PT = cast<PointerType>(Ptr->getType());
if (PT->getElementType()->isIntegerTy(8))
return Ptr;
diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp
index 4a03b39..736e370 100644
--- a/lib/VMCore/InlineAsm.cpp
+++ b/lib/VMCore/InlineAsm.cpp
@@ -25,7 +25,7 @@ InlineAsm::~InlineAsm() {
}
-InlineAsm *InlineAsm::get(const FunctionType *Ty, StringRef AsmString,
+InlineAsm *InlineAsm::get(FunctionType *Ty, StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
bool isAlignStack) {
InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack);
@@ -33,7 +33,7 @@ InlineAsm *InlineAsm::get(const FunctionType *Ty, StringRef AsmString,
return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(Ty), Key);
}
-InlineAsm::InlineAsm(const PointerType *Ty, const std::string &asmString,
+InlineAsm::InlineAsm(PointerType *Ty, const std::string &asmString,
const std::string &constraints, bool hasSideEffects,
bool isAlignStack)
: Value(Ty, Value::InlineAsmVal),
@@ -242,7 +242,7 @@ InlineAsm::ParseConstraints(StringRef Constraints) {
/// Verify - Verify that the specified constraint string is reasonable for the
/// specified function type, and otherwise validate the constraint string.
-bool InlineAsm::Verify(const FunctionType *Ty, StringRef ConstStr) {
+bool InlineAsm::Verify(FunctionType *Ty, StringRef ConstStr) {
if (Ty->isVarArg()) return false;
ConstraintInfoVector Constraints = ParseConstraints(ConstStr);
@@ -282,7 +282,7 @@ bool InlineAsm::Verify(const FunctionType *Ty, StringRef ConstStr) {
if (Ty->getReturnType()->isStructTy()) return false;
break;
default:
- const StructType *STy = dyn_cast<StructType>(Ty->getReturnType());
+ StructType *STy = dyn_cast<StructType>(Ty->getReturnType());
if (STy == 0 || STy->getNumElements() != NumOutputs)
return false;
break;
diff --git a/lib/VMCore/Instruction.cpp b/lib/VMCore/Instruction.cpp
index 02c0757..73191c1 100644
--- a/lib/VMCore/Instruction.cpp
+++ b/lib/VMCore/Instruction.cpp
@@ -20,7 +20,7 @@
#include "llvm/Support/LeakDetector.h"
using namespace llvm;
-Instruction::Instruction(const Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
Instruction *InsertBefore)
: User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
// Make sure that we get added to a basicblock
@@ -34,7 +34,7 @@ Instruction::Instruction(const Type *ty, unsigned it, Use *Ops, unsigned NumOps,
}
}
-Instruction::Instruction(const Type *ty, unsigned it, Use *Ops, unsigned NumOps,
+Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
BasicBlock *InsertAtEnd)
: User(ty, Value::InstructionVal + it, Ops, NumOps), Parent(0) {
// Make sure that we get added to a basicblock
@@ -101,6 +101,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
case Switch: return "switch";
case IndirectBr: return "indirectbr";
case Invoke: return "invoke";
+ case Resume: return "resume";
case Unwind: return "unwind";
case Unreachable: return "unreachable";
@@ -127,6 +128,9 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
case Alloca: return "alloca";
case Load: return "load";
case Store: return "store";
+ case AtomicCmpXchg: return "cmpxchg";
+ case AtomicRMW: return "atomicrmw";
+ case Fence: return "fence";
case GetElementPtr: return "getelementptr";
// Convert instructions...
@@ -158,6 +162,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
case ShuffleVector: return "shufflevector";
case ExtractValue: return "extractvalue";
case InsertValue: return "insertvalue";
+ case LandingPad: return "landingpad";
default: return "<Invalid operator> ";
}
@@ -191,10 +196,14 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const {
// Check special state that is a part of some instructions.
if (const LoadInst *LI = dyn_cast<LoadInst>(this))
return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
- LI->getAlignment() == cast<LoadInst>(I)->getAlignment();
+ LI->getAlignment() == cast<LoadInst>(I)->getAlignment() &&
+ LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
+ LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
if (const StoreInst *SI = dyn_cast<StoreInst>(this))
return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
- SI->getAlignment() == cast<StoreInst>(I)->getAlignment();
+ SI->getAlignment() == cast<StoreInst>(I)->getAlignment() &&
+ SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
+ SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
if (const CmpInst *CI = dyn_cast<CmpInst>(this))
return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
if (const CallInst *CI = dyn_cast<CallInst>(this))
@@ -208,6 +217,18 @@ bool Instruction::isIdenticalToWhenDefined(const Instruction *I) const {
return IVI->getIndices() == cast<InsertValueInst>(I)->getIndices();
if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this))
return EVI->getIndices() == cast<ExtractValueInst>(I)->getIndices();
+ if (const FenceInst *FI = dyn_cast<FenceInst>(this))
+ return FI->getOrdering() == cast<FenceInst>(I)->getOrdering() &&
+ FI->getSynchScope() == cast<FenceInst>(I)->getSynchScope();
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(this))
+ return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I)->isVolatile() &&
+ CXI->getOrdering() == cast<AtomicCmpXchgInst>(I)->getOrdering() &&
+ CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I)->getSynchScope();
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(this))
+ return RMWI->getOperation() == cast<AtomicRMWInst>(I)->getOperation() &&
+ RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() &&
+ RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() &&
+ RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope();
return true;
}
@@ -230,10 +251,14 @@ bool Instruction::isSameOperationAs(const Instruction *I) const {
// Check special state that is a part of some instructions.
if (const LoadInst *LI = dyn_cast<LoadInst>(this))
return LI->isVolatile() == cast<LoadInst>(I)->isVolatile() &&
- LI->getAlignment() == cast<LoadInst>(I)->getAlignment();
+ LI->getAlignment() == cast<LoadInst>(I)->getAlignment() &&
+ LI->getOrdering() == cast<LoadInst>(I)->getOrdering() &&
+ LI->getSynchScope() == cast<LoadInst>(I)->getSynchScope();
if (const StoreInst *SI = dyn_cast<StoreInst>(this))
return SI->isVolatile() == cast<StoreInst>(I)->isVolatile() &&
- SI->getAlignment() == cast<StoreInst>(I)->getAlignment();
+ SI->getAlignment() == cast<StoreInst>(I)->getAlignment() &&
+ SI->getOrdering() == cast<StoreInst>(I)->getOrdering() &&
+ SI->getSynchScope() == cast<StoreInst>(I)->getSynchScope();
if (const CmpInst *CI = dyn_cast<CmpInst>(this))
return CI->getPredicate() == cast<CmpInst>(I)->getPredicate();
if (const CallInst *CI = dyn_cast<CallInst>(this))
@@ -248,6 +273,18 @@ bool Instruction::isSameOperationAs(const Instruction *I) const {
return IVI->getIndices() == cast<InsertValueInst>(I)->getIndices();
if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(this))
return EVI->getIndices() == cast<ExtractValueInst>(I)->getIndices();
+ if (const FenceInst *FI = dyn_cast<FenceInst>(this))
+ return FI->getOrdering() == cast<FenceInst>(I)->getOrdering() &&
+ FI->getSynchScope() == cast<FenceInst>(I)->getSynchScope();
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(this))
+ return CXI->isVolatile() == cast<AtomicCmpXchgInst>(I)->isVolatile() &&
+ CXI->getOrdering() == cast<AtomicCmpXchgInst>(I)->getOrdering() &&
+ CXI->getSynchScope() == cast<AtomicCmpXchgInst>(I)->getSynchScope();
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(this))
+ return RMWI->getOperation() == cast<AtomicRMWInst>(I)->getOperation() &&
+ RMWI->isVolatile() == cast<AtomicRMWInst>(I)->isVolatile() &&
+ RMWI->getOrdering() == cast<AtomicRMWInst>(I)->getOrdering() &&
+ RMWI->getSynchScope() == cast<AtomicRMWInst>(I)->getSynchScope();
return true;
}
@@ -280,13 +317,16 @@ bool Instruction::mayReadFromMemory() const {
default: return false;
case Instruction::VAArg:
case Instruction::Load:
+ case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
return true;
case Instruction::Call:
return !cast<CallInst>(this)->doesNotAccessMemory();
case Instruction::Invoke:
return !cast<InvokeInst>(this)->doesNotAccessMemory();
case Instruction::Store:
- return cast<StoreInst>(this)->isVolatile();
+ return !cast<StoreInst>(this)->isUnordered();
}
}
@@ -295,15 +335,18 @@ bool Instruction::mayReadFromMemory() const {
bool Instruction::mayWriteToMemory() const {
switch (getOpcode()) {
default: return false;
+ case Instruction::Fence: // FIXME: refine definition of mayWriteToMemory
case Instruction::Store:
case Instruction::VAArg:
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
return true;
case Instruction::Call:
return !cast<CallInst>(this)->onlyReadsMemory();
case Instruction::Invoke:
return !cast<InvokeInst>(this)->onlyReadsMemory();
case Instruction::Load:
- return cast<LoadInst>(this)->isVolatile();
+ return !cast<LoadInst>(this)->isUnordered();
}
}
@@ -312,7 +355,7 @@ bool Instruction::mayWriteToMemory() const {
bool Instruction::mayThrow() const {
if (const CallInst *CI = dyn_cast<CallInst>(this))
return !CI->doesNotThrow();
- return false;
+ return isa<ResumeInst>(this);
}
/// isAssociative - Return true if the instruction is associative:
@@ -372,7 +415,7 @@ bool Instruction::isSafeToSpeculativelyExecute() const {
}
case Load: {
const LoadInst *LI = cast<LoadInst>(this);
- if (LI->isVolatile())
+ if (!LI->isUnordered())
return false;
return LI->getPointerOperand()->isDereferenceablePointer();
}
@@ -392,6 +435,11 @@ bool Instruction::isSafeToSpeculativelyExecute() const {
case Switch:
case Unwind:
case Unreachable:
+ case Fence:
+ case LandingPad:
+ case AtomicRMW:
+ case AtomicCmpXchg:
+ case Resume:
return false; // Misc instructions which have effects
}
}
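The Instruction.cpp predicates now key on isUnordered() rather than isVolatile(): an atomic load or store (even monotonic and non-volatile) must count as reading or writing memory, mayThrow now covers ResumeInst, and the new atomic and landingpad opcodes are excluded from speculation. A hedged sketch of why the distinction matters for a hoisting-style check — a hypothetical helper mirroring the Load case of isSafeToSpeculativelyExecute:

    #include "llvm/Instructions.h"
    using namespace llvm;

    static bool safeToHoistLoad(const LoadInst *LI) {
      // isUnordered() means not volatile and no ordering stronger than
      // Unordered; testing isVolatile() alone would wrongly admit
      // monotonic atomic loads.
      if (!LI->isUnordered())
        return false;
      return LI->getPointerOperand()->isDereferenceablePointer();
    }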
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 9baad09..b3a7205 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -62,11 +62,11 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
if (Op1->getType() != Op2->getType())
return "both values to select must have same type";
- if (const VectorType *VT = dyn_cast<VectorType>(Op0->getType())) {
+ if (VectorType *VT = dyn_cast<VectorType>(Op0->getType())) {
// Vector select.
if (VT->getElementType() != Type::getInt1Ty(Op0->getContext()))
return "vector select condition element type must be i1";
- const VectorType *ET = dyn_cast<VectorType>(Op1->getType());
+ VectorType *ET = dyn_cast<VectorType>(Op1->getType());
if (ET == 0)
return "selected values for vector select must be vectors";
if (ET->getNumElements() != VT->getNumElements())
@@ -166,6 +166,88 @@ Value *PHINode::hasConstantValue() const {
return ConstantValue;
}
+//===----------------------------------------------------------------------===//
+// LandingPadInst Implementation
+//===----------------------------------------------------------------------===//
+
+LandingPadInst::LandingPadInst(Type *RetTy, Value *PersonalityFn,
+ unsigned NumReservedValues, const Twine &NameStr,
+ Instruction *InsertBefore)
+ : Instruction(RetTy, Instruction::LandingPad, 0, 0, InsertBefore) {
+ init(PersonalityFn, 1 + NumReservedValues, NameStr);
+}
+
+LandingPadInst::LandingPadInst(Type *RetTy, Value *PersonalityFn,
+ unsigned NumReservedValues, const Twine &NameStr,
+ BasicBlock *InsertAtEnd)
+ : Instruction(RetTy, Instruction::LandingPad, 0, 0, InsertAtEnd) {
+ init(PersonalityFn, 1 + NumReservedValues, NameStr);
+}
+
+LandingPadInst::LandingPadInst(const LandingPadInst &LP)
+ : Instruction(LP.getType(), Instruction::LandingPad,
+ allocHungoffUses(LP.getNumOperands()), LP.getNumOperands()),
+ ReservedSpace(LP.getNumOperands()) {
+ Use *OL = OperandList, *InOL = LP.OperandList;
+ for (unsigned I = 0, E = ReservedSpace; I != E; ++I)
+ OL[I] = InOL[I];
+
+ setCleanup(LP.isCleanup());
+}
+
+LandingPadInst::~LandingPadInst() {
+ dropHungoffUses();
+}
+
+LandingPadInst *LandingPadInst::Create(Type *RetTy, Value *PersonalityFn,
+ unsigned NumReservedClauses,
+ const Twine &NameStr,
+ Instruction *InsertBefore) {
+ return new LandingPadInst(RetTy, PersonalityFn, NumReservedClauses, NameStr,
+ InsertBefore);
+}
+
+LandingPadInst *LandingPadInst::Create(Type *RetTy, Value *PersonalityFn,
+ unsigned NumReservedClauses,
+ const Twine &NameStr,
+ BasicBlock *InsertAtEnd) {
+ return new LandingPadInst(RetTy, PersonalityFn, NumReservedClauses, NameStr,
+ InsertAtEnd);
+}
+
+void LandingPadInst::init(Value *PersFn, unsigned NumReservedValues,
+ const Twine &NameStr) {
+ ReservedSpace = NumReservedValues;
+ NumOperands = 1;
+ OperandList = allocHungoffUses(ReservedSpace);
+ OperandList[0] = PersFn;
+ setName(NameStr);
+ setCleanup(false);
+}
+
+/// growOperands - This grows the operand list in response to a push_back
+/// style of operation. It grows the number of ops by a factor of two.
+void LandingPadInst::growOperands(unsigned Size) {
+ unsigned e = getNumOperands();
+ if (ReservedSpace >= e + Size) return;
+ ReservedSpace = (e + Size / 2) * 2;
+
+ Use *NewOps = allocHungoffUses(ReservedSpace);
+ Use *OldOps = OperandList;
+ for (unsigned i = 0; i != e; ++i)
+ NewOps[i] = OldOps[i];
+
+ OperandList = NewOps;
+ Use::zap(OldOps, OldOps + e, true);
+}
+
+void LandingPadInst::addClause(Value *Val) {
+ unsigned OpNo = getNumOperands();
+ growOperands(1);
+ assert(OpNo < ReservedSpace && "Growing didn't work!");
+ ++NumOperands;
+ OperandList[OpNo] = Val;
+}
//===----------------------------------------------------------------------===//
// CallInst Implementation
@@ -179,7 +261,7 @@ void CallInst::init(Value *Func, ArrayRef<Value *> Args, const Twine &NameStr) {
Op<-1>() = Func;
#ifndef NDEBUG
- const FunctionType *FTy =
+ FunctionType *FTy =
cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
assert((Args.size() == FTy->getNumParams() ||
@@ -201,7 +283,7 @@ void CallInst::init(Value *Func, const Twine &NameStr) {
Op<-1>() = Func;
#ifndef NDEBUG
- const FunctionType *FTy =
+ FunctionType *FTy =
cast<FunctionType>(cast<PointerType>(Func->getType())->getElementType());
assert(FTy->getNumParams() == 0 && "Calling a function with bad signature");
@@ -269,8 +351,8 @@ static bool IsConstantOne(Value *val) {
}
static Instruction *createMalloc(Instruction *InsertBefore,
- BasicBlock *InsertAtEnd, const Type *IntPtrTy,
- const Type *AllocTy, Value *AllocSize,
+ BasicBlock *InsertAtEnd, Type *IntPtrTy,
+ Type *AllocTy, Value *AllocSize,
Value *ArraySize, Function *MallocF,
const Twine &Name) {
assert(((!InsertBefore && InsertAtEnd) || (InsertBefore && !InsertAtEnd)) &&
@@ -319,7 +401,7 @@ static Instruction *createMalloc(Instruction *InsertBefore,
if (!MallocFunc)
// prototype malloc as "void *malloc(size_t)"
MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, NULL);
- const PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
+ PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
CallInst *MCall = NULL;
Instruction *Result = NULL;
if (InsertBefore) {
@@ -354,7 +436,7 @@ static Instruction *createMalloc(Instruction *InsertBefore,
/// 2. Call malloc with that argument.
/// 3. Bitcast the result of the malloc call to the specified type.
Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
- const Type *IntPtrTy, const Type *AllocTy,
+ Type *IntPtrTy, Type *AllocTy,
Value *AllocSize, Value *ArraySize,
Function * MallocF,
const Twine &Name) {
@@ -371,7 +453,7 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore,
/// Note: This function does not add the bitcast to the basic block, that is the
/// responsibility of the caller.
Instruction *CallInst::CreateMalloc(BasicBlock *InsertAtEnd,
- const Type *IntPtrTy, const Type *AllocTy,
+ Type *IntPtrTy, Type *AllocTy,
Value *AllocSize, Value *ArraySize,
Function *MallocF, const Twine &Name) {
return createMalloc(NULL, InsertAtEnd, IntPtrTy, AllocTy, AllocSize,
@@ -388,8 +470,8 @@ static Instruction* createFree(Value* Source, Instruction *InsertBefore,
BasicBlock* BB = InsertBefore ? InsertBefore->getParent() : InsertAtEnd;
Module* M = BB->getParent()->getParent();
- const Type *VoidTy = Type::getVoidTy(M->getContext());
- const Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
+ Type *VoidTy = Type::getVoidTy(M->getContext());
+ Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
// prototype free as "void free(void*)"
Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, NULL);
CallInst* Result = NULL;
@@ -436,7 +518,7 @@ void InvokeInst::init(Value *Fn, BasicBlock *IfNormal, BasicBlock *IfException,
Op<-1>() = IfException;
#ifndef NDEBUG
- const FunctionType *FTy =
+ FunctionType *FTy =
cast<FunctionType>(cast<PointerType>(Fn->getType())->getElementType());
assert(((Args.size() == FTy->getNumParams()) ||
@@ -494,6 +576,9 @@ void InvokeInst::removeAttribute(unsigned i, Attributes attr) {
setAttributes(PAL);
}
+LandingPadInst *InvokeInst::getLandingPadInst() const {
+ return cast<LandingPadInst>(getUnwindDest()->getFirstNonPHI());
+}
//===----------------------------------------------------------------------===//
// ReturnInst Implementation
@@ -574,6 +659,41 @@ BasicBlock *UnwindInst::getSuccessorV(unsigned idx) const {
}
//===----------------------------------------------------------------------===//
+// ResumeInst Implementation
+//===----------------------------------------------------------------------===//
+
+ResumeInst::ResumeInst(const ResumeInst &RI)
+ : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Resume,
+ OperandTraits<ResumeInst>::op_begin(this), 1) {
+ Op<0>() = RI.Op<0>();
+}
+
+ResumeInst::ResumeInst(Value *Exn, Instruction *InsertBefore)
+ : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+ OperandTraits<ResumeInst>::op_begin(this), 1, InsertBefore) {
+ Op<0>() = Exn;
+}
+
+ResumeInst::ResumeInst(Value *Exn, BasicBlock *InsertAtEnd)
+ : TerminatorInst(Type::getVoidTy(Exn->getContext()), Instruction::Resume,
+ OperandTraits<ResumeInst>::op_begin(this), 1, InsertAtEnd) {
+ Op<0>() = Exn;
+}
+
+unsigned ResumeInst::getNumSuccessorsV() const {
+ return getNumSuccessors();
+}
+
+void ResumeInst::setSuccessorV(unsigned idx, BasicBlock *NewSucc) {
+ llvm_unreachable("ResumeInst has no successors!");
+}
+
+BasicBlock *ResumeInst::getSuccessorV(unsigned idx) const {
+ llvm_unreachable("ResumeInst has no successors!");
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
// UnreachableInst Implementation
//===----------------------------------------------------------------------===//
@@ -665,6 +785,27 @@ BranchInst::BranchInst(const BranchInst &BI) :
SubclassOptionalData = BI.SubclassOptionalData;
}
+void BranchInst::swapSuccessors() {
+ assert(isConditional() &&
+ "Cannot swap successors of an unconditional branch");
+ Op<-1>().swap(Op<-2>());
+
+ // Update profile metadata if present and it matches our structural
+ // expectations.
+ MDNode *ProfileData = getMetadata(LLVMContext::MD_prof);
+ if (!ProfileData || ProfileData->getNumOperands() != 3)
+ return;
+
+ // The first operand is the name. Fetch them backwards and build a new one.
+ Value *Ops[] = {
+ ProfileData->getOperand(0),
+ ProfileData->getOperand(2),
+ ProfileData->getOperand(1)
+ };
+ setMetadata(LLVMContext::MD_prof,
+ MDNode::get(ProfileData->getContext(), Ops));
+}
+
BasicBlock *BranchInst::getSuccessorV(unsigned idx) const {
return getSuccessor(idx);
}
@@ -692,7 +833,7 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
return Amt;
}
-AllocaInst::AllocaInst(const Type *Ty, Value *ArraySize,
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize,
const Twine &Name, Instruction *InsertBefore)
: UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
getAISize(Ty->getContext(), ArraySize), InsertBefore) {
@@ -701,7 +842,7 @@ AllocaInst::AllocaInst(const Type *Ty, Value *ArraySize,
setName(Name);
}
-AllocaInst::AllocaInst(const Type *Ty, Value *ArraySize,
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize,
const Twine &Name, BasicBlock *InsertAtEnd)
: UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
getAISize(Ty->getContext(), ArraySize), InsertAtEnd) {
@@ -710,7 +851,7 @@ AllocaInst::AllocaInst(const Type *Ty, Value *ArraySize,
setName(Name);
}
-AllocaInst::AllocaInst(const Type *Ty, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, const Twine &Name,
Instruction *InsertBefore)
: UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
getAISize(Ty->getContext(), 0), InsertBefore) {
@@ -719,7 +860,7 @@ AllocaInst::AllocaInst(const Type *Ty, const Twine &Name,
setName(Name);
}
-AllocaInst::AllocaInst(const Type *Ty, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, const Twine &Name,
BasicBlock *InsertAtEnd)
: UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
getAISize(Ty->getContext(), 0), InsertAtEnd) {
@@ -728,7 +869,7 @@ AllocaInst::AllocaInst(const Type *Ty, const Twine &Name,
setName(Name);
}
-AllocaInst::AllocaInst(const Type *Ty, Value *ArraySize, unsigned Align,
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
const Twine &Name, Instruction *InsertBefore)
: UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
getAISize(Ty->getContext(), ArraySize), InsertBefore) {
@@ -737,7 +878,7 @@ AllocaInst::AllocaInst(const Type *Ty, Value *ArraySize, unsigned Align,
setName(Name);
}
-AllocaInst::AllocaInst(const Type *Ty, Value *ArraySize, unsigned Align,
+AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
const Twine &Name, BasicBlock *InsertAtEnd)
: UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
getAISize(Ty->getContext(), ArraySize), InsertAtEnd) {
@@ -787,6 +928,8 @@ bool AllocaInst::isStaticAlloca() const {
void LoadInst::AssertOK() {
assert(getOperand(0)->getType()->isPointerTy() &&
"Ptr must have pointer type.");
+ assert(!(isAtomic() && getAlignment() == 0) &&
+ "Alignment required for atomic load");
}
LoadInst::LoadInst(Value *Ptr, const Twine &Name, Instruction *InsertBef)
@@ -794,6 +937,7 @@ LoadInst::LoadInst(Value *Ptr, const Twine &Name, Instruction *InsertBef)
Load, Ptr, InsertBef) {
setVolatile(false);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
setName(Name);
}
@@ -803,6 +947,7 @@ LoadInst::LoadInst(Value *Ptr, const Twine &Name, BasicBlock *InsertAE)
Load, Ptr, InsertAE) {
setVolatile(false);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
setName(Name);
}
@@ -813,6 +958,18 @@ LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
Load, Ptr, InsertBef) {
setVolatile(isVolatile);
setAlignment(0);
+ setAtomic(NotAtomic);
+ AssertOK();
+ setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
+ BasicBlock *InsertAE)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertAE) {
+ setVolatile(isVolatile);
+ setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
setName(Name);
}
@@ -823,6 +980,7 @@ LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
Load, Ptr, InsertBef) {
setVolatile(isVolatile);
setAlignment(Align);
+ setAtomic(NotAtomic);
AssertOK();
setName(Name);
}
@@ -833,27 +991,43 @@ LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
Load, Ptr, InsertAE) {
setVolatile(isVolatile);
setAlignment(Align);
+ setAtomic(NotAtomic);
AssertOK();
setName(Name);
}
-LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
+ unsigned Align, AtomicOrdering Order,
+ SynchronizationScope SynchScope,
+ Instruction *InsertBef)
+ : UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
+ Load, Ptr, InsertBef) {
+ setVolatile(isVolatile);
+ setAlignment(Align);
+ setAtomic(Order, SynchScope);
+ AssertOK();
+ setName(Name);
+}
+
+LoadInst::LoadInst(Value *Ptr, const Twine &Name, bool isVolatile,
+ unsigned Align, AtomicOrdering Order,
+ SynchronizationScope SynchScope,
BasicBlock *InsertAE)
: UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
Load, Ptr, InsertAE) {
setVolatile(isVolatile);
- setAlignment(0);
+ setAlignment(Align);
+ setAtomic(Order, SynchScope);
AssertOK();
setName(Name);
}
-
-
LoadInst::LoadInst(Value *Ptr, const char *Name, Instruction *InsertBef)
: UnaryInstruction(cast<PointerType>(Ptr->getType())->getElementType(),
Load, Ptr, InsertBef) {
setVolatile(false);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -863,6 +1037,7 @@ LoadInst::LoadInst(Value *Ptr, const char *Name, BasicBlock *InsertAE)
Load, Ptr, InsertAE) {
setVolatile(false);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -873,6 +1048,7 @@ LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile,
Load, Ptr, InsertBef) {
setVolatile(isVolatile);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -883,6 +1059,7 @@ LoadInst::LoadInst(Value *Ptr, const char *Name, bool isVolatile,
Load, Ptr, InsertAE) {
setVolatile(isVolatile);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
if (Name && Name[0]) setName(Name);
}
@@ -891,7 +1068,7 @@ void LoadInst::setAlignment(unsigned Align) {
assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
assert(Align <= MaximumAlignment &&
"Alignment is greater than MaximumAlignment!");
- setInstructionSubclassData((getSubclassDataFromInstruction() & 1) |
+ setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) |
((Log2_32(Align)+1)<<1));
assert(getAlignment() == Align && "Alignment representation error!");
}
@@ -907,6 +1084,8 @@ void StoreInst::AssertOK() {
assert(getOperand(0)->getType() ==
cast<PointerType>(getOperand(1)->getType())->getElementType()
&& "Ptr must be a pointer to Val type!");
+ assert(!(isAtomic() && getAlignment() == 0) &&
+ "Alignment required for atomic load");
}
@@ -919,6 +1098,7 @@ StoreInst::StoreInst(Value *val, Value *addr, Instruction *InsertBefore)
Op<1>() = addr;
setVolatile(false);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
}
@@ -931,6 +1111,7 @@ StoreInst::StoreInst(Value *val, Value *addr, BasicBlock *InsertAtEnd)
Op<1>() = addr;
setVolatile(false);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
}
@@ -944,6 +1125,7 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
Op<1>() = addr;
setVolatile(isVolatile);
setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
}
@@ -957,6 +1139,37 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
Op<1>() = addr;
setVolatile(isVolatile);
setAlignment(Align);
+ setAtomic(NotAtomic);
+ AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+ unsigned Align, AtomicOrdering Order,
+ SynchronizationScope SynchScope,
+ Instruction *InsertBefore)
+ : Instruction(Type::getVoidTy(val->getContext()), Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertBefore) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(isVolatile);
+ setAlignment(Align);
+ setAtomic(Order, SynchScope);
+ AssertOK();
+}
+
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+ BasicBlock *InsertAtEnd)
+ : Instruction(Type::getVoidTy(val->getContext()), Store,
+ OperandTraits<StoreInst>::op_begin(this),
+ OperandTraits<StoreInst>::operands(this),
+ InsertAtEnd) {
+ Op<0>() = val;
+ Op<1>() = addr;
+ setVolatile(isVolatile);
+ setAlignment(0);
+ setAtomic(NotAtomic);
AssertOK();
}
@@ -970,10 +1183,13 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
Op<1>() = addr;
setVolatile(isVolatile);
setAlignment(Align);
+ setAtomic(NotAtomic);
AssertOK();
}
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
+ unsigned Align, AtomicOrdering Order,
+ SynchronizationScope SynchScope,
BasicBlock *InsertAtEnd)
: Instruction(Type::getVoidTy(val->getContext()), Store,
OperandTraits<StoreInst>::op_begin(this),
@@ -982,7 +1198,8 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
Op<0>() = val;
Op<1>() = addr;
setVolatile(isVolatile);
- setAlignment(0);
+ setAlignment(Align);
+ setAtomic(Order, SynchScope);
AssertOK();
}
@@ -990,37 +1207,135 @@ void StoreInst::setAlignment(unsigned Align) {
assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!");
assert(Align <= MaximumAlignment &&
"Alignment is greater than MaximumAlignment!");
- setInstructionSubclassData((getSubclassDataFromInstruction() & 1) |
+ setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) |
((Log2_32(Align)+1) << 1));
assert(getAlignment() == Align && "Alignment representation error!");
}
//===----------------------------------------------------------------------===//
-// GetElementPtrInst Implementation
+// AtomicCmpXchgInst Implementation
//===----------------------------------------------------------------------===//
-static unsigned retrieveAddrSpace(const Value *Val) {
- return cast<PointerType>(Val->getType())->getAddressSpace();
+void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ Op<0>() = Ptr;
+ Op<1>() = Cmp;
+ Op<2>() = NewVal;
+ setOrdering(Ordering);
+ setSynchScope(SynchScope);
+
+ assert(getOperand(0) && getOperand(1) && getOperand(2) &&
+ "All operands must be non-null!");
+ assert(getOperand(0)->getType()->isPointerTy() &&
+ "Ptr must have pointer type!");
+ assert(getOperand(1)->getType() ==
+ cast<PointerType>(getOperand(0)->getType())->getElementType()
+ && "Ptr must be a pointer to Cmp type!");
+ assert(getOperand(2)->getType() ==
+ cast<PointerType>(getOperand(0)->getType())->getElementType()
+ && "Ptr must be a pointer to NewVal type!");
+ assert(Ordering != NotAtomic &&
+ "AtomicCmpXchg instructions must be atomic!");
+}
+
+AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope,
+ Instruction *InsertBefore)
+ : Instruction(Cmp->getType(), AtomicCmpXchg,
+ OperandTraits<AtomicCmpXchgInst>::op_begin(this),
+ OperandTraits<AtomicCmpXchgInst>::operands(this),
+ InsertBefore) {
+ Init(Ptr, Cmp, NewVal, Ordering, SynchScope);
}
-void GetElementPtrInst::init(Value *Ptr, Value* const *Idx, unsigned NumIdx,
- const Twine &Name) {
- assert(NumOperands == 1+NumIdx && "NumOperands not initialized?");
- Use *OL = OperandList;
- OL[0] = Ptr;
+AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope,
+ BasicBlock *InsertAtEnd)
+ : Instruction(Cmp->getType(), AtomicCmpXchg,
+ OperandTraits<AtomicCmpXchgInst>::op_begin(this),
+ OperandTraits<AtomicCmpXchgInst>::operands(this),
+ InsertAtEnd) {
+ Init(Ptr, Cmp, NewVal, Ordering, SynchScope);
+}
+
+//===----------------------------------------------------------------------===//
+// AtomicRMWInst Implementation
+//===----------------------------------------------------------------------===//
- for (unsigned i = 0; i != NumIdx; ++i)
- OL[i+1] = Idx[i];
+void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ Op<0>() = Ptr;
+ Op<1>() = Val;
+ setOperation(Operation);
+ setOrdering(Ordering);
+ setSynchScope(SynchScope);
- setName(Name);
+ assert(getOperand(0) && getOperand(1) &&
+ "All operands must be non-null!");
+ assert(getOperand(0)->getType()->isPointerTy() &&
+ "Ptr must have pointer type!");
+ assert(getOperand(1)->getType() ==
+ cast<PointerType>(getOperand(0)->getType())->getElementType()
+ && "Ptr must be a pointer to Val type!");
+ assert(Ordering != NotAtomic &&
+ "AtomicRMW instructions must be atomic!");
}
-void GetElementPtrInst::init(Value *Ptr, Value *Idx, const Twine &Name) {
- assert(NumOperands == 2 && "NumOperands not initialized?");
- Use *OL = OperandList;
- OL[0] = Ptr;
- OL[1] = Idx;
+AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope,
+ Instruction *InsertBefore)
+ : Instruction(Val->getType(), AtomicRMW,
+ OperandTraits<AtomicRMWInst>::op_begin(this),
+ OperandTraits<AtomicRMWInst>::operands(this),
+ InsertBefore) {
+ Init(Operation, Ptr, Val, Ordering, SynchScope);
+}
+
+AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope,
+ BasicBlock *InsertAtEnd)
+ : Instruction(Val->getType(), AtomicRMW,
+ OperandTraits<AtomicRMWInst>::op_begin(this),
+ OperandTraits<AtomicRMWInst>::operands(this),
+ InsertAtEnd) {
+ Init(Operation, Ptr, Val, Ordering, SynchScope);
+}
+
+//===----------------------------------------------------------------------===//
+// FenceInst Implementation
+//===----------------------------------------------------------------------===//
+
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
+ SynchronizationScope SynchScope,
+ Instruction *InsertBefore)
+ : Instruction(Type::getVoidTy(C), Fence, 0, 0, InsertBefore) {
+ setOrdering(Ordering);
+ setSynchScope(SynchScope);
+}
+FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering,
+ SynchronizationScope SynchScope,
+ BasicBlock *InsertAtEnd)
+ : Instruction(Type::getVoidTy(C), Fence, 0, 0, InsertAtEnd) {
+ setOrdering(Ordering);
+ setSynchScope(SynchScope);
+}
+
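// A minimal sketch of driving the constructors added above, against the
// 3.0-era headers; BB, Ptr, Val and Expected are hypothetical (Ptr an i32*,
// the others i32 values). Each instruction is appended to BB by the
// InsertAtEnd form of its constructor.
#include "llvm/Instructions.h"
using namespace llvm;

void emitAtomicOps(BasicBlock *BB, Value *Ptr, Value *Val, Value *Expected) {
  // atomicrmw add i32* %Ptr, i32 %Val seq_cst
  new AtomicRMWInst(AtomicRMWInst::Add, Ptr, Val,
                    SequentiallyConsistent, CrossThread, BB);
  // cmpxchg i32* %Ptr, i32 %Expected, i32 %Val seq_cst
  new AtomicCmpXchgInst(Ptr, Expected, Val,
                        SequentiallyConsistent, CrossThread, BB);
  // fence seq_cst
  new FenceInst(BB->getContext(), SequentiallyConsistent, CrossThread, BB);
}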
+//===----------------------------------------------------------------------===//
+// GetElementPtrInst Implementation
+//===----------------------------------------------------------------------===//
+
+void GetElementPtrInst::init(Value *Ptr, ArrayRef<Value *> IdxList,
+ const Twine &Name) {
+ assert(NumOperands == 1 + IdxList.size() && "NumOperands not initialized?");
+ OperandList[0] = Ptr;
+ std::copy(IdxList.begin(), IdxList.end(), op_begin() + 1);
setName(Name);
}
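// Caller-side view of the ArrayRef migration, assuming the matching
// GetElementPtrInst::Create overload from the 3.0 headers; GEPBase, Idx0,
// Idx1 and InsertBefore are hypothetical.
#include "llvm/Instructions.h"
using namespace llvm;

Instruction *buildGEP(Value *GEPBase, Value *Idx0, Value *Idx1,
                      Instruction *InsertBefore) {
  Value *Idxs[] = { Idx0, Idx1 };  // converts implicitly to ArrayRef<Value*>
  // Pre-3.0 spelling was Create(GEPBase, Idxs, Idxs + 2, "gep", InsertBefore).
  return GetElementPtrInst::Create(GEPBase, Idxs, "gep", InsertBefore);
}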
@@ -1029,34 +1344,10 @@ GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
OperandTraits<GetElementPtrInst>::op_end(this)
- GEPI.getNumOperands(),
GEPI.getNumOperands()) {
- Use *OL = OperandList;
- Use *GEPIOL = GEPI.OperandList;
- for (unsigned i = 0, E = NumOperands; i != E; ++i)
- OL[i] = GEPIOL[i];
+ std::copy(GEPI.op_begin(), GEPI.op_end(), op_begin());
SubclassOptionalData = GEPI.SubclassOptionalData;
}
-GetElementPtrInst::GetElementPtrInst(Value *Ptr, Value *Idx,
- const Twine &Name, Instruction *InBe)
- : Instruction(PointerType::get(
- checkGEPType(getIndexedType(Ptr->getType(),Idx)), retrieveAddrSpace(Ptr)),
- GetElementPtr,
- OperandTraits<GetElementPtrInst>::op_end(this) - 2,
- 2, InBe) {
- init(Ptr, Idx, Name);
-}
-
-GetElementPtrInst::GetElementPtrInst(Value *Ptr, Value *Idx,
- const Twine &Name, BasicBlock *IAE)
- : Instruction(PointerType::get(
- checkGEPType(getIndexedType(Ptr->getType(),Idx)),
- retrieveAddrSpace(Ptr)),
- GetElementPtr,
- OperandTraits<GetElementPtrInst>::op_end(this) - 2,
- 2, IAE) {
- init(Ptr, Idx, Name);
-}
-
/// getIndexedType - Returns the type of the element that would be accessed with
/// a gep instruction with the specified parameters.
///
@@ -1067,14 +1358,13 @@ GetElementPtrInst::GetElementPtrInst(Value *Ptr, Value *Idx,
/// pointer type.
///
template <typename IndexTy>
-static Type *getIndexedTypeInternal(const Type *Ptr, IndexTy const *Idxs,
- unsigned NumIdx) {
- const PointerType *PTy = dyn_cast<PointerType>(Ptr);
+static Type *getIndexedTypeInternal(Type *Ptr, ArrayRef<IndexTy> IdxList) {
+ PointerType *PTy = dyn_cast<PointerType>(Ptr);
if (!PTy) return 0; // Type isn't a pointer type!
Type *Agg = PTy->getElementType();
// Handle the special case of the empty set index set, which is always valid.
- if (NumIdx == 0)
+ if (IdxList.empty())
return Agg;
// If there is at least one index, the top level type must be sized, otherwise
@@ -1083,44 +1373,29 @@ static Type *getIndexedTypeInternal(const Type *Ptr, IndexTy const *Idxs,
return 0;
unsigned CurIdx = 1;
- for (; CurIdx != NumIdx; ++CurIdx) {
+ for (; CurIdx != IdxList.size(); ++CurIdx) {
CompositeType *CT = dyn_cast<CompositeType>(Agg);
if (!CT || CT->isPointerTy()) return 0;
- IndexTy Index = Idxs[CurIdx];
+ IndexTy Index = IdxList[CurIdx];
if (!CT->indexValid(Index)) return 0;
Agg = CT->getTypeAtIndex(Index);
}
- return CurIdx == NumIdx ? Agg : 0;
+ return CurIdx == IdxList.size() ? Agg : 0;
}
-Type *GetElementPtrInst::getIndexedType(const Type *Ptr, Value* const *Idxs,
- unsigned NumIdx) {
- return getIndexedTypeInternal(Ptr, Idxs, NumIdx);
+Type *GetElementPtrInst::getIndexedType(Type *Ptr, ArrayRef<Value *> IdxList) {
+ return getIndexedTypeInternal(Ptr, IdxList);
}
-Type *GetElementPtrInst::getIndexedType(const Type *Ptr,
- Constant* const *Idxs,
- unsigned NumIdx) {
- return getIndexedTypeInternal(Ptr, Idxs, NumIdx);
+Type *GetElementPtrInst::getIndexedType(Type *Ptr,
+ ArrayRef<Constant *> IdxList) {
+ return getIndexedTypeInternal(Ptr, IdxList);
}
-Type *GetElementPtrInst::getIndexedType(const Type *Ptr,
- uint64_t const *Idxs,
- unsigned NumIdx) {
- return getIndexedTypeInternal(Ptr, Idxs, NumIdx);
+Type *GetElementPtrInst::getIndexedType(Type *Ptr, ArrayRef<uint64_t> IdxList) {
+ return getIndexedTypeInternal(Ptr, IdxList);
}
-Type *GetElementPtrInst::getIndexedType(const Type *Ptr, Value *Idx) {
- const PointerType *PTy = dyn_cast<PointerType>(Ptr);
- if (!PTy) return 0; // Type isn't a pointer type!
-
- // Check the pointer index.
- if (!PTy->indexValid(Idx)) return 0;
-
- return PTy->getElementType();
-}
-
-
/// hasAllZeroIndices - Return true if all of the indices of this GEP are
/// zeros. If so, the result pointer and the first operand have the same
/// value, just potentially different types.
@@ -1286,13 +1561,13 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
if (!V1->getType()->isVectorTy() || V1->getType() != V2->getType())
return false;
- const VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
+ VectorType *MaskTy = dyn_cast<VectorType>(Mask->getType());
if (MaskTy == 0 || !MaskTy->getElementType()->isIntegerTy(32))
return false;
// Check to see if Mask is valid.
if (const ConstantVector *MV = dyn_cast<ConstantVector>(Mask)) {
- const VectorType *VTy = cast<VectorType>(V1->getType());
+ VectorType *VTy = cast<VectorType>(V1->getType());
for (unsigned i = 0, e = MV->getNumOperands(); i != e; ++i) {
if (ConstantInt* CI = dyn_cast<ConstantInt>(MV->getOperand(i))) {
if (CI->uge(VTy->getNumElements()*2))
@@ -1382,7 +1657,7 @@ ExtractValueInst::ExtractValueInst(const ExtractValueInst &EVI)
// A null type is returned if the indices are invalid for the specified
// aggregate type.
//
-Type *ExtractValueInst::getIndexedType(const Type *Agg,
+Type *ExtractValueInst::getIndexedType(Type *Agg,
ArrayRef<unsigned> Idxs) {
for (unsigned CurIdx = 0; CurIdx != Idxs.size(); ++CurIdx) {
unsigned Index = Idxs[CurIdx];
@@ -1392,10 +1667,10 @@ Type *ExtractValueInst::getIndexedType(const Type *Agg,
// insertvalue we need to check array indexing manually.
// Since the only other types we can index into are struct types it's just
// as easy to check those manually as well.
- if (const ArrayType *AT = dyn_cast<ArrayType>(Agg)) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(Agg)) {
if (Index >= AT->getNumElements())
return 0;
- } else if (const StructType *ST = dyn_cast<StructType>(Agg)) {
+ } else if (StructType *ST = dyn_cast<StructType>(Agg)) {
if (Index >= ST->getNumElements())
return 0;
} else {
@@ -1413,7 +1688,7 @@ Type *ExtractValueInst::getIndexedType(const Type *Agg,
//===----------------------------------------------------------------------===//
BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
- const Type *Ty, const Twine &Name,
+ Type *Ty, const Twine &Name,
Instruction *InsertBefore)
: Instruction(Ty, iType,
OperandTraits<BinaryOperator>::op_begin(this),
@@ -1426,7 +1701,7 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
}
BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2,
- const Type *Ty, const Twine &Name,
+ Type *Ty, const Twine &Name,
BasicBlock *InsertAtEnd)
: Instruction(Ty, iType,
OperandTraits<BinaryOperator>::op_begin(this),
@@ -1589,7 +1864,7 @@ BinaryOperator *BinaryOperator::CreateFNeg(Value *Op, const Twine &Name,
BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
Instruction *InsertBefore) {
Constant *C;
- if (const VectorType *PTy = dyn_cast<VectorType>(Op->getType())) {
+ if (VectorType *PTy = dyn_cast<VectorType>(Op->getType())) {
C = Constant::getAllOnesValue(PTy->getElementType());
C = ConstantVector::get(
std::vector<Constant*>(PTy->getNumElements(), C));
@@ -1604,7 +1879,7 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
BasicBlock *InsertAtEnd) {
Constant *AllOnes;
- if (const VectorType *PTy = dyn_cast<VectorType>(Op->getType())) {
+ if (VectorType *PTy = dyn_cast<VectorType>(Op->getType())) {
// Create a vector of all ones values.
Constant *Elt = Constant::getAllOnesValue(PTy->getElementType());
AllOnes = ConstantVector::get(
@@ -1743,8 +2018,8 @@ bool CastInst::isLosslessCast() const {
return false;
// Identity cast is always lossless
- const Type* SrcTy = getOperand(0)->getType();
- const Type* DstTy = getType();
+ Type* SrcTy = getOperand(0)->getType();
+ Type* DstTy = getType();
if (SrcTy == DstTy)
return true;
@@ -1763,12 +2038,12 @@ bool CastInst::isLosslessCast() const {
/// # ptrtoint i32* %x to i32 ; on 32-bit platforms only
/// @brief Determine if the described cast is a no-op.
bool CastInst::isNoopCast(Instruction::CastOps Opcode,
- const Type *SrcTy,
- const Type *DestTy,
- const Type *IntPtrTy) {
+ Type *SrcTy,
+ Type *DestTy,
+ Type *IntPtrTy) {
switch (Opcode) {
default:
- assert(!"Invalid CastOp");
+ assert(0 && "Invalid CastOp");
case Instruction::Trunc:
case Instruction::ZExt:
case Instruction::SExt:
@@ -1791,7 +2066,7 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode,
}
/// @brief Determine if a cast is a no-op.
-bool CastInst::isNoopCast(const Type *IntPtrTy) const {
+bool CastInst::isNoopCast(Type *IntPtrTy) const {
return isNoopCast(getOpcode(), getOperand(0)->getType(), getType(), IntPtrTy);
}
@@ -1805,8 +2080,7 @@ bool CastInst::isNoopCast(const Type *IntPtrTy) const {
/// If no such cast is permitted, the function returns 0.
unsigned CastInst::isEliminableCastPair(
Instruction::CastOps firstOp, Instruction::CastOps secondOp,
- const Type *SrcTy, const Type *MidTy, const Type *DstTy, const Type *IntPtrTy)
-{
+ Type *SrcTy, Type *MidTy, Type *DstTy, Type *IntPtrTy) {
// Define the 144 possibilities for these two cast instructions. The values
// in this matrix determine what to do in a given situation and select the
// case in the switch below. The rows correspond to firstOp, the columns
@@ -1859,12 +2133,16 @@ unsigned CastInst::isEliminableCastPair(
};
// If either of the casts is a bitcast from scalar to vector, disallow the
- // merging.
- if ((firstOp == Instruction::BitCast &&
- isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) ||
- (secondOp == Instruction::BitCast &&
- isa<VectorType>(MidTy) != isa<VectorType>(DstTy)))
- return 0; // Disallowed
+ // merging. However, a bitcast of A->B->A is allowed.
+ bool isFirstBitcast = (firstOp == Instruction::BitCast);
+ bool isSecondBitcast = (secondOp == Instruction::BitCast);
+ bool chainedBitcast = (SrcTy == DstTy && isFirstBitcast && isSecondBitcast);
+
+ // Check if any of the bitcasts convert scalars<->vectors.
+ if ((isFirstBitcast && isa<VectorType>(SrcTy) != isa<VectorType>(MidTy)) ||
+ (isSecondBitcast && isa<VectorType>(MidTy) != isa<VectorType>(DstTy)))
+ // Unless we are bitcasting to the original type, disallow optimizations.
+ if (!chainedBitcast) return 0;
int ElimCase = CastResults[firstOp-Instruction::CastOpsBegin]
[secondOp-Instruction::CastOpsBegin];
@@ -1958,16 +2236,16 @@ unsigned CastInst::isEliminableCastPair(
case 99:
// cast combination can't happen (error in input). This is for all cases
// where the MidTy is not the same for the two cast instructions.
- assert(!"Invalid Cast Combination");
+ assert(0 && "Invalid Cast Combination");
return 0;
default:
- assert(!"Error in CastResults table!!!");
+ assert(0 && "Error in CastResults table!!!");
return 0;
}
return 0;
}
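// The chained-bitcast rule above in isolation, assuming both casts are
// bitcasts: a scalar<->vector pair now folds when it returns to the original
// type. The isVec* flags are hypothetical stand-ins for isa<VectorType>(...).
static bool allowBitcastPair(bool isVecSrc, bool isVecMid, bool isVecDst,
                             bool srcEqualsDst) {
  bool crossesKind = (isVecSrc != isVecMid) || (isVecMid != isVecDst);
  // e.g. i64 -> <2 x i32> -> i64 folds; i64 -> <2 x i32> -> <4 x i16> still
  // does not.
  return !crossesKind || srcEqualsDst;
}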
-CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
const Twine &Name, Instruction *InsertBefore) {
assert(castIsValid(op, S, Ty) && "Invalid cast!");
// Construct and return the appropriate CastInst subclass
@@ -1985,12 +2263,12 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
case BitCast: return new BitCastInst (S, Ty, Name, InsertBefore);
default:
- assert(!"Invalid opcode provided");
+ assert(0 && "Invalid opcode provided");
}
return 0;
}
-CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
+CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
const Twine &Name, BasicBlock *InsertAtEnd) {
assert(castIsValid(op, S, Ty) && "Invalid cast!");
// Construct and return the appropriate CastInst subclass
@@ -2008,12 +2286,12 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, const Type *Ty,
case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertAtEnd);
case BitCast: return new BitCastInst (S, Ty, Name, InsertAtEnd);
default:
- assert(!"Invalid opcode provided");
+ assert(0 && "Invalid opcode provided");
}
return 0;
}
-CastInst *CastInst::CreateZExtOrBitCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty,
const Twine &Name,
Instruction *InsertBefore) {
if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2021,7 +2299,7 @@ CastInst *CastInst::CreateZExtOrBitCast(Value *S, const Type *Ty,
return Create(Instruction::ZExt, S, Ty, Name, InsertBefore);
}
-CastInst *CastInst::CreateZExtOrBitCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreateZExtOrBitCast(Value *S, Type *Ty,
const Twine &Name,
BasicBlock *InsertAtEnd) {
if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2029,7 +2307,7 @@ CastInst *CastInst::CreateZExtOrBitCast(Value *S, const Type *Ty,
return Create(Instruction::ZExt, S, Ty, Name, InsertAtEnd);
}
-CastInst *CastInst::CreateSExtOrBitCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty,
const Twine &Name,
Instruction *InsertBefore) {
if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2037,7 +2315,7 @@ CastInst *CastInst::CreateSExtOrBitCast(Value *S, const Type *Ty,
return Create(Instruction::SExt, S, Ty, Name, InsertBefore);
}
-CastInst *CastInst::CreateSExtOrBitCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreateSExtOrBitCast(Value *S, Type *Ty,
const Twine &Name,
BasicBlock *InsertAtEnd) {
if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2045,7 +2323,7 @@ CastInst *CastInst::CreateSExtOrBitCast(Value *S, const Type *Ty,
return Create(Instruction::SExt, S, Ty, Name, InsertAtEnd);
}
-CastInst *CastInst::CreateTruncOrBitCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty,
const Twine &Name,
Instruction *InsertBefore) {
if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2053,7 +2331,7 @@ CastInst *CastInst::CreateTruncOrBitCast(Value *S, const Type *Ty,
return Create(Instruction::Trunc, S, Ty, Name, InsertBefore);
}
-CastInst *CastInst::CreateTruncOrBitCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty,
const Twine &Name,
BasicBlock *InsertAtEnd) {
if (S->getType()->getScalarSizeInBits() == Ty->getScalarSizeInBits())
@@ -2061,7 +2339,7 @@ CastInst *CastInst::CreateTruncOrBitCast(Value *S, const Type *Ty,
return Create(Instruction::Trunc, S, Ty, Name, InsertAtEnd);
}
-CastInst *CastInst::CreatePointerCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
const Twine &Name,
BasicBlock *InsertAtEnd) {
assert(S->getType()->isPointerTy() && "Invalid cast");
@@ -2074,7 +2352,7 @@ CastInst *CastInst::CreatePointerCast(Value *S, const Type *Ty,
}
/// @brief Create a BitCast or a PtrToInt cast instruction
-CastInst *CastInst::CreatePointerCast(Value *S, const Type *Ty,
+CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
const Twine &Name,
Instruction *InsertBefore) {
assert(S->getType()->isPointerTy() && "Invalid cast");
@@ -2086,7 +2364,7 @@ CastInst *CastInst::CreatePointerCast(Value *S, const Type *Ty,
return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
}
-CastInst *CastInst::CreateIntegerCast(Value *C, const Type *Ty,
+CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty,
bool isSigned, const Twine &Name,
Instruction *InsertBefore) {
assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&
@@ -2100,7 +2378,7 @@ CastInst *CastInst::CreateIntegerCast(Value *C, const Type *Ty,
return Create(opcode, C, Ty, Name, InsertBefore);
}
-CastInst *CastInst::CreateIntegerCast(Value *C, const Type *Ty,
+CastInst *CastInst::CreateIntegerCast(Value *C, Type *Ty,
bool isSigned, const Twine &Name,
BasicBlock *InsertAtEnd) {
assert(C->getType()->isIntOrIntVectorTy() && Ty->isIntOrIntVectorTy() &&
@@ -2114,7 +2392,7 @@ CastInst *CastInst::CreateIntegerCast(Value *C, const Type *Ty,
return Create(opcode, C, Ty, Name, InsertAtEnd);
}
-CastInst *CastInst::CreateFPCast(Value *C, const Type *Ty,
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
const Twine &Name,
Instruction *InsertBefore) {
assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
@@ -2127,7 +2405,7 @@ CastInst *CastInst::CreateFPCast(Value *C, const Type *Ty,
return Create(opcode, C, Ty, Name, InsertBefore);
}
-CastInst *CastInst::CreateFPCast(Value *C, const Type *Ty,
+CastInst *CastInst::CreateFPCast(Value *C, Type *Ty,
const Twine &Name,
BasicBlock *InsertAtEnd) {
assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
@@ -2142,15 +2420,15 @@ CastInst *CastInst::CreateFPCast(Value *C, const Type *Ty,
// Check whether it is valid to call getCastOpcode for these types.
// This routine must be kept in sync with getCastOpcode.
-bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) {
+bool CastInst::isCastable(Type *SrcTy, Type *DestTy) {
if (!SrcTy->isFirstClassType() || !DestTy->isFirstClassType())
return false;
if (SrcTy == DestTy)
return true;
- if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
- if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+ if (VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+ if (VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
// An element by element cast. Valid if casting the elements is valid.
SrcTy = SrcVecTy->getElementType();
@@ -2212,8 +2490,8 @@ bool CastInst::isCastable(const Type *SrcTy, const Type *DestTy) {
// This routine must be kept in sync with isCastable.
Instruction::CastOps
CastInst::getCastOpcode(
- const Value *Src, bool SrcIsSigned, const Type *DestTy, bool DestIsSigned) {
- const Type *SrcTy = Src->getType();
+ const Value *Src, bool SrcIsSigned, Type *DestTy, bool DestIsSigned) {
+ Type *SrcTy = Src->getType();
assert(SrcTy->isFirstClassType() && DestTy->isFirstClassType() &&
"Only first class types are castable!");
@@ -2221,8 +2499,8 @@ CastInst::getCastOpcode(
if (SrcTy == DestTy)
return BitCast;
- if (const VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
- if (const VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
+ if (VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
+ if (VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
// An element by element cast. Find the appropriate opcode based on the
// element types.
@@ -2292,17 +2570,17 @@ CastInst::getCastOpcode(
} else if (SrcTy->isIntegerTy()) {
return IntToPtr; // int -> ptr
} else {
- assert(!"Casting pointer to other than pointer or int");
+ assert(0 && "Casting pointer to other than pointer or int");
}
} else if (DestTy->isX86_MMXTy()) {
if (SrcTy->isVectorTy()) {
assert(DestBits == SrcBits && "Casting vector of wrong width to X86_MMX");
return BitCast; // 64-bit vector to MMX
} else {
- assert(!"Illegal cast to X86_MMX");
+ assert(0 && "Illegal cast to X86_MMX");
}
} else {
- assert(!"Casting to type that is not first-class");
+ assert(0 && "Casting to type that is not first-class");
}
// If we fall through to here we probably hit an assertion cast above
@@ -2320,10 +2598,10 @@ CastInst::getCastOpcode(
/// it in one place and to eliminate the redundant code for getting the sizes
/// of the types involved.
bool
-CastInst::castIsValid(Instruction::CastOps op, Value *S, const Type *DstTy) {
+CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
// Check for type sanity on the arguments
- const Type *SrcTy = S->getType();
+ Type *SrcTy = S->getType();
if (!SrcTy->isFirstClassType() || !DstTy->isFirstClassType() ||
SrcTy->isAggregateType() || DstTy->isAggregateType())
return false;
@@ -2384,144 +2662,144 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, const Type *DstTy) {
}
TruncInst::TruncInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, Trunc, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
}
TruncInst::TruncInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, Trunc, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal Trunc");
}
ZExtInst::ZExtInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, ZExt, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
}
ZExtInst::ZExtInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, ZExt, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal ZExt");
}
SExtInst::SExtInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, SExt, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
}
SExtInst::SExtInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, SExt, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal SExt");
}
FPTruncInst::FPTruncInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, FPTrunc, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
}
FPTruncInst::FPTruncInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, FPTrunc, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPTrunc");
}
FPExtInst::FPExtInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, FPExt, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
}
FPExtInst::FPExtInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, FPExt, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPExt");
}
UIToFPInst::UIToFPInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, UIToFP, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
}
UIToFPInst::UIToFPInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, UIToFP, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal UIToFP");
}
SIToFPInst::SIToFPInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, SIToFP, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
}
SIToFPInst::SIToFPInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, SIToFP, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal SIToFP");
}
FPToUIInst::FPToUIInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, FPToUI, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
}
FPToUIInst::FPToUIInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, FPToUI, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToUI");
}
FPToSIInst::FPToSIInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, FPToSI, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
}
FPToSIInst::FPToSIInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, FPToSI, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal FPToSI");
}
PtrToIntInst::PtrToIntInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, PtrToInt, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
}
PtrToIntInst::PtrToIntInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, PtrToInt, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal PtrToInt");
}
IntToPtrInst::IntToPtrInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, IntToPtr, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
}
IntToPtrInst::IntToPtrInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, IntToPtr, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal IntToPtr");
}
BitCastInst::BitCastInst(
- Value *S, const Type *Ty, const Twine &Name, Instruction *InsertBefore
+ Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
) : CastInst(Ty, BitCast, S, Name, InsertBefore) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
}
BitCastInst::BitCastInst(
- Value *S, const Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+ Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
) : CastInst(Ty, BitCast, S, Name, InsertAtEnd) {
assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
}
@@ -2532,7 +2810,7 @@ BitCastInst::BitCastInst(
void CmpInst::Anchor() const {}
-CmpInst::CmpInst(const Type *ty, OtherOps op, unsigned short predicate,
+CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate,
Value *LHS, Value *RHS, const Twine &Name,
Instruction *InsertBefore)
: Instruction(ty, op,
@@ -2545,7 +2823,7 @@ CmpInst::CmpInst(const Type *ty, OtherOps op, unsigned short predicate,
setName(Name);
}
-CmpInst::CmpInst(const Type *ty, OtherOps op, unsigned short predicate,
+CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate,
Value *LHS, Value *RHS, const Twine &Name,
BasicBlock *InsertAtEnd)
: Instruction(ty, op,
@@ -2612,7 +2890,7 @@ bool CmpInst::isEquality() const {
CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
switch (pred) {
- default: assert(!"Unknown cmp predicate!");
+ default: assert(0 && "Unknown cmp predicate!");
case ICMP_EQ: return ICMP_NE;
case ICMP_NE: return ICMP_EQ;
case ICMP_UGT: return ICMP_ULE;
@@ -2645,7 +2923,7 @@ CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) {
ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
switch (pred) {
- default: assert(! "Unknown icmp predicate!");
+ default: assert(0 && "Unknown icmp predicate!");
case ICMP_EQ: case ICMP_NE:
case ICMP_SGT: case ICMP_SLT: case ICMP_SGE: case ICMP_SLE:
return pred;
@@ -2658,7 +2936,7 @@ ICmpInst::Predicate ICmpInst::getSignedPredicate(Predicate pred) {
ICmpInst::Predicate ICmpInst::getUnsignedPredicate(Predicate pred) {
switch (pred) {
- default: assert(! "Unknown icmp predicate!");
+ default: assert(0 && "Unknown icmp predicate!");
case ICMP_EQ: case ICMP_NE:
case ICMP_UGT: case ICMP_ULT: case ICMP_UGE: case ICMP_ULE:
return pred;
@@ -2734,7 +3012,7 @@ ICmpInst::makeConstantRange(Predicate pred, const APInt &C) {
CmpInst::Predicate CmpInst::getSwappedPredicate(Predicate pred) {
switch (pred) {
- default: assert(!"Unknown cmp predicate!");
+ default: assert(0 && "Unknown cmp predicate!");
case ICMP_EQ: case ICMP_NE:
return pred;
case ICMP_SGT: return ICMP_SLT;
@@ -3065,14 +3343,34 @@ AllocaInst *AllocaInst::clone_impl() const {
}
LoadInst *LoadInst::clone_impl() const {
- return new LoadInst(getOperand(0),
- Twine(), isVolatile(),
- getAlignment());
+ return new LoadInst(getOperand(0), Twine(), isVolatile(),
+ getAlignment(), getOrdering(), getSynchScope());
}
StoreInst *StoreInst::clone_impl() const {
- return new StoreInst(getOperand(0), getOperand(1),
- isVolatile(), getAlignment());
+ return new StoreInst(getOperand(0), getOperand(1), isVolatile(),
+ getAlignment(), getOrdering(), getSynchScope());
+}
+
+AtomicCmpXchgInst *AtomicCmpXchgInst::clone_impl() const {
+ AtomicCmpXchgInst *Result =
+ new AtomicCmpXchgInst(getOperand(0), getOperand(1), getOperand(2),
+ getOrdering(), getSynchScope());
+ Result->setVolatile(isVolatile());
+ return Result;
+}
+
+AtomicRMWInst *AtomicRMWInst::clone_impl() const {
+ AtomicRMWInst *Result =
+ new AtomicRMWInst(getOperation(), getOperand(0), getOperand(1),
+ getOrdering(), getSynchScope());
+ Result->setVolatile(isVolatile());
+ return Result;
+}
+
+FenceInst *FenceInst::clone_impl() const {
+ return new FenceInst(getContext(), getOrdering(), getSynchScope());
}
TruncInst *TruncInst::clone_impl() const {
@@ -3155,6 +3453,10 @@ PHINode *PHINode::clone_impl() const {
return new PHINode(*this);
}
+LandingPadInst *LandingPadInst::clone_impl() const {
+ return new LandingPadInst(*this);
+}
+
ReturnInst *ReturnInst::clone_impl() const {
return new(getNumOperands()) ReturnInst(*this);
}
@@ -3176,6 +3478,10 @@ InvokeInst *InvokeInst::clone_impl() const {
return new(getNumOperands()) InvokeInst(*this);
}
+ResumeInst *ResumeInst::clone_impl() const {
+ return new(1) ResumeInst(*this);
+}
+
UnwindInst *UnwindInst::clone_impl() const {
LLVMContext &Context = getContext();
return new UnwindInst(Context);
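// Property the updated clone_impl bodies preserve, sketched with the 3.0 API;
// SI is a hypothetical atomic store, and the clone is unlinked, so it is
// deleted manually here.
#include "llvm/Instructions.h"
#include "llvm/Support/Casting.h"
#include <cassert>
using namespace llvm;

void checkCloneKeepsAtomicInfo(StoreInst *SI) {
  StoreInst *Copy = cast<StoreInst>(SI->clone());
  assert(Copy->getOrdering()   == SI->getOrdering());
  assert(Copy->getSynchScope() == SI->getSynchScope());
  assert(Copy->getAlignment()  == SI->getAlignment());
  delete Copy;
}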
diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h
index 06a6f2a..a3f68fe 100644
--- a/lib/VMCore/LLVMContextImpl.h
+++ b/lib/VMCore/LLVMContextImpl.h
@@ -42,8 +42,8 @@ class Value;
struct DenseMapAPIntKeyInfo {
struct KeyTy {
APInt val;
- const Type* type;
- KeyTy(const APInt& V, const Type* Ty) : val(V), type(Ty) {}
+ Type* type;
+ KeyTy(const APInt& V, Type* Ty) : val(V), type(Ty) {}
KeyTy(const KeyTy& that) : val(that.val), type(that.type) {}
bool operator==(const KeyTy& that) const {
return type == that.type && this->val == that.val;
diff --git a/lib/VMCore/Makefile b/lib/VMCore/Makefile
index 03a4fc7..2b9b0f2 100644
--- a/lib/VMCore/Makefile
+++ b/lib/VMCore/Makefile
@@ -20,9 +20,9 @@ GENFILE:=$(PROJ_OBJ_ROOT)/include/llvm/Intrinsics.gen
INTRINSICTD := $(PROJ_SRC_ROOT)/include/llvm/Intrinsics.td
INTRINSICTDS := $(wildcard $(PROJ_SRC_ROOT)/include/llvm/Intrinsics*.td)
-$(ObjDir)/Intrinsics.gen.tmp: $(ObjDir)/.dir $(INTRINSICTDS) $(TBLGEN)
+$(ObjDir)/Intrinsics.gen.tmp: $(ObjDir)/.dir $(INTRINSICTDS) $(LLVM_TBLGEN)
$(Echo) Building Intrinsics.gen.tmp from Intrinsics.td
- $(Verb) $(TableGen) $(call SYSPATH, $(INTRINSICTD)) -o $(call SYSPATH, $@) -gen-intrinsic
+ $(Verb) $(LLVMTableGen) $(call SYSPATH, $(INTRINSICTD)) -o $(call SYSPATH, $@) -gen-intrinsic
$(GENFILE): $(ObjDir)/Intrinsics.gen.tmp
$(Verb) $(CMP) -s $@ $< || ( $(CP) $< $@ && \
diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp
index be2fcb8..c29029b 100644
--- a/lib/VMCore/Module.cpp
+++ b/lib/VMCore/Module.cpp
@@ -32,25 +32,10 @@ using namespace llvm;
// Methods to implement the globals and functions lists.
//
-GlobalVariable *ilist_traits<GlobalVariable>::createSentinel() {
- GlobalVariable *Ret = new GlobalVariable(Type::getInt32Ty(getGlobalContext()),
- false, GlobalValue::ExternalLinkage);
- // This should not be garbage monitored.
- LeakDetector::removeGarbageObject(Ret);
- return Ret;
-}
-GlobalAlias *ilist_traits<GlobalAlias>::createSentinel() {
- GlobalAlias *Ret = new GlobalAlias(Type::getInt32Ty(getGlobalContext()),
- GlobalValue::ExternalLinkage);
- // This should not be garbage monitored.
- LeakDetector::removeGarbageObject(Ret);
- return Ret;
-}
-
// Explicit instantiations of SymbolTableListTraits since some of the methods
// are not in the public header file.
-template class llvm::SymbolTableListTraits<GlobalVariable, Module>;
template class llvm::SymbolTableListTraits<Function, Module>;
+template class llvm::SymbolTableListTraits<GlobalVariable, Module>;
template class llvm::SymbolTableListTraits<GlobalAlias, Module>;
//===----------------------------------------------------------------------===//
@@ -82,8 +67,10 @@ Module::Endianness Module::getEndianness() const {
Module::Endianness ret = AnyEndianness;
while (!temp.empty()) {
- StringRef token = DataLayout;
- tie(token, temp) = getToken(temp, "-");
+ std::pair<StringRef, StringRef> P = getToken(temp, "-");
+
+ StringRef token = P.first;
+ temp = P.second;
if (token[0] == 'e') {
ret = LittleEndian;
@@ -95,15 +82,16 @@ Module::Endianness Module::getEndianness() const {
return ret;
}
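// The tie()-free token loop above, reduced to a standalone shape; the layout
// string is a hypothetical example, and getToken is the StringExtras helper
// used in the code above (returning a (token, rest) pair).
#include "llvm/ADT/StringExtras.h"
using namespace llvm;

bool isLittleEndianLayout(StringRef Layout) {  // e.g. "e-p:64:64-i64:64"
  while (!Layout.empty()) {
    std::pair<StringRef, StringRef> P = getToken(Layout, "-");
    if (!P.first.empty() && P.first[0] == 'e')
      return true;
    Layout = P.second;
  }
  return false;
}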
-/// Target Pointer Size information...
+/// Target Pointer Size information.
Module::PointerSize Module::getPointerSize() const {
StringRef temp = DataLayout;
Module::PointerSize ret = AnyPointerSize;
while (!temp.empty()) {
- StringRef token, signalToken;
- tie(token, temp) = getToken(temp, "-");
- tie(signalToken, token) = getToken(token, ":");
+ std::pair<StringRef, StringRef> TmpP = getToken(temp, "-");
+ temp = TmpP.second;
+ TmpP = getToken(TmpP.first, ":");
+ StringRef token = TmpP.second, signalToken = TmpP.first;
if (signalToken[0] == 'p') {
int size = 0;
@@ -149,7 +137,7 @@ void Module::getMDKindNames(SmallVectorImpl<StringRef> &Result) const {
// the symbol table directly for this common task.
//
Constant *Module::getOrInsertFunction(StringRef Name,
- const FunctionType *Ty,
+ FunctionType *Ty,
AttrListPtr AttributeList) {
// See if we have a definition for the specified function already.
GlobalValue *F = getNamedValue(Name);
@@ -182,7 +170,7 @@ Constant *Module::getOrInsertFunction(StringRef Name,
}
Constant *Module::getOrInsertTargetIntrinsic(StringRef Name,
- const FunctionType *Ty,
+ FunctionType *Ty,
AttrListPtr AttributeList) {
// See if we have a definition for the specified function already.
GlobalValue *F = getNamedValue(Name);
@@ -199,7 +187,7 @@ Constant *Module::getOrInsertTargetIntrinsic(StringRef Name,
}
Constant *Module::getOrInsertFunction(StringRef Name,
- const FunctionType *Ty) {
+ FunctionType *Ty) {
AttrListPtr AttributeList = AttrListPtr::get((AttributeWithIndex *)0, 0);
return getOrInsertFunction(Name, Ty, AttributeList);
}
@@ -211,7 +199,7 @@ Constant *Module::getOrInsertFunction(StringRef Name,
//
Constant *Module::getOrInsertFunction(StringRef Name,
AttrListPtr AttributeList,
- const Type *RetTy, ...) {
+ Type *RetTy, ...) {
va_list Args;
va_start(Args, RetTy);
@@ -229,7 +217,7 @@ Constant *Module::getOrInsertFunction(StringRef Name,
}
Constant *Module::getOrInsertFunction(StringRef Name,
- const Type *RetTy, ...) {
+ Type *RetTy, ...) {
va_list Args;
va_start(Args, RetTy);
@@ -279,7 +267,7 @@ GlobalVariable *Module::getGlobalVariable(StringRef Name,
/// with a constantexpr cast to the right type.
/// 3. Finally, if the existing global is the correct declaration, return the
/// existing global.
-Constant *Module::getOrInsertGlobal(StringRef Name, const Type *Ty) {
+Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
// See if we have a definition for the specified global already.
GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(getNamedValue(Name));
if (GV == 0) {
@@ -436,7 +424,7 @@ namespace {
// To avoid walking constant expressions multiple times and other IR
// objects, we keep several helper maps.
DenseSet<const Value*> VisitedConstants;
- DenseSet<const Type*> VisitedTypes;
+ DenseSet<Type*> VisitedTypes;
std::vector<StructType*> &StructTypes;
public:
@@ -549,5 +537,3 @@ namespace {
void Module::findUsedStructTypes(std::vector<StructType*> &StructTypes) const {
TypeFinder(StructTypes).run(*this);
}
-
-
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index 5cf2905..ecedb1d 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -28,7 +28,6 @@
#include "llvm/Support/Mutex.h"
#include "llvm/ADT/StringMap.h"
#include <algorithm>
-#include <cstdio>
#include <map>
using namespace llvm;
@@ -167,8 +166,8 @@ class BBPassManager : public PMDataManager, public FunctionPass {
public:
static char ID;
- explicit BBPassManager(int Depth)
- : PMDataManager(Depth), FunctionPass(ID) {}
+ explicit BBPassManager()
+ : PMDataManager(), FunctionPass(ID) {}
/// Execute all of the passes scheduled for execution. Keep track of
/// whether any of the passes modifies the function, and if so, return true.
@@ -193,7 +192,7 @@ public:
// Print passes managed by this manager
void dumpPassStructure(unsigned Offset) {
- llvm::dbgs() << std::string(Offset*2, ' ') << "BasicBlockPass Manager\n";
+ llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
BasicBlockPass *BP = getContainedPass(Index);
BP->dumpPassStructure(Offset + 1);
@@ -228,9 +227,9 @@ private:
bool wasRun;
public:
static char ID;
- explicit FunctionPassManagerImpl(int Depth) :
- Pass(PT_PassManager, ID), PMDataManager(Depth),
- PMTopLevelManager(new FPPassManager(1)), wasRun(false) {}
+ explicit FunctionPassManagerImpl() :
+ Pass(PT_PassManager, ID), PMDataManager(),
+ PMTopLevelManager(new FPPassManager()), wasRun(false) {}
/// add - Add a pass to the queue of passes to run. This passes ownership of
/// the Pass to the PassManager. When the PassManager is destroyed, the pass
@@ -303,8 +302,8 @@ char FunctionPassManagerImpl::ID = 0;
class MPPassManager : public Pass, public PMDataManager {
public:
static char ID;
- explicit MPPassManager(int Depth) :
- Pass(PT_PassManager, ID), PMDataManager(Depth) { }
+ explicit MPPassManager() :
+ Pass(PT_PassManager, ID), PMDataManager() { }
// Delete on the fly managers.
virtual ~MPPassManager() {
@@ -349,7 +348,7 @@ public:
// Print passes managed by this manager
void dumpPassStructure(unsigned Offset) {
- llvm::dbgs() << std::string(Offset*2, ' ') << "ModulePass Manager\n";
+ llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
ModulePass *MP = getContainedPass(Index);
MP->dumpPassStructure(Offset + 1);
@@ -388,9 +387,9 @@ class PassManagerImpl : public Pass,
public:
static char ID;
- explicit PassManagerImpl(int Depth) :
- Pass(PT_PassManager, ID), PMDataManager(Depth),
- PMTopLevelManager(new MPPassManager(1)) {}
+ explicit PassManagerImpl() :
+ Pass(PT_PassManager, ID), PMDataManager(),
+ PMTopLevelManager(new MPPassManager()) {}
/// add - Add a pass to the queue of passes to run. This passes ownership of
/// the Pass to the PassManager. When the PassManager is destroyed, the pass
@@ -1340,7 +1339,7 @@ bool BBPassManager::doFinalization(Function &F) {
/// Create new Function pass manager
FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
- FPM = new FunctionPassManagerImpl(0);
+ FPM = new FunctionPassManagerImpl();
// FPM is the top level manager.
FPM->setTopLevelManager(FPM);
@@ -1532,7 +1531,7 @@ bool FPPassManager::runOnModule(Module &M) {
bool Changed = doInitialization(M);
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
- runOnFunction(*I);
+ Changed |= runOnFunction(*I);
return doFinalization(M) || Changed;
}
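// The bug pattern fixed above, in miniature: per-iteration change flags were
// being discarded, so the manager could report "no change" after modifying
// functions. Standalone sketch, no LLVM types involved.
#include <cassert>

static bool step(int i) { return (i % 2) == 0; }  // stand-in for runOnFunction

int main() {
  bool Changed = false;
  for (int i = 0; i != 4; ++i)
    Changed |= step(i);  // was: step(i); -- the result was silently dropped
  assert(Changed);
  return 0;
}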
@@ -1626,7 +1625,7 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
if (!FPP) {
- FPP = new FunctionPassManagerImpl(0);
+ FPP = new FunctionPassManagerImpl();
// FPP is the top level manager.
FPP->setTopLevelManager(FPP);
@@ -1635,9 +1634,11 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
FPP->add(RequiredPass);
// Register P as the last user of RequiredPass.
- SmallVector<Pass *, 1> LU;
- LU.push_back(RequiredPass);
- FPP->setLastUser(LU, P);
+ if (RequiredPass) {
+ SmallVector<Pass *, 1> LU;
+ LU.push_back(RequiredPass);
+ FPP->setLastUser(LU, P);
+ }
}
/// Return function pass corresponding to PassInfo PI, that is
@@ -1677,7 +1678,7 @@ bool PassManagerImpl::run(Module &M) {
/// Create new pass manager
PassManager::PassManager() {
- PM = new PassManagerImpl(0);
+ PM = new PassManagerImpl();
// PM is the top level manager
PM->setTopLevelManager(PM);
}
@@ -1761,13 +1762,23 @@ void PMStack::pop() {
// Push PM on the stack and set its top level manager.
void PMStack::push(PMDataManager *PM) {
assert(PM && "Unable to push. Pass Manager expected");
+ assert(PM->getDepth()==0 && "Pass Manager depth set too early");
if (!this->empty()) {
+ assert(PM->getPassManagerType() > this->top()->getPassManagerType()
+ && "pushing bad pass manager to PMStack");
PMTopLevelManager *TPM = this->top()->getTopLevelManager();
assert(TPM && "Unable to find top level manager");
TPM->addIndirectPassManager(PM);
PM->setTopLevelManager(TPM);
+ PM->setDepth(this->top()->getDepth()+1);
+ }
+ else {
+ assert((PM->getPassManagerType() == PMT_ModulePassManager
+ || PM->getPassManagerType() == PMT_FunctionPassManager)
+ && "pushing bad pass manager to PMStack");
+ PM->setDepth(1);
}
S.push_back(PM);
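// Shape of the invariant the new push() enforces: depth is assigned at push
// time and increases strictly down the stack, replacing the old pattern of
// threading a Depth argument through every manager constructor. Standalone
// sketch; Mgr is a hypothetical stand-in for PMDataManager.
#include <cassert>
#include <vector>

struct Mgr {
  unsigned Depth;
  Mgr() : Depth(0) {}
};

static void push(std::vector<Mgr*> &S, Mgr *PM) {
  assert(PM->Depth == 0 && "depth set too early");
  PM->Depth = S.empty() ? 1 : S.back()->Depth + 1;
  S.push_back(PM);
}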
@@ -1777,10 +1788,10 @@ void PMStack::push(PMDataManager *PM) {
void PMStack::dump() const {
for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
E = S.end(); I != E; ++I)
- printf("%s ", (*I)->getAsPass()->getPassName());
+ dbgs() << (*I)->getAsPass()->getPassName() << ' ';
if (!S.empty())
- printf("\n");
+ dbgs() << '\n';
}
/// Find appropriate Module Pass Manager in the PM Stack and
@@ -1823,7 +1834,7 @@ void FunctionPass::assignPassManager(PMStack &PMS,
PMDataManager *PMD = PMS.top();
// [1] Create new Function Pass Manager
- FPP = new FPPassManager(PMD->getDepth() + 1);
+ FPP = new FPPassManager();
FPP->populateInheritedAnalysis(PMS);
// [2] Set up new manager's top level manager
@@ -1860,7 +1871,7 @@ void BasicBlockPass::assignPassManager(PMStack &PMS,
PMDataManager *PMD = PMS.top();
// [1] Create new Basic Block Manager
- BBP = new BBPassManager(PMD->getDepth() + 1);
+ BBP = new BBPassManager();
// [2] Set up new manager's top level manager
// Basic Block Pass Manager does not live by itself
diff --git a/lib/VMCore/PassRegistry.cpp b/lib/VMCore/PassRegistry.cpp
index fa92620..2df6557 100644
--- a/lib/VMCore/PassRegistry.cpp
+++ b/lib/VMCore/PassRegistry.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/Function.h"
#include <vector>
using namespace llvm;
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index f874d1b..10184bc 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -40,8 +40,8 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
/// getScalarType - If this is a vector type, return the element type,
/// otherwise return this.
-const Type *Type::getScalarType() const {
- if (const VectorType *VTy = dyn_cast<VectorType>(this))
+Type *Type::getScalarType() {
+ if (VectorType *VTy = dyn_cast<VectorType>(this))
return VTy->getElementType();
return this;
}
@@ -77,7 +77,7 @@ bool Type::isFPOrFPVectorTy() const {
// canLosslesslyBitCastTo - Return true if this type can be converted to
// 'Ty' without any reinterpretation of bits. For example, i8* to i32*.
//
-bool Type::canLosslesslyBitCastTo(const Type *Ty) const {
+bool Type::canLosslesslyBitCastTo(Type *Ty) const {
// Identity cast means no change so return true
if (this == Ty)
return true;
@@ -146,7 +146,7 @@ unsigned Type::getPrimitiveSizeInBits() const {
/// getScalarSizeInBits - If this is a vector type, return the
/// getPrimitiveSizeInBits value for the element type. Otherwise return the
/// getPrimitiveSizeInBits value for this type.
-unsigned Type::getScalarSizeInBits() const {
+unsigned Type::getScalarSizeInBits() {
return getScalarType()->getPrimitiveSizeInBits();
}
@@ -306,7 +306,7 @@ APInt IntegerType::getMask() const {
// FunctionType Implementation
//===----------------------------------------------------------------------===//
-FunctionType::FunctionType(const Type *Result, ArrayRef<Type*> Params,
+FunctionType::FunctionType(Type *Result, ArrayRef<Type*> Params,
bool IsVarArgs)
: Type(Result->getContext(), FunctionTyID) {
Type **SubTys = reinterpret_cast<Type**>(this+1);
@@ -326,7 +326,7 @@ FunctionType::FunctionType(const Type *Result, ArrayRef<Type*> Params,
}
// FunctionType::get - The factory function for the FunctionType class.
-FunctionType *FunctionType::get(const Type *ReturnType,
+FunctionType *FunctionType::get(Type *ReturnType,
ArrayRef<Type*> Params, bool isVarArg) {
// TODO: This is brutally slow.
std::vector<Type*> Key;
@@ -351,21 +351,21 @@ FunctionType *FunctionType::get(const Type *ReturnType,
}
-FunctionType *FunctionType::get(const Type *Result, bool isVarArg) {
+FunctionType *FunctionType::get(Type *Result, bool isVarArg) {
return get(Result, ArrayRef<Type *>(), isVarArg);
}
/// isValidReturnType - Return true if the specified type is valid as a return
/// type.
-bool FunctionType::isValidReturnType(const Type *RetTy) {
+bool FunctionType::isValidReturnType(Type *RetTy) {
return !RetTy->isFunctionTy() && !RetTy->isLabelTy() &&
!RetTy->isMetadataTy();
}
/// isValidArgumentType - Return true if the specified type is valid as an
/// argument type.
-bool FunctionType::isValidArgumentType(const Type *ArgTy) {
+bool FunctionType::isValidArgumentType(Type *ArgTy) {
return ArgTy->isFirstClassType();
}
@@ -392,7 +392,7 @@ StructType *StructType::get(LLVMContext &Context, ArrayRef<Type*> ETypes,
// Value not found. Create a new type!
ST = new (Context.pImpl->TypeAllocator) StructType(Context);
- ST->setSubclassData(SCDB_IsAnonymous); // Anonymous struct.
+ ST->setSubclassData(SCDB_IsLiteral); // Literal struct.
ST->setBody(ETypes, isPacked);
return ST;
}
@@ -412,13 +412,6 @@ void StructType::setBody(ArrayRef<Type*> Elements, bool isPacked) {
NumContainedTys = Elements.size();
}
-StructType *StructType::createNamed(LLVMContext &Context, StringRef Name) {
- StructType *ST = new (Context.pImpl->TypeAllocator) StructType(Context);
- if (!Name.empty())
- ST->setName(Name);
- return ST;
-}
-
void StructType::setName(StringRef Name) {
if (Name == getName()) return;
@@ -461,6 +454,13 @@ void StructType::setName(StringRef Name) {
//===----------------------------------------------------------------------===//
// StructType Helper functions.
+StructType *StructType::create(LLVMContext &Context, StringRef Name) {
+ StructType *ST = new (Context.pImpl->TypeAllocator) StructType(Context);
+ if (!Name.empty())
+ ST->setName(Name);
+ return ST;
+}
+
StructType *StructType::get(LLVMContext &Context, bool isPacked) {
return get(Context, llvm::ArrayRef<Type*>(), isPacked);
}
@@ -478,21 +478,36 @@ StructType *StructType::get(Type *type, ...) {
return llvm::StructType::get(Ctx, StructFields);
}
-StructType *StructType::createNamed(LLVMContext &Context, StringRef Name,
- ArrayRef<Type*> Elements, bool isPacked) {
- StructType *ST = createNamed(Context, Name);
+StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements,
+ StringRef Name, bool isPacked) {
+ StructType *ST = create(Context, Name);
ST->setBody(Elements, isPacked);
return ST;
}
-StructType *StructType::createNamed(StringRef Name, ArrayRef<Type*> Elements,
- bool isPacked) {
+StructType *StructType::create(LLVMContext &Context, ArrayRef<Type*> Elements) {
+ return create(Context, Elements, StringRef());
+}
+
+StructType *StructType::create(LLVMContext &Context) {
+ return create(Context, StringRef());
+}
+
+StructType *StructType::create(ArrayRef<Type*> Elements, StringRef Name,
+ bool isPacked) {
+ assert(!Elements.empty() &&
+ "This method may not be invoked with an empty list");
+ return create(Elements[0]->getContext(), Elements, Name, isPacked);
+}
+
+StructType *StructType::create(ArrayRef<Type*> Elements) {
assert(!Elements.empty() &&
"This method may not be invoked with an empty list");
- return createNamed(Elements[0]->getContext(), Name, Elements, isPacked);
+ return create(Elements[0]->getContext(), Elements, StringRef());
}
-StructType *StructType::createNamed(StringRef Name, Type *type, ...) {
+StructType *StructType::create(StringRef Name, Type *type, ...) {
assert(type != 0 && "Cannot create a struct type with no elements with this");
LLVMContext &Ctx = type->getContext();
va_list ap;
@@ -502,11 +517,12 @@ StructType *StructType::createNamed(StringRef Name, Type *type, ...) {
StructFields.push_back(type);
type = va_arg(ap, llvm::Type*);
}
- return llvm::StructType::createNamed(Ctx, Name, StructFields);
+ return llvm::StructType::create(Ctx, StructFields, Name);
}
+
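// The renamed factories in use, following the 3.0 signatures above; Ctx is a
// hypothetical context and the default isPacked argument is assumed.
#include "llvm/DerivedTypes.h"
#include "llvm/LLVMContext.h"
using namespace llvm;

void makeStructs(LLVMContext &Ctx) {
  Type *Fields[] = { Type::getInt32Ty(Ctx), Type::getInt8PtrTy(Ctx) };
  // Identified struct: was StructType::createNamed(Ctx, "my.pair", Fields).
  StructType *Named = StructType::create(Ctx, Fields, "my.pair");
  // Literal struct: uniqued by shape, never named (see isLiteral() above).
  StructType *Literal = StructType::get(Ctx, Fields);
  (void)Named; (void)Literal;
}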
StringRef StructType::getName() const {
- assert(!isAnonymous() && "Anonymous structs never have names");
+ assert(!isLiteral() && "Literal structs never have names");
if (SymbolTableEntry == 0) return StringRef();
return ((StringMapEntry<StructType*> *)SymbolTableEntry)->getKey();
@@ -524,14 +540,14 @@ void StructType::setBody(Type *type, ...) {
setBody(StructFields);
}
-bool StructType::isValidElementType(const Type *ElemTy) {
+bool StructType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy();
}
/// isLayoutIdentical - Return true if this is layout identical to the
/// specified struct.
-bool StructType::isLayoutIdentical(const StructType *Other) const {
+bool StructType::isLayoutIdentical(StructType *Other) const {
if (this == Other) return true;
if (isPacked() != Other->isPacked() ||
@@ -557,8 +573,8 @@ StructType *Module::getTypeByName(StringRef Name) const {
// CompositeType Implementation
//===----------------------------------------------------------------------===//
-Type *CompositeType::getTypeAtIndex(const Value *V) const {
- if (const StructType *STy = dyn_cast<StructType>(this)) {
+Type *CompositeType::getTypeAtIndex(const Value *V) {
+ if (StructType *STy = dyn_cast<StructType>(this)) {
unsigned Idx = (unsigned)cast<ConstantInt>(V)->getZExtValue();
assert(indexValid(Idx) && "Invalid structure index!");
return STy->getElementType(Idx);
@@ -566,8 +582,8 @@ Type *CompositeType::getTypeAtIndex(const Value *V) const {
return cast<SequentialType>(this)->getElementType();
}
-Type *CompositeType::getTypeAtIndex(unsigned Idx) const {
- if (const StructType *STy = dyn_cast<StructType>(this)) {
+Type *CompositeType::getTypeAtIndex(unsigned Idx) {
+ if (StructType *STy = dyn_cast<StructType>(this)) {
assert(indexValid(Idx) && "Invalid structure index!");
return STy->getElementType(Idx);
}
@@ -605,7 +621,7 @@ ArrayType::ArrayType(Type *ElType, uint64_t NumEl)
}
-ArrayType *ArrayType::get(const Type *elementType, uint64_t NumElements) {
+ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) {
Type *ElementType = const_cast<Type*>(elementType);
assert(isValidElementType(ElementType) && "Invalid type for array element!");
@@ -618,7 +634,7 @@ ArrayType *ArrayType::get(const Type *elementType, uint64_t NumElements) {
return Entry;
}
-bool ArrayType::isValidElementType(const Type *ElemTy) {
+bool ArrayType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy();
}
@@ -632,7 +648,7 @@ VectorType::VectorType(Type *ElType, unsigned NumEl)
NumElements = NumEl;
}
-VectorType *VectorType::get(const Type *elementType, unsigned NumElements) {
+VectorType *VectorType::get(Type *elementType, unsigned NumElements) {
Type *ElementType = const_cast<Type*>(elementType);
assert(NumElements > 0 && "#Elements of a VectorType must be greater than 0");
assert(isValidElementType(ElementType) &&
@@ -647,7 +663,7 @@ VectorType *VectorType::get(const Type *elementType, unsigned NumElements) {
return Entry;
}
-bool VectorType::isValidElementType(const Type *ElemTy) {
+bool VectorType::isValidElementType(Type *ElemTy) {
return ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy();
}
@@ -655,8 +671,7 @@ bool VectorType::isValidElementType(const Type *ElemTy) {
// PointerType Implementation
//===----------------------------------------------------------------------===//
-PointerType *PointerType::get(const Type *eltTy, unsigned AddressSpace) {
- Type *EltTy = const_cast<Type*>(eltTy);
+PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) {
assert(EltTy && "Can't get a pointer to <null> type!");
assert(isValidElementType(EltTy) && "Invalid type for pointer element!");
@@ -677,11 +692,11 @@ PointerType::PointerType(Type *E, unsigned AddrSpace)
setSubclassData(AddrSpace);
}
-PointerType *Type::getPointerTo(unsigned addrs) const {
+PointerType *Type::getPointerTo(unsigned addrs) {
return PointerType::get(this, addrs);
}
-bool PointerType::isValidElementType(const Type *ElemTy) {
+bool PointerType::isValidElementType(Type *ElemTy) {
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy();
}
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index f1815e3..2fa5f08 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -35,12 +35,12 @@ using namespace llvm;
// Value Class
//===----------------------------------------------------------------------===//
-static inline Type *checkType(const Type *Ty) {
+static inline Type *checkType(Type *Ty) {
assert(Ty && "Value defined with a null type: Error!");
return const_cast<Type*>(Ty);
}
-Value::Value(const Type *ty, unsigned scid)
+Value::Value(Type *ty, unsigned scid)
: SubclassID(scid), HasValueHandle(0),
SubclassOptionalData(0), SubclassData(0), VTy((Type*)checkType(ty)),
UseList(0), Name(0) {
@@ -369,7 +369,7 @@ bool Value::isDereferenceablePointer() const {
for (User::const_op_iterator I = GEP->op_begin()+1,
E = GEP->op_end(); I != E; ++I) {
Value *Index = *I;
- const Type *Ty = *GTI++;
+ Type *Ty = *GTI++;
// Struct indices can't be out of bounds.
if (isa<StructType>(Ty))
continue;
@@ -380,7 +380,7 @@ bool Value::isDereferenceablePointer() const {
if (CI->isZero())
continue;
// Check to see that it's within the bounds of an array.
- const ArrayType *ATy = dyn_cast<ArrayType>(Ty);
+ ArrayType *ATy = dyn_cast<ArrayType>(Ty);
if (!ATy)
return false;
if (CI->getValue().getActiveBits() > 64)
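
The isDereferenceablePointer hunk pairs each GEP index operand with the type it indexes into via gep_type_iterator; after this patch the iterator yields a plain Type*. A sketch of that traversal pattern, assuming LLVM 3.0 headers (the bounds checks themselves are elided):

    #include "llvm/Instructions.h"
    #include "llvm/Support/GetElementPtrTypeIterator.h"
    using namespace llvm;

    // Walk (index operand, indexed type) pairs of a GEP, as the loop above does.
    static void walkGEPIndices(GetElementPtrInst *GEP) {
      gep_type_iterator GTI = gep_type_begin(GEP);
      for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
           I != E; ++I, ++GTI) {
        Value *Index = *I;   // the operand used to index
        Type *Ty = *GTI;     // the aggregate level it indexes into
        (void)Index; (void)Ty;
      }
    }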
diff --git a/lib/VMCore/ValueTypes.cpp b/lib/VMCore/ValueTypes.cpp
index 21a1f03..e13bd7d 100644
--- a/lib/VMCore/ValueTypes.cpp
+++ b/lib/VMCore/ValueTypes.cpp
@@ -19,6 +19,12 @@
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
+EVT EVT::changeExtendedVectorElementTypeToInteger() const {
+ LLVMContext &Context = LLVMTy->getContext();
+ EVT IntTy = getIntegerVT(Context, getVectorElementType().getSizeInBits());
+ return getVectorVT(Context, IntTy, getVectorNumElements());
+}
+
EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) {
EVT VT;
VT.LLVMTy = IntegerType::get(Context, BitWidth);
@@ -77,9 +83,9 @@ unsigned EVT::getExtendedVectorNumElements() const {
unsigned EVT::getExtendedSizeInBits() const {
assert(isExtended() && "Type is not extended!");
- if (const IntegerType *ITy = dyn_cast<IntegerType>(LLVMTy))
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LLVMTy))
return ITy->getBitWidth();
- if (const VectorType *VTy = dyn_cast<VectorType>(LLVMTy))
+ if (VectorType *VTy = dyn_cast<VectorType>(LLVMTy))
return VTy->getBitWidth();
assert(false && "Unrecognized extended type!");
return 0; // Suppress warnings.
@@ -140,7 +146,7 @@ std::string EVT::getEVTString() const {
/// getTypeForEVT - This method returns an LLVM type corresponding to the
/// specified EVT. For integer types, this returns an unsigned type. Note
/// that this will abort for types that cannot be represented.
-const Type *EVT::getTypeForEVT(LLVMContext &Context) const {
+Type *EVT::getTypeForEVT(LLVMContext &Context) const {
switch (V.SimpleTy) {
default:
assert(isExtended() && "Type is not extended!");
@@ -186,7 +192,7 @@ const Type *EVT::getTypeForEVT(LLVMContext &Context) const {
/// getEVT - Return the value type corresponding to the specified type. This
/// returns all pointers as MVT::iPTR. If HandleUnknown is true, unknown types
/// are returned as Other, otherwise they are invalid.
-EVT EVT::getEVT(const Type *Ty, bool HandleUnknown){
+EVT EVT::getEVT(Type *Ty, bool HandleUnknown){
switch (Ty->getTypeID()) {
default:
if (HandleUnknown) return MVT(MVT::Other);
@@ -204,7 +210,7 @@ EVT EVT::getEVT(const Type *Ty, bool HandleUnknown){
case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
case Type::PointerTyID: return MVT(MVT::iPTR);
case Type::VectorTyID: {
- const VectorType *VTy = cast<VectorType>(Ty);
+ VectorType *VTy = cast<VectorType>(Ty);
return getVectorVT(Ty->getContext(), getEVT(VTy->getElementType(), false),
VTy->getNumElements());
}
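
The new changeExtendedVectorElementTypeToInteger handles vector EVTs that have no simple MVT encoding, mapping e.g. v5f32 to v5i32. A sketch, assuming LLVM 3.0 headers and assuming the public changeVectorElementTypeToInteger wrapper in ValueTypes.h dispatches to this helper for non-simple types, as in release_30; v5f32 is chosen precisely because it is not a simple MVT:

    #include "llvm/LLVMContext.h"
    #include "llvm/CodeGen/ValueTypes.h"
    using namespace llvm;

    // Map an extended v5f32 to v5i32: same element count, i32 elements.
    static EVT floatsToInts(LLVMContext &Ctx) {
      EVT VF = EVT::getVectorVT(Ctx, MVT::f32, 5);   // extended EVT
      return VF.changeVectorElementTypeToInteger();  // routes to the helper above
    }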
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index b146b89..9564b7d 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -35,6 +35,12 @@
// * It is illegal to have a ret instruction that returns a value that does not
// agree with the function return value type.
// * Function call argument types match the function prototype
+// * A landing pad is defined by a landingpad instruction, and can be jumped to
+// only by the unwind edge of an invoke instruction.
+// * A landingpad instruction must be the first non-PHI instruction in the
+// block.
+// * All landingpad instructions within the same function must use the same
+//   personality function.
// * All other things that are tested by asserts spread about the code...
//
//===----------------------------------------------------------------------===//
@@ -131,18 +137,22 @@ namespace {
/// already.
SmallPtrSet<MDNode *, 32> MDNodes;
+ /// PersonalityFn - The personality function referenced by the
+ /// LandingPadInsts. All LandingPadInsts within the same function must use
+ /// the same personality function.
+ const Value *PersonalityFn;
+
Verifier()
- : FunctionPass(ID),
- Broken(false), RealPass(true), action(AbortProcessAction),
- Mod(0), Context(0), DT(0), MessagesStr(Messages) {
- initializeVerifierPass(*PassRegistry::getPassRegistry());
- }
+ : FunctionPass(ID), Broken(false), RealPass(true),
+ action(AbortProcessAction), Mod(0), Context(0), DT(0),
+ MessagesStr(Messages), PersonalityFn(0) {
+ initializeVerifierPass(*PassRegistry::getPassRegistry());
+ }
explicit Verifier(VerifierFailureAction ctn)
- : FunctionPass(ID),
- Broken(false), RealPass(true), action(ctn), Mod(0), Context(0), DT(0),
- MessagesStr(Messages) {
- initializeVerifierPass(*PassRegistry::getPassRegistry());
- }
+ : FunctionPass(ID), Broken(false), RealPass(true), action(ctn), Mod(0),
+ Context(0), DT(0), MessagesStr(Messages), PersonalityFn(0) {
+ initializeVerifierPass(*PassRegistry::getPassRegistry());
+ }
bool doInitialization(Module &M) {
Mod = &M;
@@ -165,6 +175,7 @@ namespace {
visit(F);
InstsInThisBlock.clear();
+ PersonalityFn = 0;
// If this is a real pass, in a pass manager, we must abort before
// returning back to the pass manager, or else the pass manager may try to
@@ -278,18 +289,22 @@ namespace {
void visitUserOp1(Instruction &I);
void visitUserOp2(Instruction &I) { visitUserOp1(I); }
void visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI);
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI);
+ void visitAtomicRMWInst(AtomicRMWInst &RMWI);
+ void visitFenceInst(FenceInst &FI);
void visitAllocaInst(AllocaInst &AI);
void visitExtractValueInst(ExtractValueInst &EVI);
void visitInsertValueInst(InsertValueInst &IVI);
+ void visitLandingPadInst(LandingPadInst &LPI);
void VerifyCallSite(CallSite CS);
- bool PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty,
+ bool PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty,
int VT, unsigned ArgNo, std::string &Suffix);
void VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F,
unsigned RetNum, unsigned ParamNum, ...);
- void VerifyParameterAttrs(Attributes Attrs, const Type *Ty,
+ void VerifyParameterAttrs(Attributes Attrs, Type *Ty,
bool isReturnValue, const Value *V);
- void VerifyFunctionAttrs(const FunctionType *FT, const AttrListPtr &Attrs,
+ void VerifyFunctionAttrs(FunctionType *FT, const AttrListPtr &Attrs,
const Value *V);
void WriteValue(const Value *V) {
@@ -302,7 +317,7 @@ namespace {
}
}
- void WriteType(const Type *T) {
+ void WriteType(Type *T) {
if (!T) return;
MessagesStr << ' ' << *T;
}
@@ -323,7 +338,7 @@ namespace {
}
void CheckFailed(const Twine &Message, const Value *V1,
- const Type *T2, const Value *V3 = 0) {
+ Type *T2, const Value *V3 = 0) {
MessagesStr << Message.str() << "\n";
WriteValue(V1);
WriteType(T2);
@@ -331,8 +346,8 @@ namespace {
Broken = true;
}
- void CheckFailed(const Twine &Message, const Type *T1,
- const Type *T2 = 0, const Type *T3 = 0) {
+ void CheckFailed(const Twine &Message, Type *T1,
+ Type *T2 = 0, Type *T3 = 0) {
MessagesStr << Message.str() << "\n";
WriteType(T1);
WriteType(T2);
@@ -421,9 +436,9 @@ void Verifier::visitGlobalVariable(GlobalVariable &GV) {
"invalid linkage for intrinsic global variable", &GV);
// Don't worry about emitting an error for it not being an array,
// visitGlobalValue will complain on appending non-array.
- if (const ArrayType *ATy = dyn_cast<ArrayType>(GV.getType())) {
- const StructType *STy = dyn_cast<StructType>(ATy->getElementType());
- const PointerType *FuncPtrTy =
+ if (ArrayType *ATy = dyn_cast<ArrayType>(GV.getType())) {
+ StructType *STy = dyn_cast<StructType>(ATy->getElementType());
+ PointerType *FuncPtrTy =
FunctionType::get(Type::getVoidTy(*Context), false)->getPointerTo();
Assert1(STy && STy->getNumElements() == 2 &&
STy->getTypeAtIndex(0u)->isIntegerTy(32) &&
@@ -514,7 +529,7 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) {
// VerifyParameterAttrs - Check the given attributes for an argument or return
// value of the specified type. The value V is printed in error messages.
-void Verifier::VerifyParameterAttrs(Attributes Attrs, const Type *Ty,
+void Verifier::VerifyParameterAttrs(Attributes Attrs, Type *Ty,
bool isReturnValue, const Value *V) {
if (Attrs == Attribute::None)
return;
@@ -541,7 +556,7 @@ void Verifier::VerifyParameterAttrs(Attributes Attrs, const Type *Ty,
Attribute::getAsString(TypeI), V);
Attributes ByValI = Attrs & Attribute::ByVal;
- if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) {
+ if (PointerType *PTy = dyn_cast<PointerType>(Ty)) {
Assert1(!ByValI || PTy->getElementType()->isSized(),
"Attribute " + Attribute::getAsString(ByValI) +
" does not support unsized types!", V);
@@ -554,7 +569,7 @@ void Verifier::VerifyParameterAttrs(Attributes Attrs, const Type *Ty,
// VerifyFunctionAttrs - Check parameter attributes against a function type.
// The value V is printed in error messages.
-void Verifier::VerifyFunctionAttrs(const FunctionType *FT,
+void Verifier::VerifyFunctionAttrs(FunctionType *FT,
const AttrListPtr &Attrs,
const Value *V) {
if (Attrs.isEmpty())
@@ -565,7 +580,7 @@ void Verifier::VerifyFunctionAttrs(const FunctionType *FT,
for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
const AttributeWithIndex &Attr = Attrs.getSlot(i);
- const Type *Ty;
+ Type *Ty;
if (Attr.Index == 0)
Ty = FT->getReturnType();
else if (Attr.Index-1 < FT->getNumParams())
@@ -615,7 +630,7 @@ static bool VerifyAttributeCount(const AttrListPtr &Attrs, unsigned Params) {
//
void Verifier::visitFunction(Function &F) {
// Check function arguments.
- const FunctionType *FT = F.getFunctionType();
+ FunctionType *FT = F.getFunctionType();
unsigned NumArgs = F.arg_size();
Assert1(Context == &F.getContext(),
@@ -795,7 +810,7 @@ void Verifier::visitReturnInst(ReturnInst &RI) {
void Verifier::visitSwitchInst(SwitchInst &SI) {
// Check to make sure that all of the constants in the switch instruction
// have the same type as the switched-on value.
- const Type *SwitchTy = SI.getCondition()->getType();
+ Type *SwitchTy = SI.getCondition()->getType();
SmallPtrSet<ConstantInt*, 32> Constants;
for (unsigned i = 1, e = SI.getNumCases(); i != e; ++i) {
Assert1(SI.getCaseValue(i)->getType() == SwitchTy,
@@ -836,8 +851,8 @@ void Verifier::visitUserOp1(Instruction &I) {
void Verifier::visitTruncInst(TruncInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
// Get the size of the types in bits, we'll need this later
unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
@@ -854,8 +869,8 @@ void Verifier::visitTruncInst(TruncInst &I) {
void Verifier::visitZExtInst(ZExtInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
// Get the size of the types in bits, we'll need this later
Assert1(SrcTy->isIntOrIntVectorTy(), "ZExt only operates on integer", &I);
@@ -872,8 +887,8 @@ void Verifier::visitZExtInst(ZExtInst &I) {
void Verifier::visitSExtInst(SExtInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
// Get the size of the types in bits, we'll need this later
unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
@@ -890,8 +905,8 @@ void Verifier::visitSExtInst(SExtInst &I) {
void Verifier::visitFPTruncInst(FPTruncInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
// Get the size of the types in bits, we'll need this later
unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
unsigned DestBitSize = DestTy->getScalarSizeInBits();
@@ -907,8 +922,8 @@ void Verifier::visitFPTruncInst(FPTruncInst &I) {
void Verifier::visitFPExtInst(FPExtInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
// Get the size of the types in bits, we'll need this later
unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
@@ -925,8 +940,8 @@ void Verifier::visitFPExtInst(FPExtInst &I) {
void Verifier::visitUIToFPInst(UIToFPInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
bool SrcVec = SrcTy->isVectorTy();
bool DstVec = DestTy->isVectorTy();
@@ -948,8 +963,8 @@ void Verifier::visitUIToFPInst(UIToFPInst &I) {
void Verifier::visitSIToFPInst(SIToFPInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
bool SrcVec = SrcTy->isVectorTy();
bool DstVec = DestTy->isVectorTy();
@@ -971,8 +986,8 @@ void Verifier::visitSIToFPInst(SIToFPInst &I) {
void Verifier::visitFPToUIInst(FPToUIInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
bool SrcVec = SrcTy->isVectorTy();
bool DstVec = DestTy->isVectorTy();
@@ -994,8 +1009,8 @@ void Verifier::visitFPToUIInst(FPToUIInst &I) {
void Verifier::visitFPToSIInst(FPToSIInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
bool SrcVec = SrcTy->isVectorTy();
bool DstVec = DestTy->isVectorTy();
@@ -1017,8 +1032,8 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) {
void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
Assert1(SrcTy->isPointerTy(), "PtrToInt source must be pointer", &I);
Assert1(DestTy->isIntegerTy(), "PtrToInt result must be integral", &I);
@@ -1028,8 +1043,8 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
Assert1(SrcTy->isIntegerTy(), "IntToPtr source must be an integral", &I);
Assert1(DestTy->isPointerTy(), "IntToPtr result must be a pointer",&I);
@@ -1039,8 +1054,8 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
void Verifier::visitBitCastInst(BitCastInst &I) {
// Get the source and destination types
- const Type *SrcTy = I.getOperand(0)->getType();
- const Type *DestTy = I.getType();
+ Type *SrcTy = I.getOperand(0)->getType();
+ Type *DestTy = I.getType();
// Get the size of the types in bits, we'll need this later
unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
@@ -1090,11 +1105,11 @@ void Verifier::VerifyCallSite(CallSite CS) {
Assert1(CS.getCalledValue()->getType()->isPointerTy(),
"Called function must be a pointer!", I);
- const PointerType *FPTy = cast<PointerType>(CS.getCalledValue()->getType());
+ PointerType *FPTy = cast<PointerType>(CS.getCalledValue()->getType());
Assert1(FPTy->getElementType()->isFunctionTy(),
"Called function is not pointer to function type!", I);
- const FunctionType *FTy = cast<FunctionType>(FPTy->getElementType());
+ FunctionType *FTy = cast<FunctionType>(FPTy->getElementType());
// Verify that the correct number of arguments are being passed
if (FTy->isVarArg())
@@ -1152,6 +1167,12 @@ void Verifier::visitCallInst(CallInst &CI) {
void Verifier::visitInvokeInst(InvokeInst &II) {
VerifyCallSite(&II);
+
+ // Verify that there is a landingpad instruction as the first non-PHI
+ // instruction of the 'unwind' destination.
+ Assert1(II.getUnwindDest()->isLandingPad(),
+ "The unwind destination does not have a landingpad instruction!",&II);
+
visitTerminatorInst(II);
}
@@ -1219,8 +1240,8 @@ void Verifier::visitBinaryOperator(BinaryOperator &B) {
void Verifier::visitICmpInst(ICmpInst &IC) {
// Check that the operands are the same type
- const Type *Op0Ty = IC.getOperand(0)->getType();
- const Type *Op1Ty = IC.getOperand(1)->getType();
+ Type *Op0Ty = IC.getOperand(0)->getType();
+ Type *Op1Ty = IC.getOperand(1)->getType();
Assert1(Op0Ty == Op1Ty,
"Both operands to ICmp instruction are not of the same type!", &IC);
// Check that the operands are the right type
@@ -1236,8 +1257,8 @@ void Verifier::visitICmpInst(ICmpInst &IC) {
void Verifier::visitFCmpInst(FCmpInst &FC) {
// Check that the operands are the same type
- const Type *Op0Ty = FC.getOperand(0)->getType();
- const Type *Op1Ty = FC.getOperand(1)->getType();
+ Type *Op0Ty = FC.getOperand(0)->getType();
+ Type *Op1Ty = FC.getOperand(1)->getType();
Assert1(Op0Ty == Op1Ty,
"Both operands to FCmp instruction are not of the same type!", &FC);
// Check that the operands are the right type
@@ -1274,10 +1295,13 @@ void Verifier::visitShuffleVectorInst(ShuffleVectorInst &SV) {
}
void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+ Assert1(cast<PointerType>(GEP.getOperand(0)->getType())
+ ->getElementType()->isSized(),
+ "GEP into unsized type!", &GEP);
+
SmallVector<Value*, 16> Idxs(GEP.idx_begin(), GEP.idx_end());
- const Type *ElTy =
- GetElementPtrInst::getIndexedType(GEP.getOperand(0)->getType(),
- Idxs.begin(), Idxs.end());
+ Type *ElTy =
+ GetElementPtrInst::getIndexedType(GEP.getOperand(0)->getType(), Idxs);
Assert1(ElTy, "Invalid indices for GEP pointer type!", &GEP);
Assert2(GEP.getType()->isPointerTy() &&
cast<PointerType>(GEP.getType())->getElementType() == ElTy,
@@ -1286,26 +1310,44 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
void Verifier::visitLoadInst(LoadInst &LI) {
- const PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType());
+ PointerType *PTy = dyn_cast<PointerType>(LI.getOperand(0)->getType());
Assert1(PTy, "Load operand must be a pointer.", &LI);
- const Type *ElTy = PTy->getElementType();
+ Type *ElTy = PTy->getElementType();
Assert2(ElTy == LI.getType(),
"Load result type does not match pointer operand type!", &LI, ElTy);
+ if (LI.isAtomic()) {
+ Assert1(LI.getOrdering() != Release && LI.getOrdering() != AcquireRelease,
+ "Load cannot have Release ordering", &LI);
+ Assert1(LI.getAlignment() != 0,
+ "Atomic load must specify explicit alignment", &LI);
+ } else {
+ Assert1(LI.getSynchScope() == CrossThread,
+ "Non-atomic load cannot have SynchronizationScope specified", &LI);
+ }
visitInstruction(LI);
}
void Verifier::visitStoreInst(StoreInst &SI) {
- const PointerType *PTy = dyn_cast<PointerType>(SI.getOperand(1)->getType());
+ PointerType *PTy = dyn_cast<PointerType>(SI.getOperand(1)->getType());
Assert1(PTy, "Store operand must be a pointer.", &SI);
- const Type *ElTy = PTy->getElementType();
+ Type *ElTy = PTy->getElementType();
Assert2(ElTy == SI.getOperand(0)->getType(),
"Stored value type does not match pointer operand type!",
&SI, ElTy);
+ if (SI.isAtomic()) {
+ Assert1(SI.getOrdering() != Acquire && SI.getOrdering() != AcquireRelease,
+ "Store cannot have Acquire ordering", &SI);
+ Assert1(SI.getAlignment() != 0,
+ "Atomic store must specify explicit alignment", &SI);
+ } else {
+ Assert1(SI.getSynchScope() == CrossThread,
+ "Non-atomic store cannot have SynchronizationScope specified", &SI);
+ }
visitInstruction(SI);
}
void Verifier::visitAllocaInst(AllocaInst &AI) {
- const PointerType *PTy = AI.getType();
+ PointerType *PTy = AI.getType();
Assert1(PTy->getAddressSpace() == 0,
"Allocation instruction pointer not in the generic address space!",
&AI);
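
For reference, a load that satisfies both new constraints (no release-class ordering, explicit alignment); a minimal sketch against the LLVM 3.0 builder API, with the pointer operand assumed to be an i32* supplied by the caller:

    #include "llvm/Instructions.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    // Emit an acquire load the checks above accept.
    static LoadInst *emitAtomicLoad(IRBuilder<> &B, Value *Ptr) {
      LoadInst *LI = B.CreateLoad(Ptr, "val");
      LI->setAtomic(Acquire);  // Release/AcquireRelease are rejected on loads
      LI->setAlignment(4);     // atomic accesses must carry explicit alignment
      return LI;
    }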
@@ -1316,6 +1358,49 @@ void Verifier::visitAllocaInst(AllocaInst &AI) {
visitInstruction(AI);
}
+void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
+ Assert1(CXI.getOrdering() != NotAtomic,
+ "cmpxchg instructions must be atomic.", &CXI);
+ Assert1(CXI.getOrdering() != Unordered,
+ "cmpxchg instructions cannot be unordered.", &CXI);
+ PointerType *PTy = dyn_cast<PointerType>(CXI.getOperand(0)->getType());
+ Assert1(PTy, "First cmpxchg operand must be a pointer.", &CXI);
+ Type *ElTy = PTy->getElementType();
+ Assert2(ElTy == CXI.getOperand(1)->getType(),
+ "Expected value type does not match pointer operand type!",
+ &CXI, ElTy);
+ Assert2(ElTy == CXI.getOperand(2)->getType(),
+ "Stored value type does not match pointer operand type!",
+ &CXI, ElTy);
+ visitInstruction(CXI);
+}
+
+void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
+ Assert1(RMWI.getOrdering() != NotAtomic,
+ "atomicrmw instructions must be atomic.", &RMWI);
+ Assert1(RMWI.getOrdering() != Unordered,
+ "atomicrmw instructions cannot be unordered.", &RMWI);
+ PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
+ Assert1(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
+ Type *ElTy = PTy->getElementType();
+ Assert2(ElTy == RMWI.getOperand(1)->getType(),
+ "Argument value type does not match pointer operand type!",
+ &RMWI, ElTy);
+ Assert1(AtomicRMWInst::FIRST_BINOP <= RMWI.getOperation() &&
+ RMWI.getOperation() <= AtomicRMWInst::LAST_BINOP,
+ "Invalid binary operation!", &RMWI);
+ visitInstruction(RMWI);
+}
+
+void Verifier::visitFenceInst(FenceInst &FI) {
+ const AtomicOrdering Ordering = FI.getOrdering();
+ Assert1(Ordering == Acquire || Ordering == Release ||
+ Ordering == AcquireRelease || Ordering == SequentiallyConsistent,
+ "fence instructions may only have "
+ "acquire, release, acq_rel, or seq_cst ordering.", &FI);
+ visitInstruction(FI);
+}
+
void Verifier::visitExtractValueInst(ExtractValueInst &EVI) {
Assert1(ExtractValueInst::getIndexedType(EVI.getAggregateOperand()->getType(),
EVI.getIndices()) ==
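
The same shape for the new visitors: an atomicrmw whose value operand matches its pointer's pointee type, followed by a fence with one of the four permitted orderings. A sketch under the LLVM 3.0 builder API; Ptr and Val are assumed to be an i32* and an i32 from the caller:

    #include "llvm/Instructions.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    // Emit an atomicrmw add plus a seq_cst fence that pass the checks above.
    static void emitRMWAndFence(IRBuilder<> &B, Value *Ptr, Value *Val) {
      B.CreateAtomicRMW(AtomicRMWInst::Add, Ptr, Val, SequentiallyConsistent);
      B.CreateFence(SequentiallyConsistent);  // acquire/release/acq_rel also legal
    }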
@@ -1334,6 +1419,55 @@ void Verifier::visitInsertValueInst(InsertValueInst &IVI) {
visitInstruction(IVI);
}
+void Verifier::visitLandingPadInst(LandingPadInst &LPI) {
+ BasicBlock *BB = LPI.getParent();
+
+ // The landingpad instruction is ill-formed if it doesn't have any clauses and
+ // isn't a cleanup.
+ Assert1(LPI.getNumClauses() > 0 || LPI.isCleanup(),
+ "LandingPadInst needs at least one clause or to be a cleanup.", &LPI);
+
+ // The landingpad instruction defines its parent as a landing pad block. The
+ // landing pad block may be branched to only by the unwind edge of an invoke.
+ for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator());
+ Assert1(II && II->getUnwindDest() == BB,
+ "Block containing LandingPadInst must be jumped to "
+ "only by the unwind edge of an invoke.", &LPI);
+ }
+
+ // The landingpad instruction must be the first non-PHI instruction in the
+ // block.
+ Assert1(LPI.getParent()->getLandingPadInst() == &LPI,
+ "LandingPadInst not the first non-PHI instruction in the block.",
+ &LPI);
+
+ // The personality functions for all landingpad instructions within the same
+ // function should match.
+ if (PersonalityFn)
+ Assert1(LPI.getPersonalityFn() == PersonalityFn,
+ "Personality function doesn't match others in function", &LPI);
+ PersonalityFn = LPI.getPersonalityFn();
+
+ // All operands must be constants.
+ Assert1(isa<Constant>(PersonalityFn), "Personality function is not constant!",
+ &LPI);
+ for (unsigned i = 0, e = LPI.getNumClauses(); i < e; ++i) {
+ Value *Clause = LPI.getClause(i);
+ Assert1(isa<Constant>(Clause), "Clause is not constant!", &LPI);
+ if (LPI.isCatch(i)) {
+ Assert1(isa<PointerType>(Clause->getType()),
+ "Catch operand does not have pointer type!", &LPI);
+ } else {
+ Assert1(LPI.isFilter(i), "Clause is neither catch nor filter!", &LPI);
+ Assert1(isa<ConstantArray>(Clause) || isa<ConstantAggregateZero>(Clause),
+ "Filter operand is not an array of constants!", &LPI);
+ }
+ }
+
+ visitInstruction(LPI);
+}
+
/// verifyInstruction - Verify that an instruction is well formed.
///
void Verifier::visitInstruction(Instruction &I) {
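
Putting the invoke and landingpad rules together: the block below sketches IR construction that passes every check above, with a single predecessor reaching the pad via an unwind edge, the landingpad first in its block, cleanup set because there are no clauses, and a constant personality. Assumes LLVM 3.0 headers; the callee and personality are caller-supplied, and the scaffolding is hypothetical:

    #include "llvm/Function.h"
    #include "llvm/Instructions.h"
    #include "llvm/Support/IRBuilder.h"
    using namespace llvm;

    static void buildInvokeWithPad(Function *F, Function *Callee,
                                   Constant *PersonalityFn) {
      LLVMContext &C = F->getContext();
      BasicBlock *Entry = BasicBlock::Create(C, "entry", F);
      BasicBlock *Cont  = BasicBlock::Create(C, "cont", F);
      BasicBlock *LPad  = BasicBlock::Create(C, "lpad", F);

      IRBuilder<> B(Entry);
      B.CreateInvoke(Callee, Cont, LPad);      // unwind edge targets %lpad

      B.SetInsertPoint(LPad);
      Type *PairTy = StructType::get(Type::getInt8PtrTy(C),   // { i8*, i32 }
                                     Type::getInt32Ty(C), NULL);
      LandingPadInst *LP = B.CreateLandingPad(PairTy, PersonalityFn, 0);
      LP->setCleanup(true);                    // zero clauses => must be a cleanup
      B.CreateResume(LP);

      B.SetInsertPoint(Cont);
      B.CreateRetVoid();
    }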
@@ -1588,20 +1722,20 @@ static std::string IntrinsicParam(unsigned ArgNo, unsigned NumRets) {
return "Intrinsic result type #" + utostr(ArgNo);
}
-bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty,
+bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty,
int VT, unsigned ArgNo, std::string &Suffix) {
- const FunctionType *FTy = F->getFunctionType();
+ FunctionType *FTy = F->getFunctionType();
unsigned NumElts = 0;
- const Type *EltTy = Ty;
- const VectorType *VTy = dyn_cast<VectorType>(Ty);
+ Type *EltTy = Ty;
+ VectorType *VTy = dyn_cast<VectorType>(Ty);
if (VTy) {
EltTy = VTy->getElementType();
NumElts = VTy->getNumElements();
}
- const Type *RetTy = FTy->getReturnType();
- const StructType *ST = dyn_cast<StructType>(RetTy);
+ Type *RetTy = FTy->getReturnType();
+ StructType *ST = dyn_cast<StructType>(RetTy);
unsigned NumRetVals;
if (RetTy->isVoidTy())
NumRetVals = 0;
@@ -1618,7 +1752,7 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty,
// type.
if ((Match & (ExtendedElementVectorType |
TruncatedElementVectorType)) != 0) {
- const IntegerType *IEltTy = dyn_cast<IntegerType>(EltTy);
+ IntegerType *IEltTy = dyn_cast<IntegerType>(EltTy);
if (!VTy || !IEltTy) {
CheckFailed(IntrinsicParam(ArgNo, NumRetVals) + " is not "
"an integral vector type.", F);
@@ -1709,7 +1843,7 @@ bool Verifier::PerformTypeCheck(Intrinsic::ID ID, Function *F, const Type *Ty,
// Outside of TableGen, we don't distinguish iPTRAny (to any address space)
// and iPTR. In the verifier, we can not distinguish which case we have so
// allow either case to be legal.
- if (const PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
+ if (PointerType *PTyp = dyn_cast<PointerType>(Ty)) {
EVT PointeeVT = EVT::getEVT(PTyp->getElementType(), true);
if (PointeeVT == MVT::Other) {
CheckFailed("Intrinsic has pointer to complex type.");
@@ -1757,7 +1891,7 @@ void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F,
unsigned NumParams, ...) {
va_list VA;
va_start(VA, NumParams);
- const FunctionType *FTy = F->getFunctionType();
+ FunctionType *FTy = F->getFunctionType();
// For overloaded intrinsics, the Suffix of the function name must match the
// types of the arguments. This variable keeps track of the expected
@@ -1769,8 +1903,8 @@ void Verifier::VerifyIntrinsicPrototype(Intrinsic::ID ID, Function *F,
return;
}
- const Type *Ty = FTy->getReturnType();
- const StructType *ST = dyn_cast<StructType>(Ty);
+ Type *Ty = FTy->getReturnType();
+ StructType *ST = dyn_cast<StructType>(Ty);
if (NumRetVals == 0 && !Ty->isVoidTy()) {
CheckFailed("Intrinsic should return void", F);